3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 # Bug numbers for the excepted tests
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
23 require_dsh_mds || exit 0
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
32 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
33 skip "Need MDS version at least 2.3.60" && exit 0
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # use small MDS + OST size to speed formatting time
41 # do not make MDSSIZE/OSTSIZE too small, since that affects the default journal size
44 # there is no need for many OSTs; cap the count to reduce the format/start/stop overhead
45 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
47 # build up a clean test environment.
51 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
52 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
54 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
55 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
57 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
58 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
60 # DNE does not yet support striped directories on ZFS-based backends.
61 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
62 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Target names of the first MDT and the first OST of the filesystem under test.
66 MDT_DEV="${FSNAME}-MDT0000"
67 OST_DEV="${FSNAME}-OST0000"
# Device of the MDT behind $SINGLEMDS: the "mds" text is stripped from the
# facet name and the remainder is handed to mdsdevname. The tests pass this
# value as the device argument of "start $SINGLEMDS".
68 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# LFSCK start/stop commands kept as plain strings. Call sites expand them
# unquoted (e.g. "$START_NAMESPACE -r"), so extra lfsck_start options can be
# appended after the variable.
69 START_NAMESPACE="do_facet $SINGLEMDS \
70 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
71 START_LAYOUT="do_facet $SINGLEMDS \
72 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
73 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
74 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Commands that print the lfsck_namespace / lfsck_layout parameters; the tests
# pipe their output through awk to pick out fields such as "status" or "flags".
75 SHOW_NAMESPACE="do_facet $SINGLEMDS \
76 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
77 SHOW_LAYOUT="do_facet $SINGLEMDS \
78 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
79 SHOW_LAYOUT_ON_OST="do_facet ost1 \
80 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount option sets passed to "start $SINGLEMDS": the default one, one with
# "noscrub" (used when starting the MDS with OI scrub disabled), and one with
# "skip_lfsck" (used when starting the MDS without resuming LFSCK).
81 MOUNT_OPTS_SCRUB="-o user_xattr"
82 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
83 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
92 echo "preparing... $nfiles * $ndirs files will be created $(date)."
93 if [ ! -z $igif ]; then
94 #define OBD_FAIL_FID_IGIF 0x1504
95 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
98 cp $LUSTRE/tests/*.sh $DIR/$tdir/
99 if [ $ndirs -gt 0 ]; then
100 createmany -d $DIR/$tdir/d $ndirs
101 createmany -m $DIR/$tdir/f $ndirs
102 if [ $nfiles -gt 0 ]; then
103 for ((i = 0; i < $ndirs; i++)); do
104 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
105 /dev/null || error "createmany $nfiles"
108 createmany -d $DIR/$tdir/e $ndirs
111 if [ ! -z $igif ]; then
112 touch $DIR/$tdir/dummy
113 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
116 echo "prepared $(date)."
119 run_e2fsck_on_mdt0() {
120 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
122 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
123 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
125 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
126 error "(2) Detected inconsistency on MDT0"
128 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
129 error "(3) Fail to start MDT0"
132 wait_all_targets_blocked() {
137 local count=$(do_facet mds1 \
138 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
139 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
140 [[ $count -eq $MDSCOUNT ]] || {
141 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
142 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
151 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
152 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
153 "$MDSCOUNT" $LTIME || {
154 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
155 error "($err) some MDTs are not in ${status}"
162 #define OBD_FAIL_LFSCK_DELAY1 0x1600
163 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
164 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
166 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
168 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
169 [ "$STATUS" == "scanning-phase1" ] ||
170 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
172 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
174 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
175 [ "$STATUS" == "stopped" ] ||
176 error "(6) Expect 'stopped', but got '$STATUS'"
178 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
180 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
181 [ "$STATUS" == "scanning-phase1" ] ||
182 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
184 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
185 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
186 mdd.${MDT_DEV}.lfsck_namespace |
187 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
189 error "(9) unexpected status"
192 local repaired=$($SHOW_NAMESPACE |
193 awk '/^updated_phase1/ { print $2 }')
194 [ $repaired -eq 0 ] ||
195 error "(10) Expect nothing to be repaired, but got: $repaired"
197 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
198 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
199 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
200 mdd.${MDT_DEV}.lfsck_namespace |
201 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
203 error "(12) unexpected status"
206 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
207 [ $((scanned1 + 1)) -eq $scanned2 ] ||
208 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
210 echo "stopall, should NOT crash LU-3649"
211 stopall || error "(14) Fail to stopall"
213 run_test 0 "Control LFSCK manually"
216 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
217 skip "OI Scrub not implemented for ZFS" && return
221 #define OBD_FAIL_FID_INDIR 0x1501
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
223 touch $DIR/$tdir/dummy
225 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
227 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
229 mdd.${MDT_DEV}.lfsck_namespace |
230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
232 error "(4) unexpected status"
235 local repaired=$($SHOW_NAMESPACE |
236 awk '/^dirent_repaired/ { print $2 }')
237 # for interop with old server
238 [ -z "$repaired" ] &&
239 repaired=$($SHOW_NAMESPACE |
240 awk '/^updated_phase1/ { print $2 }')
242 [ $repaired -eq 1 ] ||
243 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
247 mount_client $MOUNT || error "(6) Fail to start client!"
249 #define OBD_FAIL_FID_LOOKUP 0x1505
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
251 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
255 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
259 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
260 skip "OI Scrub not implemented for ZFS" && return
264 #define OBD_FAIL_FID_INLMA 0x1502
265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
266 touch $DIR/$tdir/dummy
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
270 #define OBD_FAIL_FID_NOLMA 0x1506
271 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
272 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
273 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
274 mdd.${MDT_DEV}.lfsck_namespace |
275 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
277 error "(4) unexpected status"
280 local repaired=$($SHOW_NAMESPACE |
281 awk '/^dirent_repaired/ { print $2 }')
282 # for interop with old server
283 [ -z "$repaired" ] &&
284 repaired=$($SHOW_NAMESPACE |
285 awk '/^updated_phase1/ { print $2 }')
287 [ $repaired -eq 1 ] ||
288 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
290 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
293 mount_client $MOUNT || error "(6) Fail to start client!"
295 #define OBD_FAIL_FID_LOOKUP 0x1505
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
297 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
301 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
306 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
308 touch $DIR/$tdir/dummy
310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
312 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
313 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
314 mdd.${MDT_DEV}.lfsck_namespace |
315 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
317 error "(4) unexpected status"
320 local repaired=$($SHOW_NAMESPACE |
321 awk '/^linkea_repaired/ { print $2 }')
322 # for interop with old server
323 [ -z "$repaired" ] &&
324 repaired=$($SHOW_NAMESPACE |
325 awk '/^updated_phase2/ { print $2 }')
327 [ $repaired -eq 1 ] ||
328 error "(5) Fail to repair crashed linkEA: $repaired"
332 mount_client $MOUNT || error "(6) Fail to start client!"
334 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
335 error "(7) Fail to stat $DIR/$tdir/dummy"
337 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
338 local dummyname=$($LFS fid2path $DIR $dummyfid)
339 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
340 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
342 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
348 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
349 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
350 touch $DIR/$tdir/dummy
352 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
354 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
355 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
356 mdd.${MDT_DEV}.lfsck_namespace |
357 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
359 error "(4) unexpected status"
362 local repaired=$($SHOW_NAMESPACE |
363 awk '/^updated_phase2/ { print $2 }')
364 [ $repaired -eq 1 ] ||
365 error "(5) Fail to repair crashed linkEA: $repaired"
369 mount_client $MOUNT || error "(6) Fail to start client!"
371 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
372 error "(7) Fail to stat $DIR/$tdir/dummy"
374 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
375 local dummyname=$($LFS fid2path $DIR $dummyfid)
376 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
377 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
379 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
385 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
386 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
387 touch $DIR/$tdir/dummy
389 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
391 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
392 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
393 mdd.${MDT_DEV}.lfsck_namespace |
394 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
396 error "(4) unexpected status"
399 local repaired=$($SHOW_NAMESPACE |
400 awk '/^updated_phase2/ { print $2 }')
401 [ $repaired -eq 1 ] ||
402 error "(5) Fail to repair crashed linkEA: $repaired"
406 mount_client $MOUNT || error "(6) Fail to start client!"
408 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
409 error "(7) Fail to stat $DIR/$tdir/dummy"
411 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
412 local dummyname=$($LFS fid2path $DIR $dummyfid)
413 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
414 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
416 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
422 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
423 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
424 touch $DIR/$tdir/dummy
426 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
428 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
429 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
430 mdd.${MDT_DEV}.lfsck_namespace |
431 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
433 error "(4) unexpected status"
436 local repaired=$($SHOW_NAMESPACE |
437 awk '/^linkea_repaired/ { print $2 }')
438 [ $repaired -eq 1 ] ||
439 error "(5) Fail to repair crashed linkEA: $repaired"
443 mount_client $MOUNT || error "(6) Fail to start client!"
445 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
446 error "(7) Fail to stat $DIR/$tdir/dummy"
448 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
449 local dummyname=$($LFS fid2path $DIR $dummyfid)
450 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
451 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
453 run_test 2d "LFSCK can recover the missing linkEA entry"
457 [ $MDSCOUNT -lt 2 ] &&
458 skip "We need at least 2 MDSes for this test" && return
462 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
464 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
465 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
466 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
467 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
469 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
471 wait_all_targets_blocked namespace completed 4
473 local repaired=$($SHOW_NAMESPACE |
474 awk '/^linkea_repaired/ { print $2 }')
475 [ $repaired -eq 1 ] ||
476 error "(5) Fail to repair crashed linkEA: $repaired"
478 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
479 local name=$($LFS fid2path $DIR $fid)
480 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
481 error "(6) Fail to repair linkEA: $fid $name"
483 run_test 2e "namespace LFSCK can verify remote object linkEA"
489 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
490 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
491 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
493 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
494 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
495 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
497 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
498 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
499 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
501 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
502 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
503 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
505 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
507 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
508 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
509 mdd.${MDT_DEV}.lfsck_namespace |
510 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
512 error "(10) unexpected status"
515 local checked=$($SHOW_NAMESPACE |
516 awk '/^checked_phase2/ { print $2 }')
517 [ $checked -ge 4 ] ||
518 error "(11) Fail to check multiple-linked object: $checked"
520 local repaired=$($SHOW_NAMESPACE |
521 awk '/^multiple_linked_repaired/ { print $2 }')
522 [ $repaired -ge 2 ] ||
523 error "(12) Fail to repair multiple-linked object: $repaired"
525 run_test 3 "LFSCK can verify multiple-linked objects"
529 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
530 skip "OI Scrub not implemented for ZFS" && return
533 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
534 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
536 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
537 echo "start $SINGLEMDS with disabling OI scrub"
538 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
539 error "(2) Fail to start MDS!"
541 #define OBD_FAIL_LFSCK_DELAY2 0x1601
542 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
543 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
544 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
545 mdd.${MDT_DEV}.lfsck_namespace |
546 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
548 error "(5) unexpected status"
551 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
552 [ "$STATUS" == "scanning-phase1" ] ||
553 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
555 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
556 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
557 mdd.${MDT_DEV}.lfsck_namespace |
558 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
560 error "(7) unexpected status"
563 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
564 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
566 local repaired=$($SHOW_NAMESPACE |
567 awk '/^dirent_repaired/ { print $2 }')
568 # for interop with old server
569 [ -z "$repaired" ] &&
570 repaired=$($SHOW_NAMESPACE |
571 awk '/^updated_phase1/ { print $2 }')
573 [ $repaired -ge 9 ] ||
574 error "(9) Fail to re-generate FID-in-dirent: $repaired"
578 mount_client $MOUNT || error "(10) Fail to start client!"
580 #define OBD_FAIL_FID_LOOKUP 0x1505
581 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
582 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
583 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
585 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
589 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
590 skip "OI Scrub not implemented for ZFS" && return
593 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
594 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
596 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
597 echo "start $SINGLEMDS with disabling OI scrub"
598 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
599 error "(2) Fail to start MDS!"
601 #define OBD_FAIL_LFSCK_DELAY2 0x1601
602 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
603 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
604 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
605 mdd.${MDT_DEV}.lfsck_namespace |
606 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
608 error "(5) unexpected status"
611 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
612 [ "$STATUS" == "scanning-phase1" ] ||
613 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
615 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
616 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
617 mdd.${MDT_DEV}.lfsck_namespace |
618 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
620 error "(7) unexpected status"
623 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
624 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
626 local repaired=$($SHOW_NAMESPACE |
627 awk '/^dirent_repaired/ { print $2 }')
628 # for interop with old server
629 [ -z "$repaired" ] &&
630 repaired=$($SHOW_NAMESPACE |
631 awk '/^updated_phase1/ { print $2 }')
633 [ $repaired -ge 2 ] ||
634 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
638 mount_client $MOUNT || error "(10) Fail to start client!"
640 #define OBD_FAIL_FID_LOOKUP 0x1505
641 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
642 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
644 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
646 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
647 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
648 local dummyname=$($LFS fid2path $DIR $dummyfid)
649 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
650 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
652 run_test 5 "LFSCK can handle IGIF object upgrading"
657 #define OBD_FAIL_LFSCK_DELAY1 0x1600
658 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
659 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
661 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
662 [ "$STATUS" == "scanning-phase1" ] ||
663 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
665 # Sleep 3 sec to guarantee at least one object processed by LFSCK
667 # Fail the LFSCK to guarantee there is at least one checkpoint
668 #define OBD_FAIL_LFSCK_FATAL1 0x1608
669 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
670 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
671 mdd.${MDT_DEV}.lfsck_namespace |
672 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
674 error "(4) unexpected status"
677 local POS0=$($SHOW_NAMESPACE |
678 awk '/^last_checkpoint_position/ { print $2 }' |
681 #define OBD_FAIL_LFSCK_DELAY1 0x1600
682 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
683 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
685 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
686 [ "$STATUS" == "scanning-phase1" ] ||
687 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
689 local POS1=$($SHOW_NAMESPACE |
690 awk '/^latest_start_position/ { print $2 }' |
692 [[ $POS0 -lt $POS1 ]] ||
693 error "(7) Expect larger than: $POS0, but got $POS1"
695 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
696 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
697 mdd.${MDT_DEV}.lfsck_namespace |
698 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
700 error "(8) unexpected status"
703 run_test 6a "LFSCK resumes from last checkpoint (1)"
708 #define OBD_FAIL_LFSCK_DELAY2 0x1601
709 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
710 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
712 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
713 [ "$STATUS" == "scanning-phase1" ] ||
714 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
716 # Sleep 5 sec to guarantee that we are in the directory scanning
718 # Fail the LFSCK to guarantee there is at least one checkpoint
719 #define OBD_FAIL_LFSCK_FATAL2 0x1609
720 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
721 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
722 mdd.${MDT_DEV}.lfsck_namespace |
723 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
725 error "(4) unexpected status"
728 local O_POS0=$($SHOW_NAMESPACE |
729 awk '/^last_checkpoint_position/ { print $2 }' |
732 local D_POS0=$($SHOW_NAMESPACE |
733 awk '/^last_checkpoint_position/ { print $4 }')
735 #define OBD_FAIL_LFSCK_DELAY2 0x1601
736 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
737 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
739 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
740 [ "$STATUS" == "scanning-phase1" ] ||
741 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
743 local O_POS1=$($SHOW_NAMESPACE |
744 awk '/^latest_start_position/ { print $2 }' |
746 local D_POS1=$($SHOW_NAMESPACE |
747 awk '/^latest_start_position/ { print $4 }')
749 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
750 [[ $O_POS0 -lt $O_POS1 ]] ||
751 error "(7.1) $O_POS1 is not larger than $O_POS0"
753 [[ $D_POS0 -lt $D_POS1 ]] ||
754 error "(7.2) $D_POS1 is not larger than $D_POS0"
757 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
758 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
759 mdd.${MDT_DEV}.lfsck_namespace |
760 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
762 error "(8) unexpected status"
765 run_test 6b "LFSCK resumes from last checkpoint (2)"
772 #define OBD_FAIL_LFSCK_DELAY2 0x1601
773 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
774 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
776 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
777 [ "$STATUS" == "scanning-phase1" ] ||
778 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
780 # Sleep 3 sec to guarantee at least one object processed by LFSCK
782 echo "stop $SINGLEMDS"
783 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
785 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
786 echo "start $SINGLEMDS"
787 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
788 error "(5) Fail to start MDS!"
790 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
791 mdd.${MDT_DEV}.lfsck_namespace |
792 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
794 error "(6) unexpected status"
797 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
803 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
804 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
805 for ((i = 0; i < 20; i++)); do
806 touch $DIR/$tdir/dummy${i}
809 #define OBD_FAIL_LFSCK_DELAY3 0x1602
810 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
811 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
812 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
813 mdd.${MDT_DEV}.lfsck_namespace |
814 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
816 error "(4) unexpected status"
820 echo "stop $SINGLEMDS"
821 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
823 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
824 echo "start $SINGLEMDS"
825 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
826 error "(6) Fail to start MDS!"
828 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
829 mdd.${MDT_DEV}.lfsck_namespace |
830 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
832 error "(7) unexpected status"
835 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
840 formatall > /dev/null
846 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
847 [ "$STATUS" == "init" ] ||
848 error "(2) Expect 'init', but got '$STATUS'"
850 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
851 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
852 mkdir $DIR/$tdir/crashed
854 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
855 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
856 for ((i = 0; i < 5; i++)); do
857 touch $DIR/$tdir/dummy${i}
860 umount_client $MOUNT || error "(3) Fail to stop client!"
862 #define OBD_FAIL_LFSCK_DELAY2 0x1601
863 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
864 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
866 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
867 [ "$STATUS" == "scanning-phase1" ] ||
868 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
870 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
872 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
873 [ "$STATUS" == "stopped" ] ||
874 error "(7) Expect 'stopped', but got '$STATUS'"
876 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
878 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
879 [ "$STATUS" == "scanning-phase1" ] ||
880 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
882 #define OBD_FAIL_LFSCK_FATAL2 0x1609
883 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
884 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
885 mdd.${MDT_DEV}.lfsck_namespace |
886 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
888 error "(10) unexpected status"
891 #define OBD_FAIL_LFSCK_DELAY1 0x1600
892 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
893 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
895 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
896 [ "$STATUS" == "scanning-phase1" ] ||
897 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
899 #define OBD_FAIL_LFSCK_CRASH 0x160a
900 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
903 echo "stop $SINGLEMDS"
904 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
906 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
907 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
909 echo "start $SINGLEMDS"
910 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
911 error "(14) Fail to start MDS!"
913 local timeout=$(max_recovery_time)
916 while [ $timer -lt $timeout ]; do
917 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
918 mdt.${MDT_DEV}.recovery_status |
919 awk '/^status/ { print \\\$2 }'")
920 [ "$STATUS" != "RECOVERING" ] && break;
925 [ $timer != $timeout ] ||
926 error "(14.1) recovery timeout"
928 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
929 [ "$STATUS" == "crashed" ] ||
930 error "(15) Expect 'crashed', but got '$STATUS'"
932 #define OBD_FAIL_LFSCK_DELAY2 0x1601
933 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
934 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
936 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
937 [ "$STATUS" == "scanning-phase1" ] ||
938 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
940 echo "stop $SINGLEMDS"
941 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
943 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
944 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
946 echo "start $SINGLEMDS"
947 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
948 error "(19) Fail to start MDS!"
951 while [ $timer -lt $timeout ]; do
952 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
953 mdt.${MDT_DEV}.recovery_status |
954 awk '/^status/ { print \\\$2 }'")
955 [ "$STATUS" != "RECOVERING" ] && break;
960 [ $timer != $timeout ] ||
961 error "(19.1) recovery timeout"
963 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
964 [ "$STATUS" == "paused" ] ||
965 error "(20) Expect 'paused', but got '$STATUS'"
967 echo "stop $SINGLEMDS"
968 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
970 echo "start $SINGLEMDS without resume LFSCK"
971 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
972 error "(20.2) Fail to start MDS!"
975 while [ $timer -lt $timeout ]; do
976 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
977 mdt.${MDT_DEV}.recovery_status |
978 awk '/^status/ { print \\\$2 }'")
979 [ "$STATUS" != "RECOVERING" ] && break;
984 [ $timer != $timeout ] ||
985 error "(20.3) recovery timeout"
987 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
988 [ "$STATUS" == "paused" ] ||
989 error "(20.4) Expect 'paused', but got '$STATUS'"
991 #define OBD_FAIL_LFSCK_DELAY3 0x1602
992 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
994 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
995 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
996 mdd.${MDT_DEV}.lfsck_namespace |
997 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
999 error "(22) unexpected status"
1002 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1003 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1004 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1006 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1007 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1008 mdd.${MDT_DEV}.lfsck_namespace |
1009 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1011 error "(24) unexpected status"
1014 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1015 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1017 run_test 8 "LFSCK state machine"
# Test 9a, registered by the run_test call that ends this block.
# It samples the average_speed_phase1 value reported for layout LFSCK,
# first with the limit passed through -s at start time and then after the
# limit is changed through the lfsck_speed_limit parameter.
# NOTE(review): the function header, the closers of the if and brace
# groups, and the assignments of server_version, RUN_TIME1, RUN_TIME2 and
# TIME_DIFF are not among the lines visible in this chunk. Confirm them in
# the full file before changing any bound below.
# Skip when /proc/cpuinfo has no line matching the pattern processor.*: 1
1020 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1021 skip "Testing on UP system, the speed may be inaccurate."
# Skip unless server_version is at least 2.7.50.
1025 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1026 { skip "Need MDS version >= 2.7.50"; return; }
# Workload: 5000 files under a directory created with lfs mkdir -i 0.
1028 check_mount_and_prep
1029 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1030 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1031 createmany -o $DIR/$tdir/lfsck/f 5000
# First speed limit, handed to lfsck_start through -s.
1033 local BASE_SPEED1=100
1035 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# The scan must still be in phase 1 when it is sampled.
1038 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1039 [ "$STATUS" == "scanning-phase1" ] ||
1040 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1042 local SPEED=$($SHOW_LAYOUT |
1043 awk '/^average_speed_phase1/ { print $2 }')
1045 # There may be time error, normally it should be less than 2 seconds.
1046 # We allow another 20% schedule error.
1048 # MAX_MARGIN = 1.2 = 12 / 10
1049 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1050 RUN_TIME1 * 12 / 10))
1051 [ $SPEED -lt $MAX_SPEED ] ||
1052 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
1054 # adjust speed limit
1055 local BASE_SPEED2=300
1057 do_facet $SINGLEMDS \
1058 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Sample the average again after the limit change. The lower bound is the
# time-weighted mean of both limits, shortened by TIME_DIFF, times 0.8.
1061 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1062 # MIN_MARGIN = 0.8 = 8 / 10
1063 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1064 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1065 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# When SPEED is not above MIN_SPEED and the MDT backend is not ldiskfs,
# the failure is reported through error_ignore tagged LU-5624. The command
# that emits the (5.2) message is not visible in this chunk.
1066 [ $SPEED -gt $MIN_SPEED ] || {
1067 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1068 error_ignore LU-5624 \
1069 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1072 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1076 # MAX_MARGIN = 1.2 = 12 / 10
1077 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1078 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1079 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1080 [ $SPEED -lt $MAX_SPEED ] ||
1081 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
# Set the limit to 0 and wait for the scan to report completed.
1083 do_facet $SINGLEMDS \
1084 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1086 wait_update_facet $SINGLEMDS \
1087 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1088 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1089 error "(7) Failed to get expected 'completed'"
1091 run_test 9a "LFSCK speed control (1)"
# Test 9b, registered by the run_test call that ends this block.
# Same speed-limit checks as the layout variant, but against the
# average_speed_phase2 value reported for namespace LFSCK.
# NOTE(review): the function header, block closers and the assignments of
# server_version, RUN_TIME1, RUN_TIME2 and TIME_DIFF are not among the
# lines visible in this chunk.
# Skip when /proc/cpuinfo has no line matching the pattern processor.*: 1
1094 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1095 skip "Testing on UP system, the speed may be inaccurate."
# Skip unless server_version is at least 2.7.50.
1099 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1100 { skip "Need MDS version >= 2.7.50"; return; }
# Create 50 directories and 50 files at the top level, then 50 files in
# each directory, while fail_loc 0x1604 is set on the MDS.
1104 echo "Preparing another 50 * 50 files (with error) at $(date)."
1105 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1106 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1107 createmany -d $DIR/$tdir/d 50
1108 createmany -m $DIR/$tdir/f 50
1109 for ((i = 0; i < 50; i++)); do
1110 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# First run: with fail_loc 0x160c set, the scan is expected to end up in
# the stopped state (value 10 is presumably a timeout in seconds).
1113 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1114 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1115 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1116 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1117 mdd.${MDT_DEV}.lfsck_namespace |
1118 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1120 error "(5) unexpected status"
1123 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1124 echo "Prepared at $(date)."
# Second run is started without -r and with the first speed limit. It is
# expected to be in scanning-phase2 when sampled.
1126 local BASE_SPEED1=50
1128 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1131 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1132 [ "$STATUS" == "scanning-phase2" ] ||
1133 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1135 local SPEED=$($SHOW_NAMESPACE |
1136 awk '/^average_speed_phase2/ { print $2 }')
1137 # There may be time error, normally it should be less than 2 seconds.
1138 # We allow another 20% schedule error.
1140 # MAX_MARGIN = 1.2 = 12 / 10
1141 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1142 RUN_TIME1 * 12 / 10))
1143 [ $SPEED -lt $MAX_SPEED ] ||
1144 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1146 # adjust speed limit
1147 local BASE_SPEED2=150
1149 do_facet $SINGLEMDS \
1150 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Sample the average again after the limit change. The lower bound is the
# time-weighted mean of both limits, shortened by TIME_DIFF, times 0.8.
1153 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1154 # MIN_MARGIN = 0.8 = 8 / 10
1155 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1156 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1157 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# When SPEED is not above MIN_SPEED and the MDT backend is not ldiskfs,
# the failure is reported through error_ignore tagged LU-5624. The command
# that emits the (9.2) message is not visible in this chunk.
1158 [ $SPEED -gt $MIN_SPEED ] || {
1159 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1160 error_ignore LU-5624 \
1161 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1164 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1168 # MAX_MARGIN = 1.2 = 12 / 10
1169 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1170 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1171 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1172 [ $SPEED -lt $MAX_SPEED ] ||
1173 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
# Set the limit to 0 and wait for the scan to report completed.
1175 do_facet $SINGLEMDS \
1176 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1177 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1178 mdd.${MDT_DEV}.lfsck_namespace |
1179 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1181 error "(11) unexpected status"
1184 run_test 9b "LFSCK speed control (2)"
# Test 10, registered by the run_test call that ends this block.
# It runs ordinary client operations while a rate-limited namespace LFSCK
# is in scanning-phase1 and checks that each of them succeeds.
# NOTE(review): the function header, loop closers and any sleep or setup
# lines between the statements below are not visible in this chunk.
# Skip on any MDT backend other than ldiskfs.
1188 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1189 skip "lookup(..)/linkea on ZFS issue" && return
# Even-numbered d<i>/f<i> entries are created with fail_loc 0x1603 set.
1193 echo "Preparing more files with error at $(date)."
1194 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1195 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1197 for ((i = 0; i < 1000; i = $((i+2)))); do
1198 mkdir -p $DIR/$tdir/d${i}
1199 touch $DIR/$tdir/f${i}
1200 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd-numbered entries are created with fail_loc 0x1604 set.
1203 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1204 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1206 for ((i = 1; i < 1000; i = $((i+2)))); do
1207 mkdir -p $DIR/$tdir/d${i}
1208 touch $DIR/$tdir/f${i}
1209 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1212 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1213 echo "Prepared at $(date)."
# Extra hard link made after fail_loc has been cleared.
1215 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before the scan starts.
1217 umount_client $MOUNT
1218 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the scan from the beginning with speed limit 100.
1220 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1223 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1224 [ "$STATUS" == "scanning-phase1" ] ||
1225 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Client operations issued while the scan is running: recursive listing,
# create, mkdir, unlink, recursive remove, rename, hard link, symlink.
1227 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1229 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1231 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1233 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1235 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1237 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1239 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1241 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1242 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after those operations.
1244 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1245 [ "$STATUS" == "scanning-phase1" ] ||
1246 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set the limit to 0 and wait for the scan to report completed.
1248 do_facet $SINGLEMDS \
1249 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1250 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1251 mdd.${MDT_DEV}.lfsck_namespace |
1252 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1254 error "(16) unexpected status"
1257 run_test 10 "System is available during LFSCK scanning"
# Remove the LAST_ID file, and the d0/0 entry next to it, from the
# O/<idx> directory under the mount point of OST facet ost<ost>.
# The target is mounted with mount_fstype for the removal and unmounted
# with unmount_fstype afterwards.
# Returns 1 when mount_fstype fails and 2 when unmount_fstype fails.
# NOTE(review): the assignments of ost and idx are not visible in this
# chunk; the visible caller passes two positional arguments (1 and 0), so
# they are presumably taken from the first and second argument - confirm.
1260 ost_remove_lastid() {
# Command prefix that runs the removal on the OST facet.
1263 local rcmd="do_facet ost${ost}"
1265 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1267 # step 1: local mount
1268 mount_fstype ost${ost} || return 1
1269 # step 2: remove the specified LAST_ID
1270 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1272 unmount_fstype ost${ost} || return 2
# Test 11a, registered by the run_test call that ends this block.
# It removes LAST_ID on ost1, restarts ost1, runs layout LFSCK on the OST
# and expects the flags field to be empty once the scan has completed.
# NOTE(review): the function header and the lines between the file
# creation and ost_remove_lastid are not visible in this chunk (ost1 is
# presumably stopped there, since it is started again below).
1276 check_mount_and_prep
1277 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1278 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1283 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
# Bring ost1 back using the mount options in MOUNT_OPTS_NOSCRUB.
1285 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1286 error "(2) Fail to start ost1"
# Fail point 0x160e with fail_val=3 is set on ost1 before the scan starts.
1288 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1289 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1291 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1292 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# While the fail point is set, the flags field should read crashed_lastid.
1294 wait_update_facet ost1 "$LCTL get_param -n \
1295 obdfilter.${OST_DEV}.lfsck_layout |
1296 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1298 error "(5) unexpected status"
# Clear the fail point and wait for the scan to report completed.
1301 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1303 wait_update_facet ost1 "$LCTL get_param -n \
1304 obdfilter.${OST_DEV}.lfsck_layout |
1305 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1307 error "(6) unexpected status"
1310 echo "the LAST_ID(s) should have been rebuilt"
1311 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1312 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1314 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b, registered by the run_test call that ends this block.
# It makes the last_id value reported by ost1 after a restart smaller
# than the one recorded before it, then checks that layout LFSCK on the
# OST brings the value back to the recorded one.
# NOTE(review): the function header, the closer of the polling loop and
# any sleep inside it are not visible in this chunk.
1317 check_mount_and_prep
1318 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1320 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1321 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1322 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than the count returned by precreated_ost_obj_count.
1324 local count=$(precreated_ost_obj_count 0 0)
1326 createmany -o $DIR/$tdir/f $((count + 32))
# Record the last_id entry that ost1 reports for the sequence read from
# the prealloc_last_seq parameter on mds1.
1328 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1329 local seq=$(do_facet mds1 $LCTL get_param -n \
1330 osp.${proc_path}.prealloc_last_seq)
1331 local lastid1=$(do_facet ost1 "lctl get_param -n \
1332 obdfilter.${ost1_svc}.last_id" | grep $seq |
1333 awk -F: '{ print $2 }')
1335 umount_client $MOUNT
1336 stop ost1 || error "(1) Fail to stop ost1"
1338 # stop MDS to forget last precreated object
1339 echo "stop $SINGLEMDS"
1340 stop $SINGLEMDS > /dev/null || error "(11) Fail to stop MDS!"
1341 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1342 echo "start $SINGLEMDS"
1343 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
1344 error "(12) Fail to start MDS!"
# fail_loc 0x215 is set on ost1 before it is started again.
1346 #define OBD_FAIL_OST_ENOSPC 0x215
1347 do_facet ost1 $LCTL set_param fail_loc=0x215
1349 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1350 error "(2) Fail to start ost1"
# Poll up to 60 times until ost1 reports a last_id entry for seq.
# NOTE(review): lastid2 is unquoted inside the test below and is not
# declared local in the visible lines.
1352 for ((i = 0; i < 60; i++)); do
1353 lastid2=$(do_facet ost1 "lctl get_param -n \
1354 obdfilter.${ost1_svc}.last_id" | grep $seq |
1355 awk -F: '{ print $2 }')
1356 [ ! -z $lastid2 ] && break;
1360 echo "the on-disk LAST_ID should be smaller than the expected one"
1361 [ $lastid1 -gt $lastid2 ] ||
1362 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1364 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1365 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1367 wait_update_facet ost1 "$LCTL get_param -n \
1368 obdfilter.${OST_DEV}.lfsck_layout |
1369 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1371 error "(6) unexpected status"
# Restart ost1 and verify that last_id for seq equals the recorded value.
1374 stop ost1 || error "(7) Fail to stop ost1"
1376 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1377 error "(8) Fail to start ost1"
1379 echo "the on-disk LAST_ID should have been rebuilt"
1380 wait_update_facet ost1 "$LCTL get_param -n \
1381 obdfilter.${ost1_svc}.last_id | grep $seq |
1382 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1383 do_facet ost1 $LCTL get_param -n \
1384 obdfilter.${ost1_svc}.last_id
1385 error "(9) expect lastid1 $seq:$lastid1"
# Clear the fail point and stop all targets.
1388 do_facet ost1 $LCTL set_param fail_loc=0
1389 stopall || error "(10) Fail to stopall"
1391 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12a, registered by the run_test call that ends this block.
# It drives namespace LFSCK and then layout LFSCK through start, stop and
# restart, each time with a single lctl command that carries -A, and
# checks the status reported afterwards.
# NOTE(review): the function header and loop closers are not visible in
# this chunk. The trailing number passed to wait_all_targets and
# wait_all_targets_blocked is presumably the step id used in their failure
# messages - confirm against those helpers.
# Skip when fewer than two MDSes are configured.
1394 [ $MDSCOUNT -lt 2 ] &&
1395 skip "We need at least 2 MDSes for test_12a" && return
# One directory per MDT index, each holding 100 files.
1397 check_mount_and_prep
1398 for k in $(seq $MDSCOUNT); do
1399 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1400 createmany -o $DIR/$tdir/${k}/f 100 ||
1401 error "(0) Fail to create 100 files."
# Namespace LFSCK: start with speed limit 1, stop, restart with limit 0.
1404 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1405 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1406 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1408 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1409 wait_all_targets namespace scanning-phase1 3
1411 echo "Stop namespace LFSCK on all targets by single lctl command."
1412 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1413 error "(4) Fail to stop LFSCK on all devices!"
1415 echo "All the LFSCK targets should be in 'stopped' status."
1416 wait_all_targets_blocked namespace stopped 5
1418 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1419 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1420 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1422 echo "All the LFSCK targets should be in 'completed' status."
1423 wait_all_targets_blocked namespace completed 7
# Layout LFSCK: same sequence, bracketed by the full debug logging helpers.
1425 start_full_debug_logging
1427 echo "Start layout LFSCK on all targets by single command (-s 1)."
1428 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1429 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1431 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1432 wait_all_targets layout scanning-phase1 9
1434 echo "Stop layout LFSCK on all targets by single lctl command."
1435 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1436 error "(10) Fail to stop LFSCK on all devices!"
1438 echo "All the LFSCK targets should be in 'stopped' status."
1439 wait_all_targets_blocked layout stopped 11
# Every OST must also report stopped for its own layout LFSCK.
1441 for k in $(seq $OSTCOUNT); do
1442 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1443 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1444 awk '/^status/ { print $2 }')
1445 [ "$STATUS" == "stopped" ] ||
1446 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1449 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1450 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1451 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1453 echo "All the LFSCK targets should be in 'completed' status."
1454 wait_all_targets_blocked layout completed 14
1456 stop_full_debug_logging
1458 run_test 12a "single command to trigger LFSCK on all devices"
# Test 12b, registered by the run_test call that ends this block.
# It starts LFSCK with -A but without -M and waits for both namespace and
# layout LFSCK to report completed. When mds1 hosts more than one mdt
# device, it also checks that a start without -M and -A fails.
# NOTE(review): the function header and the closer of the if block are not
# visible in this chunk.
1461 check_mount_and_prep
1463 echo "Start LFSCK without '-M' specified."
1464 do_facet mds1 $LCTL lfsck_start -A -r ||
1465 error "(0) Fail to start LFSCK without '-M'"
1467 wait_all_targets_blocked namespace completed 1
1468 wait_all_targets_blocked layout completed 2
# Count the lines of lctl dl on mds1 whose third column contains mdt.
1470 local count=$(do_facet mds1 $LCTL dl |
1471 awk '{ print $3 }' | grep mdt | wc -l)
1472 if [ $count -gt 1 ]; then
1474 echo "Start layout LFSCK on the node with multipe targets,"
1475 echo "but not specify '-M'/'-A' option. Should get failure."
# A successful start is the failure case here; the trailing true keeps the
# statement status zero when the start fails as expected.
1477 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1478 error "(3) Start layout LFSCK should fail" || true
1481 run_test 12b "auto detect Lustre device"
# Test 13, registered by the run_test call that ends this block.
# It creates one plain file and one composite-layout file while fail_loc
# 0x160f is set on the MDS, runs layout LFSCK and expects the
# repaired_others counter to equal 2.
# NOTE(review): the function header and the closers of the brace groups
# are not visible in this chunk.
1485 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1486 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1487 echo "MDT-object FID."
1490 check_mount_and_prep
1492 echo "Inject failure stub to simulate bad lmm_oi"
1493 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1494 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
# Two files are created under the fail point: f0 and the composite f1.
1495 createmany -o $DIR/$tdir/f 1
1496 $LFS setstripe -E 1M -E -1 $DIR/$tdir/f1 ||
1497 error "(0) Fail to create PFL $DIR/$tdir/f1"
1498 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1500 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1501 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1503 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1504 mdd.${MDT_DEV}.lfsck_layout |
1505 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1507 error "(2) unexpected status"
# One repair is expected for each of the two files created above.
1510 local repaired=$($SHOW_LAYOUT |
1511 awk '/^repaired_others/ { print $2 }')
1512 [ $repaired -eq 2 ] ||
1513 error "(3) Fail to repair crashed lmm_oi: $repaired"
1515 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14a, registered by the run_test call that ends this block.
# It creates files while fail_loc 0x1610 is set on ost1, then runs layout
# LFSCK twice: first with -r only, after which stat on the guard files
# must still fail, then with -r -c, after which stat must report size 0.
# NOTE(review): the function header, loop and brace closers, and the lines
# between the final echo and setupall are not visible in this chunk.
1519 echo "The OST-object referenced by the MDT-object should be there;"
1520 echo "otherwise, the LFSCK should re-create the missing OST-object."
1521 echo "without '--delay-create-ostobj' option."
1524 check_mount_and_prep
1525 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1527 echo "Inject failure stub to simulate dangling referenced MDT-object"
1528 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1529 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 16 more plain files than the count returned by
# precreated_ost_obj_count, then the first guard file.
1530 local count=$(precreated_ost_obj_count 0 0)
1532 createmany -o $DIR/$tdir/f $((count + 16)) ||
1533 error "(0.1) Fail to create $DIR/$tdir/fx"
1534 touch $DIR/$tdir/guard0
# 16 composite-layout files, followed by the second guard file.
1536 for ((i = 0; i < 16; i++)); do
1537 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1538 $DIR/$tdir/f_comp${i} ||
1539 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1541 touch $DIR/$tdir/guard1
1543 do_facet ost1 $LCTL set_param fail_loc=0
1545 start_full_debug_logging
1547 # exhaust other pre-created dangling cases
1548 count=$(precreated_ost_obj_count 0 0)
1549 createmany -o $DIR/$tdir/a $count ||
1550 error "(0.5) Fail to create $count files."
1552 echo "'ls' should fail because of dangling referenced MDT-object"
1553 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First scan: started with -r only.
1555 echo "Trigger layout LFSCK to find out dangling reference"
1556 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1558 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1559 mdd.${MDT_DEV}.lfsck_layout |
1560 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1562 error "(3) unexpected status"
# At least 32 entries are expected in repaired_dangling (16 + 16 above).
1565 local repaired=$($SHOW_LAYOUT |
1566 awk '/^repaired_dangling/ { print $2 }')
1567 [ $repaired -ge 32 ] ||
1568 error "(4) Fail to repair dangling reference: $repaired"
1570 echo "'stat' should fail because of not repair dangling by default"
1571 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1572 error "(5.1) stat should fail"
1573 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1574 error "(5.2) stat should fail"
# Second scan: started with -r -c.
1576 echo "Trigger layout LFSCK to repair dangling reference"
1577 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1579 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1580 mdd.${MDT_DEV}.lfsck_layout |
1581 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1583 error "(7) unexpected status"
1586 # There may be some async LFSCK updates in processing, wait for
1587 # a while until the target reparation has been done. LU-4970.
1589 echo "'stat' should success after layout LFSCK repairing"
1590 wait_update_facet client "stat $DIR/$tdir/guard0 |
1591 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1592 stat $DIR/$tdir/guard0
1594 error "(8.1) unexpected size"
1597 wait_update_facet client "stat $DIR/$tdir/guard1 |
1598 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1599 stat $DIR/$tdir/guard1
1601 error "(8.2) unexpected size"
1604 repaired=$($SHOW_LAYOUT |
1605 awk '/^repaired_dangling/ { print $2 }')
1606 [ $repaired -ge 32 ] ||
1607 error "(9) Fail to repair dangling reference: $repaired"
1609 stop_full_debug_logging
# NOTE(review): the stop step announced by this echo is not visible in
# this chunk; only the setupall call is.
1611 echo "stopall to cleanup object cache"
1614 setupall > /dev/null
1616 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Test 14b, registered by the run_test call that ends this block.
# Same scenario as the previous dangling-reference test, but the scans
# are started with the additional -o and -d options and completion is
# awaited through wait_all_targets_blocked.
# NOTE(review): the function header, brace closers, and the lines between
# the final echo and setupall are not visible in this chunk.
1620 echo "The OST-object referenced by the MDT-object should be there;"
1621 echo "otherwise, the LFSCK should re-create the missing OST-object."
1622 echo "with '--delay-create-ostobj' option."
1625 check_mount_and_prep
1626 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1628 echo "Inject failure stub to simulate dangling referenced MDT-object"
1629 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1630 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 31 more files than the count returned by
# precreated_ost_obj_count, then the guard file.
1631 local count=$(precreated_ost_obj_count 0 0)
1633 createmany -o $DIR/$tdir/f $((count + 31))
1634 touch $DIR/$tdir/guard
1635 do_facet ost1 $LCTL set_param fail_loc=0
1637 start_full_debug_logging
1639 # exhaust other pre-created dangling cases
1640 count=$(precreated_ost_obj_count 0 0)
1641 createmany -o $DIR/$tdir/a $count ||
1642 error "(0) Fail to create $count files."
1644 echo "'ls' should fail because of dangling referenced MDT-object"
1645 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First scan: started with -r -o -d.
1647 echo "Trigger layout LFSCK to find out dangling reference"
1648 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1650 wait_all_targets_blocked layout completed 3
# At least 32 entries are expected in repaired_dangling (31 + guard).
1652 local repaired=$($SHOW_LAYOUT |
1653 awk '/^repaired_dangling/ { print $2 }')
1654 [ $repaired -ge 32 ] ||
1655 error "(4) Fail to repair dangling reference: $repaired"
1657 echo "'stat' should fail because of not repair dangling by default"
1658 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second scan: started with -r -o -c -d.
1660 echo "Trigger layout LFSCK to repair dangling reference"
1661 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1663 wait_all_targets_blocked layout completed 7
1665 # There may be some async LFSCK updates in processing, wait for
1666 # a while until the target reparation has been done. LU-4970.
1668 echo "'stat' should success after layout LFSCK repairing"
1669 wait_update_facet client "stat $DIR/$tdir/guard |
1670 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1671 stat $DIR/$tdir/guard
1673 error "(8) unexpected size"
1676 repaired=$($SHOW_LAYOUT |
1677 awk '/^repaired_dangling/ { print $2 }')
1678 [ $repaired -ge 32 ] ||
1679 error "(9) Fail to repair dangling reference: $repaired"
1681 stop_full_debug_logging
# NOTE(review): the stop step announced by this echo is not visible in
# this chunk; only the setupall call is.
1683 echo "stopall to cleanup object cache"
1686 setupall > /dev/null
1688 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Test 15a, registered by the run_test call that ends this block.
# It writes one plain file and one composite-layout file while fail_loc
# 0x1611 is set on all OST nodes, runs layout LFSCK and expects at least
# 3 entries in repaired_unmatched_pair.
# NOTE(review): the function header, brace closers and the continuation
# line carrying the target path of the setstripe command are not visible
# in this chunk.
1692 echo "If the OST-object referenced by the MDT-object back points"
1693 echo "to some non-exist MDT-object, then the LFSCK should repair"
1694 echo "the OST-object to back point to the right MDT-object."
1697 check_mount_and_prep
1698 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1700 echo "Inject failure stub to make the OST-object to back point to"
1701 echo "non-exist MDT-object."
1702 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1704 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1705 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1706 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1708 error "(0) Fail to create PFL $DIR/$tdir/f1"
1709 # 'dd' will trigger punch RPC firstly on every OST-objects.
1710 # So even though some OST-object will not be write by 'dd',
1711 # as long as it is allocated (may be NOT allocated in pfl_3b)
1712 # its layout information will be set also.
1713 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1714 cancel_lru_locks osc
1715 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1717 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1718 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1720 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1721 mdd.${MDT_DEV}.lfsck_layout |
1722 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1724 error "(2) unexpected status"
# The check is a lower bound, not an exact count.
1727 local repaired=$($SHOW_LAYOUT |
1728 awk '/^repaired_unmatched_pair/ { print $2 }')
1729 [ $repaired -ge 3 ] ||
1730 error "(3) Fail to repair unmatched pair: $repaired"
1732 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b, registered by the run_test call that ends this block.
# It writes a guard file first, then writes one plain file and one
# composite-layout file while fail_loc 0x1612 is set on all OST nodes,
# runs layout LFSCK and expects exactly 4 entries in
# repaired_unmatched_pair.
# NOTE(review): the function header, brace closers, the initial value of
# stripes and the continuation line carrying the target path of the
# setstripe command are not visible in this chunk.
1736 echo "If the OST-object referenced by the MDT-object back points"
1737 echo "to other MDT-object that doesn't recognize the OST-object,"
1738 echo "then the LFSCK should repair it to back point to the right"
1739 echo "MDT-object (the first one)."
1742 check_mount_and_prep
1743 mkdir -p $DIR/$tdir/0
1744 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1745 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1746 cancel_lru_locks osc
1748 echo "Inject failure stub to make the OST-object to back point to"
1749 echo "other MDT-object"
# Use two stripes for the first component when at least two OSTs exist.
1752 [ $OSTCOUNT -ge 2 ] && stripes=2
1754 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1755 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1756 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1757 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1759 error "(0) Fail to create PFL $DIR/$tdir/f1"
1760 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1761 cancel_lru_locks osc
1762 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1764 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1765 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1767 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1768 mdd.${MDT_DEV}.lfsck_layout |
1769 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1771 error "(2) unexpected status"
# NOTE(review): the expected count is fixed at 4 although the stripe
# count of the first component depends on OSTCOUNT - confirm this holds
# with a single OST.
1774 local repaired=$($SHOW_LAYOUT |
1775 awk '/^repaired_unmatched_pair/ { print $2 }')
1776 [ $repaired -eq 4 ] ||
1777 error "(3) Fail to repair unmatched pair: $repaired"
1779 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c, registered by the run_test call that ends this block.
# It starts a delayed directory migration in the background, runs layout
# LFSCK on all targets while the migration is pending, and expects one
# entry in repaired_unmatched_pair and none in
# repaired_multiple_referenced.
# NOTE(review): the function header and any wait or sleep around the
# background migration are not visible in this chunk.
# Skip with fewer than two MDSes, or when the MDS version is 2.7.55 or
# later (ticket LU-6437 per the skip message).
1782 [ $MDSCOUNT -lt 2 ] &&
1783 skip "We need at least 2 MDSes for this test" && return
1785 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1786 skip "Skip the test after 2.7.55 see LU-6437" && return
1789 echo "According to current metadata migration implementation,"
1790 echo "before the old MDT-object is removed, both the new MDT-object"
1791 echo "and old MDT-object will reference the same LOV layout. Then if"
1792 echo "the layout LFSCK finds the new MDT-object by race, it will"
1793 echo "regard related OST-object(s) as multiple referenced case, and"
1794 echo "will try to create new OST-object(s) for the new MDT-object."
1795 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1796 echo "MDT-object before confirm the multiple referenced case."
# Directory a1 is created with lfs mkdir -i 1 and holds one 1M file.
1799 check_mount_and_prep
1800 $LFS mkdir -i 1 $DIR/$tdir/a1
1801 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1802 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1803 cancel_lru_locks osc
1805 echo "Inject failure stub on MDT1 to delay the migration"
1807 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1808 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# The migration runs in the background so the scan can overlap with it.
1809 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1810 $LFS migrate -m 0 $DIR/$tdir/a1 &
1813 echo "Trigger layout LFSCK to race with the migration"
1814 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1816 wait_all_targets_blocked layout completed 2
1818 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1819 local repaired=$($SHOW_LAYOUT |
1820 awk '/^repaired_unmatched_pair/ { print $2 }')
1821 [ $repaired -eq 1 ] ||
1822 error "(3) Fail to repair unmatched pair: $repaired"
1824 repaired=$($SHOW_LAYOUT |
1825 awk '/^repaired_multiple_referenced/ { print $2 }')
1826 [ $repaired -eq 0 ] ||
1827 error "(4) Unexpectedly repaird multiple references: $repaired"
1829 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16, registered by the run_test call that ends this block.
# It changes the owner of a file while fail_loc 0x1613 is set on the MDS,
# runs layout LFSCK and expects one entry in
# repaired_inconsistent_owner.
# NOTE(review): the function header, brace closers and the lines between
# the last echo and the LFSCK start are not visible in this chunk.
1833 echo "If the OST-object's owner information does not match the owner"
1834 echo "information stored in the MDT-object, then the LFSCK trust the"
1835 echo "MDT-object and update the OST-object's owner information."
# One 1M file with a single stripe at OST index 0.
1838 check_mount_and_prep
1839 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1840 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1841 cancel_lru_locks osc
1843 echo "Inject failure stub to skip OST-object owner changing"
1844 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1845 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
# Owner and group are both set to id 1 while the fail point is active.
1846 chown 1.1 $DIR/$tdir/f0
1847 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1849 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1852 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1854 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1855 mdd.${MDT_DEV}.lfsck_layout |
1856 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1858 error "(2) unexpected status"
1861 local repaired=$($SHOW_LAYOUT |
1862 awk '/^repaired_inconsistent_owner/ { print $2 }')
1863 [ $repaired -eq 1 ] ||
1864 error "(3) Fail to repair inconsistent owner: $repaired"
1866 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17, registered by the run_test call that ends this block.
# It creates a guard file plus two more files while fail_loc 0x1614 is set
# on the MDS, checks that both extra files show the guard size, runs
# layout LFSCK, expects 2 entries in repaired_multiple_referenced and
# then verifies that writing the extra files leaves the guard size alone.
# NOTE(review): the function header, brace closers, part of the intro
# text and the continuation line carrying the target path of the
# setstripe command are not visible in this chunk.
1870 echo "If more than one MDT-objects reference the same OST-object,"
1871 echo "and the OST-object only recognizes one MDT-object, then the"
1872 echo "LFSCK should create new OST-objects for such non-recognized"
1876 check_mount_and_prep
1877 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1879 echo "Inject failure stub to make two MDT-objects to refernce"
1880 echo "the OST-object"
1882 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1883 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# guard is written with 1M of data; mdc and osc locks are cancelled
# after each creation step.
1884 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1885 cancel_lru_locks mdc
1886 cancel_lru_locks osc
1888 createmany -o $DIR/$tdir/f 1
1889 cancel_lru_locks mdc
1890 cancel_lru_locks osc
1892 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
1894 error "(0) Fail to create PFL $DIR/$tdir/f1"
1895 cancel_lru_locks mdc
1896 cancel_lru_locks osc
1897 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Before repair, f0 and f1 are expected to show the 1M size of guard.
1899 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1900 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
1901 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1902 [ $size -eq 1048576 ] ||
1903 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
1905 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
1906 [ $size -eq 1048576 ] ||
1907 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
1909 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1912 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1914 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1915 mdd.${MDT_DEV}.lfsck_layout |
1916 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1918 error "(3) unexpected status"
1921 local repaired=$($SHOW_LAYOUT |
1922 awk '/^repaired_multiple_referenced/ { print $2 }')
1923 [ $repaired -eq 2 ] ||
1924 error "(4) Fail to repair multiple references: $repaired"
# After repair, writing 2M to f0 and to f1 must leave guard at 1M.
1926 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1927 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1928 error "(5) Fail to write f0."
1929 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1930 [ $size -eq 1048576 ] ||
1931 error "(6) guard size should be 1048576, but got $size"
1933 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
1934 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
1935 error "(7) Fail to write f1."
1936 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1937 [ $size -eq 1048576 ] ||
1938 error "(8) guard size should be 1048576, but got $size"
1940 run_test 17 "LFSCK can repair multiple references"
# Top-level statement between test registrations: adds cache to the
# debug parameter on the node running this script; output is discarded.
1942 $LCTL set_param debug=+cache > /dev/null
# Test 18a, registered by the run_test call that ends this block.
# It records the sizes of up to three files, runs chown on them while
# fail_loc 0x1615 is set on the MDS(es), checks that the sizes no longer
# match, runs layout LFSCK with -r -o, checks the repaired_orphan
# counters and finally checks that the sizes match the recorded ones.
# File a2/f2 and every mds2 step are used only when MDSCOUNT is 2 or more.
# NOTE(review): the function header, the closers of the if and for
# blocks, and the assignment of LTIME are not visible in this chunk.
1946 echo "The target MDT-object is there, but related stripe information"
1947 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1948 echo "layout EA entries."
1951 check_mount_and_prep
1952 $LFS mkdir -i 0 $DIR/$tdir/a1
1953 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1954 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# Field 6 of ls -il output is the file size.
1956 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1958 $LFS path2fid $DIR/$tdir/a1/f1
1959 $LFS getstripe $DIR/$tdir/a1/f1
1961 if [ $MDSCOUNT -ge 2 ]; then
1962 $LFS mkdir -i 1 $DIR/$tdir/a2
1963 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1964 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1965 $LFS path2fid $DIR/$tdir/a2/f2
1966 $LFS getstripe $DIR/$tdir/a2/f2
# Third file: composite layout, also written with 2M of data.
1969 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
1970 error "(0) Fail to create PFL $DIR/$tdir/f3"
1972 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
1974 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
1976 $LFS path2fid $DIR/$tdir/f3
1977 $LFS getstripe $DIR/$tdir/f3
1979 cancel_lru_locks osc
1981 echo "Inject failure, to make the MDT-object lost its layout EA"
1982 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1983 do_facet mds1 $LCTL set_param fail_loc=0x1615
1984 chown 1.1 $DIR/$tdir/a1/f1
1986 if [ $MDSCOUNT -ge 2 ]; then
1987 do_facet mds2 $LCTL set_param fail_loc=0x1615
1988 chown 1.1 $DIR/$tdir/a2/f2
1991 chown 1.1 $DIR/$tdir/f3
# Clear the fail point on every MDS that had it set.
1996 do_facet mds1 $LCTL set_param fail_loc=0
1997 if [ $MDSCOUNT -ge 2 ]; then
1998 do_facet mds2 $LCTL set_param fail_loc=0
2001 cancel_lru_locks mdc
2002 cancel_lru_locks osc
# f2 is compared against saved_size1; f1 and f2 are both written with
# bs=1M count=2 above.
2004 echo "The file size should be incorrect since layout EA is lost"
2005 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2006 [ "$cur_size" != "$saved_size1" ] ||
2007 error "(1) Expect incorrect file1 size"
2009 if [ $MDSCOUNT -ge 2 ]; then
2010 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2011 [ "$cur_size" != "$saved_size1" ] ||
2012 error "(2) Expect incorrect file2 size"
2015 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2016 [ "$cur_size" != "$saved_size2" ] ||
2017 error "(1.2) Expect incorrect file3 size"
2019 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2020 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2022 for k in $(seq $MDSCOUNT); do
2023 # The LFSCK status query internal is 30 seconds. For the case
2024 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2025 # time to guarantee the status sync up.
2026 wait_update_facet mds${k} "$LCTL get_param -n \
2027 mdd.$(facet_svc mds${k}).lfsck_layout |
2028 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2029 error "(4) MDS${k} is not the expected 'completed'"
# Each OST is checked once, without waiting.
2032 for k in $(seq $OSTCOUNT); do
2033 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2034 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2035 awk '/^status/ { print $2 }')
2036 [ "$cur_status" == "completed" ] ||
2037 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counts: 3 on mds1 and, when present, 2 on mds2.
2040 local repaired=$(do_facet mds1 $LCTL get_param -n \
2041 mdd.$(facet_svc mds1).lfsck_layout |
2042 awk '/^repaired_orphan/ { print $2 }')
2043 [ $repaired -eq 3 ] ||
2044 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2046 if [ $MDSCOUNT -ge 2 ]; then
2047 repaired=$(do_facet mds2 $LCTL get_param -n \
2048 mdd.$(facet_svc mds2).lfsck_layout |
2049 awk '/^repaired_orphan/ { print $2 }')
2050 [ $repaired -eq 2 ] ||
2051 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print FID and layout of each file again after the scan.
2054 $LFS path2fid $DIR/$tdir/a1/f1
2055 $LFS getstripe $DIR/$tdir/a1/f1
2057 if [ $MDSCOUNT -ge 2 ]; then
2058 $LFS path2fid $DIR/$tdir/a2/f2
2059 $LFS getstripe $DIR/$tdir/a2/f2
2062 $LFS path2fid $DIR/$tdir/f3
2063 $LFS getstripe $DIR/$tdir/f3
2065 echo "The file size should be correct after layout LFSCK scanning"
2066 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2067 [ "$cur_size" == "$saved_size1" ] ||
2068 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2070 if [ $MDSCOUNT -ge 2 ]; then
2071 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2072 [ "$cur_size" == "$saved_size1" ] ||
2073 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# NOTE(review): the (9) message below says file1 although the value
# checked is the size of f3.
2076 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2077 [ "$cur_size" == "$saved_size2" ] ||
2078 error "(9) Expect file1 size $saved_size2, but got $cur_size"
2080 run_test 18a "Find out orphan OST-object and repair it (1)"
# test_18b: drop the MDT-objects of f1, f2 (only with >= 2 MDTs) and the PFL
# file f3 via OBD_FAIL_LFSCK_LOST_MDTOBJ, run layout LFSCK with "-r -o", then
# move the re-created "<FID>-R-0" entries out of .lustre/lost+found/MDTxxxx
# and check that each file reports the size it had before the failure.
test_18b() {
	echo "#####"
	echo "The target MDT-object is lost. The LFSCK should re-create the"
	echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
	echo "can move it back to normal namespace manually."
	echo "#####"

	# keep every helper variable function-local (fid2 used to leak out)
	local fid2
	local k

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/a1
	$LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
	dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
	local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
	local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
	echo ${fid1}
	$LFS getstripe $DIR/$tdir/a1/f1

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS mkdir -i 1 $DIR/$tdir/a2
		$LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
		dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
		fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
		echo ${fid2}
		$LFS getstripe $DIR/$tdir/a2/f2
	fi

	$LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
		error "(0) Fail to create PFL $DIR/$tdir/f3"

	dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2

	local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
	local fid3=$($LFS path2fid $DIR/$tdir/f3)
	echo ${fid3}
	$LFS getstripe $DIR/$tdir/f3

	cancel_lru_locks osc

	echo "Inject failure, to simulate the case of missing the MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/a1/f1

	if [ $MDSCOUNT -ge 2 ]; then
		do_facet mds2 $LCTL set_param fail_loc=0x1616
		rm -f $DIR/$tdir/a2/f2
	fi

	# f3 has to be lost as well: the test later expects ${fid3}-R-0 under
	# lost+found and 3 repaired orphans on mds1.
	rm -f $DIR/$tdir/f3

	# NOTE(review): flush before clearing fail_loc so the removals are
	# processed while the failure is still armed -- confirm the delay.
	sync
	sleep 2

	do_facet mds1 $LCTL set_param fail_loc=0
	if [ $MDSCOUNT -ge 2 ]; then
		do_facet mds2 $LCTL set_param fail_loc=0
	fi

	cancel_lru_locks mdc
	cancel_lru_locks osc

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(2) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(3) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	# quoted so that an empty counter still reaches the error message
	# below instead of a "unary operator expected" shell diagnostic
	[ "$repaired" -eq 3 ] ||
	error "(4.1) Expect 3 fixed on mds1, but got: $repaired"

	if [ $MDSCOUNT -ge 2 ]; then
		repaired=$(do_facet mds2 $LCTL get_param -n \
			   mdd.$(facet_svc mds2).lfsck_layout |
			   awk '/^repaired_orphan/ { print $2 }')
		[ "$repaired" -eq 2 ] ||
		error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
	fi

	echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
	mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
	error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"

	if [ $MDSCOUNT -ge 2 ]; then
		local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
		mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
	fi

	# distinct error id: "(5)" is already used for the f1 move above
	mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
	error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"

	$LFS path2fid $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a1/f1

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS path2fid $DIR/$tdir/a2/f2
		$LFS getstripe $DIR/$tdir/a2/f2
	fi

	$LFS path2fid $DIR/$tdir/f3
	$LFS getstripe $DIR/$tdir/f3

	echo "The file size should be correct after layout LFSCK scanning"
	local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
	[ "$cur_size" == "$saved_size1" ] ||
		error "(7) Expect file1 size $saved_size1, but got $cur_size"

	if [ $MDSCOUNT -ge 2 ]; then
		# f2 was written with the same dd parameters as f1
		cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
		[ "$cur_size" == "$saved_size1" ] ||
		error "(8) Expect file2 size $saved_size1, but got $cur_size"
	fi

	cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
	[ "$cur_size" == "$saved_size2" ] ||
		error "(9) Expect file3 size $saved_size2, but got $cur_size"
}
run_test 18b "Find out orphan OST-object and repair it (2)"
# test_18c: create the files while OBD_FAIL_LFSCK_NOPFID is armed on the OST
# nodes, then drop their MDT-objects with OBD_FAIL_LFSCK_LOST_MDTOBJ. After
# layout LFSCK ("-r -o") the test expects all orphans to be accounted on mds1
# and "*-N-*" stubs to exist under lost+found/MDT0000 only.
test_18c() {
	echo "#####"
	echo "The target MDT-object is lost, and the OST-object FID is missing."
	echo "The LFSCK should re-create the MDT-object with new FID under the "
	echo "directory .lustre/lost+found/MDTxxxx."
	echo "#####"

	local k
	local cname
	local expected

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/a1
	$LFS setstripe -c 1 -i 0 $DIR/$tdir/a1

	echo "Inject failure, to simulate the case of missing parent FID"
	#define OBD_FAIL_LFSCK_NOPFID 0x1617
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617

	dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
	$LFS getstripe $DIR/$tdir/a1/f1

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS mkdir -i 1 $DIR/$tdir/a2
		$LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
		dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
		$LFS getstripe $DIR/$tdir/a2/f2
	fi

	$LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
		error "(0) Fail to create PFL $DIR/$tdir/f3"

	dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
	$LFS getstripe $DIR/$tdir/f3

	cancel_lru_locks osc
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0

	echo "Inject failure, to simulate the case of missing the MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/a1/f1

	if [ $MDSCOUNT -ge 2 ]; then
		do_facet mds2 $LCTL set_param fail_loc=0x1616
		rm -f $DIR/$tdir/a2/f2
	fi

	# NOTE(review): f3 must lose its MDT-object too, otherwise the orphan
	# counts checked below cannot be reached -- confirm.
	rm -f $DIR/$tdir/f3

	# NOTE(review): flush before clearing fail_loc -- confirm the delay.
	sync
	sleep 2

	do_facet mds1 $LCTL set_param fail_loc=0
	if [ $MDSCOUNT -ge 2 ]; then
		do_facet mds2 $LCTL set_param fail_loc=0
	fi

	cancel_lru_locks mdc
	cancel_lru_locks osc

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(2) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(3) OST${k} Expect 'completed', but got '$cur_status'"
	done

	# NOTE(review): f1 contributes one OST-object and the two-component
	# f3 two; f2 (one more) only exists with >= 2 MDTs. Values assumed
	# from that arithmetic -- confirm.
	if [ $MDSCOUNT -ge 2 ]; then
		expected=4
	else
		expected=3
	fi

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	[ "$repaired" -eq "$expected" ] ||
		error "(4) Expect $expected fixed on mds1, but got: $repaired"

	if [ $MDSCOUNT -ge 2 ]; then
		repaired=$(do_facet mds2 $LCTL get_param -n \
			   mdd.$(facet_svc mds2).lfsck_layout |
			   awk '/^repaired_orphan/ { print $2 }')
		[ "$repaired" -eq 0 ] ||
		error "(5) Expect 0 fixed on mds2, but got: $repaired"
	fi

	ls -ail $MOUNT/.lustre/lost+found/

	echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
	if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
		# pattern is quoted so find, not the shell, expands it
		cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
		[ -z "$cname" ] ||
			error "(6) .lustre/lost+found/MDT0001/ should be empty"
	fi

	echo "There should be some stub under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
	[ ! -z "$cname" ] ||
		error "(8) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 18c "Find out orphan OST-object and repair it (3)"
# test_18d: with OBD_FAIL_LFSCK_CHANGE_STRIPE armed, chown f2/f4 and remove
# f1/f3 (the echoes below describe the resulting dangling references), remount
# everything, then run layout LFSCK with "-r -o -c -d". The test expects two
# repaired orphans, zero repaired dangling references, and f2/f4 to report
# their original sizes.
test_18d() {
	echo "#####"
	echo "The target MDT-object layout EA is corrupted, but the right"
	echo "OST-object is still alive as orphan. The layout LFSCK will"
	echo "not create new OST-object to occupy such slot."
	echo "#####"

	local k

	check_mount_and_prep
	# the directory must exist before a default layout can be set on it
	mkdir $DIR/$tdir/a1
	$LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
	echo "guard" > $DIR/$tdir/a1/f1
	echo "foo" > $DIR/$tdir/a1/f2

	echo "guard" > $DIR/$tdir/a1/f3
	$LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
		error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
	echo "foo" > $DIR/$tdir/a1/f4

	local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
	local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
	$LFS path2fid $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a1/f1
	$LFS path2fid $DIR/$tdir/a1/f2
	$LFS getstripe $DIR/$tdir/a1/f2
	$LFS path2fid $DIR/$tdir/a1/f3
	$LFS getstripe $DIR/$tdir/a1/f3
	$LFS path2fid $DIR/$tdir/a1/f4
	$LFS getstripe $DIR/$tdir/a1/f4
	cancel_lru_locks osc

	echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
	echo "to reference the same OST-object (which is f1's OST-obejct)."
	echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
	echo "dangling reference case, but f2's old OST-object is there."

	echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
	echo "to reference the same OST-object (which is f3's OST-obejct)."
	echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
	echo "dangling reference case, but f4's old OST-object is there."
	echo

	#define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
	# "owner:group" is the POSIX separator; "1.1" is a legacy spelling
	chown 1:1 $DIR/$tdir/a1/f2
	chown 1:1 $DIR/$tdir/a1/f4
	rm -f $DIR/$tdir/a1/f1
	rm -f $DIR/$tdir/a1/f3

	# NOTE(review): flush before disarming the failure -- confirm.
	sync
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "stopall to cleanup object cache"
	stopall > /dev/null
	echo "setupall"
	setupall > /dev/null

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(3) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(4) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
			 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	[ "$repaired" -eq 2 ] ||
		error "(5) Expect 2 orphans have been fixed, but got: $repaired"

	repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
		   mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
		   awk '/^repaired_dangling/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(6) Expect 0 dangling has been fixed, but got: $repaired"

	echo "The file size should be correct after layout LFSCK scanning"
	local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
	[ "$cur_size" == "$saved_size1" ] ||
		error "(7) Expect file2 size $saved_size1, but got $cur_size"

	cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
	[ "$cur_size" == "$saved_size2" ] ||
		error "(8) Expect file4 size $saved_size2, but got $cur_size"

	echo "The LFSCK should find back the original data."
	cat $DIR/$tdir/a1/f2
	$LFS path2fid $DIR/$tdir/a1/f2
	$LFS getstripe $DIR/$tdir/a1/f2
	cat $DIR/$tdir/a1/f4
	$LFS path2fid $DIR/$tdir/a1/f4
	$LFS getstripe $DIR/$tdir/a1/f4
}
run_test 18d "Find out orphan OST-object and repair it (4)"
# test_18e: same corruption as test_18d, but layout LFSCK ("-r -o -c") is
# slowed down with OBD_FAIL_LFSCK_DELAY3 and f2/f4 are appended to once mds1
# reports 'scanning-phase2'. The test then expects two repaired orphans and two
# "*-C-*" stub files under lost+found/MDT0000 whose sizes match the sizes f2
# or f4 had before the corruption.
test_18e() {
	echo "#####"
	echo "The target MDT-object layout EA slot is occpuied by some new"
	echo "created OST-object when repair dangling reference case. Such"
	echo "conflict OST-object has been modified by others. To keep the"
	echo "new data, the LFSCK will create a new file to refernece this"
	echo "old orphan OST-object."
	echo "#####"

	local k
	local cname
	local stub_dir=$MOUNT/.lustre/lost+found/MDT0000

	check_mount_and_prep
	# the directory must exist before a default layout can be set on it
	mkdir $DIR/$tdir/a1
	$LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
	echo "guard" > $DIR/$tdir/a1/f1
	echo "foo" > $DIR/$tdir/a1/f2

	echo "guard" > $DIR/$tdir/a1/f3
	$LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
		error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
	echo "foo" > $DIR/$tdir/a1/f4

	local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
	local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')

	$LFS path2fid $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a1/f1
	$LFS path2fid $DIR/$tdir/a1/f2
	$LFS getstripe $DIR/$tdir/a1/f2
	$LFS path2fid $DIR/$tdir/a1/f3
	$LFS getstripe $DIR/$tdir/a1/f3
	$LFS path2fid $DIR/$tdir/a1/f4
	$LFS getstripe $DIR/$tdir/a1/f4
	cancel_lru_locks osc

	echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
	echo "to reference the same OST-object (which is f1's OST-obejct)."
	echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
	echo "dangling reference case, but f2's old OST-object is there."

	echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
	echo "to reference the same OST-object (which is f3's OST-obejct)."
	echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
	echo "dangling reference case, but f4's old OST-object is there."
	echo

	#define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
	# "owner:group" is the POSIX separator; "1.1" is a legacy spelling
	chown 1:1 $DIR/$tdir/a1/f2
	chown 1:1 $DIR/$tdir/a1/f4
	rm -f $DIR/$tdir/a1/f1
	rm -f $DIR/$tdir/a1/f3

	# NOTE(review): flush before disarming the failure -- confirm.
	sync
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "stopall to cleanup object cache"
	stopall > /dev/null
	echo "setupall"
	setupall > /dev/null

	#define OBD_FAIL_LFSCK_DELAY3 0x1602
	do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602

	start_full_debug_logging

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"

	wait_update_facet mds1 "$LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_layout |
		awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
		error "(3) MDS1 is not the expected 'scanning-phase2'"

	# to guarantee all updates are synced.
	sync
	sleep 2

	echo "Write new data to f2/f4 to modify the new created OST-object."
	echo "dummy" >> $DIR/$tdir/a1/f2
	echo "dummy" >> $DIR/$tdir/a1/f4

	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(4) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(5) OST${k} Expect 'completed', but got '$cur_status'"
	done

	stop_full_debug_logging

	local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
			 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	[ "$repaired" -eq 2 ] ||
		error "(6) Expect 2 orphans have been fixed, but got: $repaired"

	echo "There should be stub file under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	# Count with find (quoted pattern) rather than by parsing "ls -l",
	# matching how the stubs are looked up right below.
	local count=$(find $stub_dir/ -name '*-C-*' | wc -l)
	if [ "$count" -ne 2 ]; then
		ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
		error "(8) Expect 2 stubs under lost+found, but got $count"
	fi

	echo "The stub file should keep the original f2 or f4 data"
	cname=$(find $stub_dir/ -name '*-C-*' | head -n 1)
	local cur_size=$(ls -il $cname | awk '{ print $6 }')
	# a stub is only acceptable if it has the old size of f2 or of f4
	if [[ "$cur_size" != "$saved_size1" &&
	      "$cur_size" != "$saved_size2" ]]; then
		error "(9) Got unexpected $cur_size"
	fi

	cat $cname
	$LFS path2fid $cname
	$LFS getstripe $cname

	cname=$(find $stub_dir/ -name '*-C-*' | tail -n 1)
	cur_size=$(ls -il $cname | awk '{ print $6 }')
	if [[ "$cur_size" != "$saved_size1" &&
	      "$cur_size" != "$saved_size2" ]]; then
		error "(10) Got unexpected $cur_size"
	fi

	cat $cname
	$LFS path2fid $cname
	$LFS getstripe $cname

	echo "The f2/f4 should contains new data."
	cat $DIR/$tdir/a1/f2
	$LFS path2fid $DIR/$tdir/a1/f2
	$LFS getstripe $DIR/$tdir/a1/f2
	cat $DIR/$tdir/a1/f4
	$LFS path2fid $DIR/$tdir/a1/f4
	$LFS getstripe $DIR/$tdir/a1/f4
}
run_test 18e "Find out orphan OST-object and repair it (5)"
# test_18f: lose the MDT-objects of single- and two-stripe files, then run
# layout LFSCK while OBD_FAIL_LFSCK_BAD_NETWORK (fail_val=0) is armed on mds1.
# The first run is expected to end 'partial' on the MDTs and on ost1 but
# 'completed' on ost2, with one orphan repaired per MDT; a second, undisturbed
# run is expected to complete everywhere with two orphans repaired per MDT.
test_18f() {
	[ $OSTCOUNT -lt 2 ] &&
		skip "The test needs at least 2 OSTs" && return

	echo "#####"
	echo "The target MDT-object is lost. The LFSCK should re-create the"
	echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
	echo "to verify some OST-object(s) during the first stage-scanning,"
	echo "the LFSCK should skip orphan OST-objects for such OST. Others"
	echo "should not be affected."
	echo "#####"

	# declared once up front; cur_status used to leak into the caller
	local k
	local cur_status
	local repaired

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/a1
	$LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
	dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
	dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
	$LFS mkdir -i 0 $DIR/$tdir/a2
	$LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
	dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
	$LFS getstripe $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a2/f2

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS mkdir -i 1 $DIR/$tdir/a3
		$LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
		dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
		dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
		$LFS mkdir -i 1 $DIR/$tdir/a4
		$LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
		dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
		$LFS getstripe $DIR/$tdir/a3/f3
		$LFS getstripe $DIR/$tdir/a4/f4
	fi

	cancel_lru_locks osc

	echo "Inject failure, to simulate the case of missing the MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/a1/f1
	rm -f $DIR/$tdir/a2/f2

	if [ $MDSCOUNT -ge 2 ]; then
		do_facet mds2 $LCTL set_param fail_loc=0x1616
		rm -f $DIR/$tdir/a3/f3
		rm -f $DIR/$tdir/a4/f4
	fi

	# NOTE(review): flush before clearing fail_loc -- confirm the delay.
	sync
	sleep 2

	do_facet mds1 $LCTL set_param fail_loc=0
	if [ $MDSCOUNT -ge 2 ]; then
		do_facet mds2 $LCTL set_param fail_loc=0
	fi

	cancel_lru_locks mdc
	cancel_lru_locks osc

	echo "Inject failure, to simulate the OST0 fail to handle"
	echo "MDT0 LFSCK request during the first-stage scanning."
	#define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
	do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
			error "(2) MDS${k} is not the expected 'partial'"
	done

	wait_update_facet ost1 "$LCTL get_param -n \
		obdfilter.$(facet_svc ost1).lfsck_layout |
		awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
		error "(3) OST1 is not the expected 'partial'"

	wait_update_facet ost2 "$LCTL get_param -n \
		obdfilter.$(facet_svc ost2).lfsck_layout |
		awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
		error "(4) OST2 is not the expected 'completed'"

	do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0

	repaired=$(do_facet mds1 $LCTL get_param -n \
		   mdd.$(facet_svc mds1).lfsck_layout |
		   awk '/^repaired_orphan/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(5) Expect 1 fixed on mds{1}, but got: $repaired"

	if [ $MDSCOUNT -ge 2 ]; then
		repaired=$(do_facet mds2 $LCTL get_param -n \
			   mdd.$(facet_svc mds2).lfsck_layout |
			   awk '/^repaired_orphan/ { print $2 }')
		[ "$repaired" -eq 1 ] ||
		error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
	fi

	echo "Trigger layout LFSCK on all devices again to cleanup"
	$START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(8) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
			     obdfilter.$(facet_svc ost${k}).lfsck_layout |
			     awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(9) OST${k} Expect 'completed', but got '$cur_status'"
	done

	repaired=$(do_facet mds1 $LCTL get_param -n \
		   mdd.$(facet_svc mds1).lfsck_layout |
		   awk '/^repaired_orphan/ { print $2 }')
	[ "$repaired" -eq 2 ] ||
		error "(10) Expect 2 fixed on mds{1}, but got: $repaired"

	if [ $MDSCOUNT -ge 2 ]; then
		repaired=$(do_facet mds2 $LCTL get_param -n \
			   mdd.$(facet_svc mds2).lfsck_layout |
			   awk '/^repaired_orphan/ { print $2 }')
		[ "$repaired" -eq 2 ] ||
		error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
	fi
}
run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# test_18g: remove a file striped with "-c -1" while
# OBD_FAIL_LFSCK_LOST_MDTOBJ2 is armed on mds1, run layout LFSCK ("-r -o"),
# expect $OSTCOUNT repaired orphans on mds1, and move the re-created
# "<FID>-R-0" entry from lost+found/MDT0000 back to its original path.
test_18g() {
	echo "#####"
	echo "The target MDT-object is lost, but related OI mapping is there"
	echo "The LFSCK should recreate the lost MDT-object without affected"
	echo "by the stale OI mapping."
	echo "#####"

	local k

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/a1
	$LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
	# one 1M block per OST, matching the 1M stripe size set above
	dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
	local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
	echo ${fid1}
	$LFS getstripe $DIR/$tdir/a1/f1
	cancel_lru_locks osc

	echo "Inject failure to simulate lost MDT-object but keep OI mapping"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
	do_facet mds1 $LCTL set_param fail_loc=0x162e
	rm -f $DIR/$tdir/a1/f1

	do_facet mds1 $LCTL set_param fail_loc=0
	cancel_lru_locks mdc
	cancel_lru_locks osc

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(2) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(3) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	# quoted so that an empty counter still reaches the error message
	[ "$repaired" -eq "$OSTCOUNT" ] ||
		error "(4) Expect $OSTCOUNT fixed, but got: $repaired"

	echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
	mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
	error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"

	$LFS path2fid $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a1/f1
}
run_test 18g "Find out orphan OST-object and repair it (7)"
# test_18h: chown a two-component PFL file while OBD_FAIL_LFSCK_BAD_PFL_RANGE
# is armed, verify that writing to it now fails, run layout LFSCK ("-r -o"),
# then expect two repaired orphans, unchanged file content (compared against
# the "guard" copy) and a successful write.
test_18h() {
	echo "#####"
	echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
	echo "the layout LFSCK will keep the bad PFL file(s) there without"
	echo "scanning its OST-object(s). Then in the second stage scanning,"
	echo "the OST will return related OST-object(s) to the MDT as orphan."
	echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
	echo "the 'orphan(s)' stripe information."
	echo "#####"

	# cur_status used to be assigned without "local" and leaked out
	local k
	local cur_status

	check_mount_and_prep

	$LFS setstripe -E 2M -c 1 -E -1 $DIR/$tdir/f0 ||
		error "(0) Fail to create PFL $DIR/$tdir/f0"

	cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
		error "(1.1) Fail to write $DIR/$tdir/f0"

	# second copy at a 2M offset, i.e. past the "-E 2M" boundary set above
	dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
		error "(1.2) Fail to write $DIR/$tdir/f0"

	# reference copy for the data comparison after the repair
	cp $DIR/$tdir/f0 $DIR/$tdir/guard

	echo "Inject failure stub to simulate bad PFL extent range"
	#define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f

	# "owner:group" is the POSIX separator; "1.1" is a legacy spelling
	chown 1:1 $DIR/$tdir/f0

	cancel_lru_locks mdc
	cancel_lru_locks osc
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
		error "(2) Write to bad PFL file should fail"

	echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
	$START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(4.1) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
			     obdfilter.$(facet_svc ost${k}).lfsck_layout |
			     awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$($SHOW_LAYOUT |
			 awk '/^repaired_orphan/ { print $2 }')
	[ "$repaired" -eq 2 ] ||
		error "(5) Fail to repair crashed PFL range: $repaired"

	echo "Data in $DIR/$tdir/f0 should not be broken"
	diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
		error "(6) Data in $DIR/$tdir/f0 is broken"

	echo "Write should succeed after LFSCK repairing the bad PFL range"
	dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
		error "(7) Write should succeed after LFSCK"
}
run_test 18h "LFSCK can repair crashed PFL extent range"
$LCTL set_param debug=-cache > /dev/null

# test_19a: write a plain file and a PFL file with lfsck_verify_pfid turned
# off for OST0000, turn it on, arm OBD_FAIL_LFSCK_INVALID_PFID on the node
# running this script, and require that reading either file fails.
test_19a() {
	local verify_pfid=obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid

	check_mount_and_prep
	$LFS setstripe -c 1 -i 0 $DIR/$tdir

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		$verify_pfid 0

	echo "foo1" > $DIR/$tdir/a0
	if ! $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1
	then
		error "(0) Fail to create PFL $DIR/$tdir/a1"
	fi
	echo "foo2" > $DIR/$tdir/a1
	echo "guard" > $DIR/$tdir/a2
	cancel_lru_locks osc

	echo "Inject failure, then client will offer wrong parent FID when read"
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		$verify_pfid 1

	#define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
	$LCTL set_param fail_loc=0x1619

	echo "Read RPC with wrong parent FID should be denied"
	# a successful read is the failure case here
	if cat $DIR/$tdir/a0; then
		error "(3.1) Read a0 should be denied!"
	fi
	if cat $DIR/$tdir/a1; then
		error "(3.2) Read a1 should be denied!"
	fi
	$LCTL set_param fail_loc=0
}
run_test 19a "OST-object inconsistency self detect"
# test_19b: create a plain file and a PFL file while
# OBD_FAIL_LFSCK_UNMATCHED_PAIR1 is armed on the OST nodes and
# lfsck_verify_pfid is off. The test expects a "repaired" count of 0 while
# verification is off, and a count of 2 (one per file) after verification is
# turned on and both files have been read.
test_19b() {
	check_mount_and_prep
	$LFS setstripe -c 1 -i 0 $DIR/$tdir

	echo "Inject failure stub to make the OST-object to back point to"
	echo "non-exist MDT-object"

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0

	#define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
	echo "foo1" > $DIR/$tdir/f0
	$LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
		error "(0) Fail to create PFL $DIR/$tdir/f1"
	echo "foo2" > $DIR/$tdir/f1
	cancel_lru_locks osc
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0

	do_facet ost1 $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
	echo "Nothing should be fixed since self detect and repair is disabled"
	local repaired=$(do_facet ost1 $LCTL get_param -n \
			obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
			awk '/^repaired/ { print $2 }')
	# quoted so that an empty counter still reaches the error message
	[ "$repaired" -eq 0 ] ||
		error "(1) Expected 0 repaired, but got $repaired"

	echo "Read RPC with right parent FID should be accepted,"
	echo "and cause parent FID on OST to be fixed"

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1

	cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
	cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"

	repaired=$(do_facet ost1 $LCTL get_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
		awk '/^repaired/ { print $2 }')
	# two files were read (f0 and f1); the message now matches the check
	[ "$repaired" -eq 2 ] ||
		error "(3) Expected 2 repaired, but got $repaired"
}
run_test 19b "OST-object inconsistency self repair"
2942 PATTERN_WITH_HOLE="40000001"
2943 PATTERN_WITHOUT_HOLE="1"
# test_20a body: lose the MDT-object (and, for f1..f3, one selected
# OST-object) of files striped over 2 or 3 OSTs, run layout LFSCK with orphan
# handling, and verify the files re-created under
# .lustre/lost+found/MDT0000/<FID>-R-0: layout pattern (hole flag or not),
# stripe count, size, read/write behaviour, chown, touch and unlink.
# Skipped when fewer than 2 OSTs are configured.
2946 [ $OSTCOUNT -lt 2 ] &&
2947 skip "The test needs at least 2 OSTs" && return
2950 echo "The target MDT-object and some of its OST-object are lost."
2951 echo "The LFSCK should find out the left OST-objects and re-create"
2952 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2953 echo "with the partial OST-objects (LOV EA hole)."
2955 echo "New client can access the file with LOV EA hole via normal"
2956 echo "system tools or commands without crash the system."
2958 echo "For old client, even though it cannot access the file with"
2959 echo "LOV EA hole, it should not cause the system crash."
2962 check_mount_and_prep
# a1 is created on MDT index 0; its default layout uses 3 stripes when more
# than two OSTs exist and 2 stripes otherwise (1M stripe size, from OST 0).
2963 $LFS mkdir -i 0 $DIR/$tdir/a1
2964 if [ $OSTCOUNT -gt 2 ]; then
2965 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
2968 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
2972 # 256 blocks on the stripe0.
2973 # 1 block on the stripe1 for 2 OSTs case.
2974 # 256 blocks on the stripe1 for other cases.
2975 # 1 block on the stripe2 if OSTs > 2
2976 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2977 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2978 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# Remember every FID: the recovered orphans are looked up later by the name
# <FID>-R-0 under lost+found.
2980 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2981 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2982 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2985 $LFS getstripe $DIR/$tdir/a1/f0
2987 $LFS getstripe $DIR/$tdir/a1/f1
2989 $LFS getstripe $DIR/$tdir/a1/f2
# f3 is only used when a third OST is available (it loses OST-object2 below).
2991 if [ $OSTCOUNT -gt 2 ]; then
2992 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
# fid3 is function-local like fid0..fid2 so it cannot leak into later tests.
2993 local fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2995 $LFS getstripe $DIR/$tdir/a1/f3
2998 cancel_lru_locks osc
3000 echo "Inject failure..."
3001 echo "To simulate f0 lost MDT-object"
3002 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3003 do_facet mds1 $LCTL set_param fail_loc=0x1616
3004 rm -f $DIR/$tdir/a1/f0
3006 echo "To simulate f1 lost MDT-object and OST-object0"
3007 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3008 do_facet mds1 $LCTL set_param fail_loc=0x161a
3009 rm -f $DIR/$tdir/a1/f1
# fail_loc stays at 0x161a for f2 and f3; only fail_val changes (1, then 2),
# matching the OST-object index named in the accompanying echo text.
3011 echo "To simulate f2 lost MDT-object and OST-object1"
3012 do_facet mds1 $LCTL set_param fail_val=1
3013 rm -f $DIR/$tdir/a1/f2
3015 if [ $OSTCOUNT -gt 2 ]; then
3016 echo "To simulate f3 lost MDT-object and OST-object2"
3017 do_facet mds1 $LCTL set_param fail_val=2
3018 rm -f $DIR/$tdir/a1/f3
3021 umount_client $MOUNT
3024 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# Layout LFSCK is started with -r and -o on top of $START_LAYOUT; every MDT
# is polled until its status is 'completed', every OST is checked once.
3026 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3027 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3029 for k in $(seq $MDSCOUNT); do
3030 # The LFSCK status query internal is 30 seconds. For the case
3031 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3032 # time to guarantee the status sync up.
3033 wait_update_facet mds${k} "$LCTL get_param -n \
3034 mdd.$(facet_svc mds${k}).lfsck_layout |
3035 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3036 error "(2) MDS${k} is not the expected 'completed'"
3039 for k in $(seq $OSTCOUNT); do
3040 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3041 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3042 awk '/^status/ { print $2 }')
3043 [ "$cur_status" == "completed" ] ||
3044 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# repaired_orphan on mds1 must be 9 with more than two OSTs, 4 otherwise.
3047 local repaired=$(do_facet mds1 $LCTL get_param -n \
3048 mdd.$(facet_svc mds1).lfsck_layout |
3049 awk '/^repaired_orphan/ { print $2 }')
3050 if [ $OSTCOUNT -gt 2 ]; then
3051 [ $repaired -eq 9 ] ||
3052 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3054 [ $repaired -eq 4 ] ||
3055 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
3058 mount_client $MOUNT || error "(5.0) Fail to start client!"
# NOTE(review): LOV_PATTERN_F_HOLE is not referenced by any check visible in
# this test (they compare against PATTERN_WITH_HOLE); confirm whether it is
# still needed elsewhere before removing it.
3060 LOV_PATTERN_F_HOLE=0x40000000
3063 # ${fid0}-R-0 is the old f0
# f0 lost only its MDT-object, so the recovered file must have no hole flag,
# the full stripe count and size, and accept read, append, chown, touch, rm.
3065 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3066 echo "Check $name, which is the old f0"
3068 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
3070 local pattern=$($LFS getstripe -L $name)
3071 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3072 error "(5.2) NOT expect pattern flag hole, but got $pattern"
3074 local stripes=$($LFS getstripe -c $name)
3075 if [ $OSTCOUNT -gt 2 ]; then
3076 [ $stripes -eq 3 ] ||
3077 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3079 [ $stripes -eq 2 ] ||
3080 error "(5.3.2) expect the stripe count is 2, but got $stripes"
3083 local size=$(stat $name | awk '/Size:/ { print $2 }')
3084 [ $size -eq $((4096 * $bcount)) ] ||
3085 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
3087 cat $name > /dev/null || error "(5.5) cannot read $name"
3089 echo "dummy" >> $name || error "(5.6) cannot write $name"
3091 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3093 touch $name || error "(5.8) cannot touch $name"
3095 rm -f $name || error "(5.9) cannot unlink $name"
3098 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3100 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3101 if [ $OSTCOUNT -gt 2 ]; then
3102 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3104 echo "Check $name, it contains the old f1's stripe1"
3107 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3109 pattern=$($LFS getstripe -L $name)
3110 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3111 error "(6.2) expect pattern flag hole, but got $pattern"
3113 stripes=$($LFS getstripe -c $name)
3114 if [ $OSTCOUNT -gt 2 ]; then
3115 [ $stripes -eq 3 ] ||
3116 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3118 [ $stripes -eq 2 ] ||
3119 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3122 size=$(stat $name | awk '/Size:/ { print $2 }')
3123 [ $size -eq $((4096 * $bcount)) ] ||
3124 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
# A plain sequential read must fail on the hole; dd with conv=sync,noerror
# keeps going, and the "Input/output error" lines it prints are counted.
3126 cat $name > /dev/null && error "(6.5) normal read $name should fail"
3128 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3129 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3132 [ $failures -eq 256 ] ||
3133 error "(6.6) expect 256 IO failures, but get $failures"
3135 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3136 [ $size -eq $((4096 * $bcount)) ] ||
3137 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Block 0 is expected to fall in the hole (write must fail); block 300 is
# expected to fall in a surviving stripe (write must succeed).
3139 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3140 error "(6.8) write to the LOV EA hole should fail"
3142 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3143 error "(6.9) write to normal stripe should NOT fail"
3145 echo "foo" >> $name && error "(6.10) append write $name should fail"
3147 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3149 touch $name || error "(6.12) cannot touch $name"
3151 rm -f $name || error "(6.13) cannot unlink $name"
3154 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3156 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3157 if [ $OSTCOUNT -gt 2 ]; then
3158 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3160 echo "Check $name, it contains the old f2's stripe0"
3163 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3165 pattern=$($LFS getstripe -L $name)
3166 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3167 error "(7.2) expect pattern flag hole, but got $pattern"
3169 stripes=$($LFS getstripe -c $name)
3170 size=$(stat $name | awk '/Size:/ { print $2 }')
# The 7.x.1 checks apply when OSTCOUNT > 2, the 7.x.2 checks otherwise.
3171 if [ $OSTCOUNT -gt 2 ]; then
3172 [ $stripes -eq 3 ] ||
3173 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3175 [ $size -eq $((4096 * $bcount)) ] ||
3176 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3178 cat $name > /dev/null &&
3179 error "(7.5.1) normal read $name should fail"
3181 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3182 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3184 [ $failures -eq 256 ] ||
3185 error "(7.6) expect 256 IO failures, but get $failures"
3187 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3188 [ $size -eq $((4096 * $bcount)) ] ||
3189 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
3191 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3192 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3194 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3195 error "(7.8.1) write to normal stripe should NOT fail"
3197 echo "foo" >> $name &&
3198 error "(7.8.3) append write $name should fail"
3200 chown $RUNAS_ID:$RUNAS_GID $name ||
3201 error "(7.9.1) cannot chown on $name"
3203 touch $name || error "(7.10.1) cannot touch $name"
3205 [ $stripes -eq 2 ] ||
3206 error "(7.3.2) expect the stripe count is 2, but got $stripes"
3209 [ $size -eq $((4096 * (256 + 0))) ] ||
3210 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3212 cat $name > /dev/null &&
3213 error "(7.5.2) normal read $name should fail"
3215 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3216 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3217 [ $failures -eq 256 ] ||
3218 error "(7.6.2) expect 256 IO failures, but get $failures"
3221 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3222 [ $size -eq $((4096 * $bcount)) ] ||
3223 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
3225 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3226 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3228 chown $RUNAS_ID:$RUNAS_GID $name ||
3229 error "(7.9.2) cannot chown on $name"
3231 touch $name || error "(7.10.2) cannot touch $name"
3234 rm -f $name || error "(7.11) cannot unlink $name"
# The f3 checks only apply when a third OST exists.
3236 [ $OSTCOUNT -le 2 ] && return
3239 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
3241 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3242 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3244 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3246 pattern=$($LFS getstripe -L $name)
3247 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3248 error "(8.2) expect pattern flag hole, but got $pattern"
3250 stripes=$($LFS getstripe -c $name)
3251 [ $stripes -eq 3 ] ||
3252 error "(8.3) expect the stripe count is 3, but got $stripes"
3254 size=$(stat $name | awk '/Size:/ { print $2 }')
3256 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3257 error "(8.4) expect the size $((4096 * 512)), but got $size"
3259 cat $name > /dev/null &&
3260 error "(8.5) normal read $name should fail"
3262 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3263 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3265 [ $failures -eq 256 ] ||
3266 error "(8.6) expect 256 IO failures, but get $failures"
3269 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3270 [ $size -eq $((4096 * $bcount)) ] ||
3271 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
3273 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3274 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3276 chown $RUNAS_ID:$RUNAS_GID $name ||
3277 error "(8.9) cannot chown on $name"
3279 touch $name || error "(8.10) cannot touch $name"
3281 rm -f $name || error "(8.11) cannot unlink $name"
3283 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# test_20b body: PFL variant of test_20a. Three files are created with a
# two-component layout ([0, 2M) and [2M, EOF), 2 stripes of 1M each), layout
# LFSCK with orphan handling is run, and the files re-created under
# .lustre/lost+found/MDT0000/<FID>-R-0 are checked per component: pattern,
# stripe count, extent boundaries, size, I/O behaviour, chown/touch/unlink.
# Skipped when fewer than 2 OSTs are configured.
3286 [ $OSTCOUNT -lt 2 ] &&
3287 skip "The test needs at least 2 OSTs" && return
3290 echo "The target MDT-object and some of its OST-object are lost."
3291 echo "The LFSCK should find out the left OST-objects and re-create"
3292 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3293 echo "with the partial OST-objects (LOV EA hole)."
3295 echo "New client can access the file with LOV EA hole via normal"
3296 echo "system tools or commands without crash the system - PFL case."
3299 check_mount_and_prep
3301 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3302 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3303 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3304 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3305 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3306 error "(2) Fail to create PFL file $DIR/$tdir/f2"
# 769 blocks of 4096 bytes: 3M plus one block, so data reaches into the
# second component.
3308 local bcount=$((256 * 3 + 1))
3310 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3311 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3312 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
# The FIDs name the recovered orphans (<FID>-R-0) checked further down.
3314 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3315 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3316 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3319 $LFS getstripe $DIR/$tdir/f0
3321 $LFS getstripe $DIR/$tdir/f1
3323 $LFS getstripe $DIR/$tdir/f2
3325 cancel_lru_locks mdc
3326 cancel_lru_locks osc
3328 echo "Inject failure..."
3329 echo "To simulate f0 lost MDT-object"
3330 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3331 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3334 echo "To simulate the case of f1 lost MDT-object and "
3335 echo "the first OST-object in each PFL component"
3336 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3337 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3340 echo "To simulate the case of f2 lost MDT-object and "
3341 echo "the second OST-object in each PFL component"
3342 do_facet $SINGLEMDS $LCTL set_param fail_val=1
3347 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Layout LFSCK is started with -r and -o on top of $START_LAYOUT; every MDT
# is polled until its status is 'completed', every OST is checked once.
3349 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3350 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3352 for k in $(seq $MDSCOUNT); do
3353 # The LFSCK status query internal is 30 seconds. For the case
3354 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3355 # time to guarantee the status sync up.
3356 wait_update_facet mds${k} "$LCTL get_param -n \
3357 mdd.$(facet_svc mds${k}).lfsck_layout |
3358 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3359 error "(4) MDS${k} is not the expected 'completed'"
3362 for k in $(seq $OSTCOUNT); do
3363 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3364 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3365 awk '/^status/ { print $2 }')
3366 [ "$cur_status" == "completed" ] ||
3367 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# repaired_orphan on mds1 must be exactly 8.
3370 local repaired=$(do_facet mds1 $LCTL get_param -n \
3371 mdd.$(facet_svc mds1).lfsck_layout |
3372 awk '/^repaired_orphan/ { print $2 }')
3373 [ $repaired -eq 8 ] ||
3374 error "(6) Expect 8 fixed on mds1, but got: $repaired"
3377 # ${fid0}-R-0 is the old f0
# f0: both components must be hole-free with 2 stripes each, extents
# [0, 2097152) and [2097152, EOF), full size, and normal I/O must work.
3379 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3380 echo "Check $name, which is the old f0"
3382 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3384 local pattern=$($LFS getstripe -L -I 1 $name)
3385 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3386 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3388 pattern=$($LFS getstripe -L -I 2 $name)
3389 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3390 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3392 local stripes=$($LFS getstripe -c -I 1 $name)
3393 [ $stripes -eq 2 ] ||
3394 error "(7.3.1) expect 2 stripes, but got $stripes"
3396 stripes=$($LFS getstripe -c -I 2 $name)
3397 [ $stripes -eq 2 ] ||
3398 error "(7.3.2) expect 2 stripes, but got $stripes"
3400 local e_start=$($LFS getstripe -I 1 $name |
3401 awk '/lcme_extent.e_start:/ { print $2 }')
3402 [ $e_start -eq 0 ] ||
3403 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3405 local e_end=$($LFS getstripe -I 1 $name |
3406 awk '/lcme_extent.e_end:/ { print $2 }')
3407 [ $e_end -eq 2097152 ] ||
3408 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3410 e_start=$($LFS getstripe -I 2 $name |
3411 awk '/lcme_extent.e_start:/ { print $2 }')
3412 [ $e_start -eq 2097152 ] ||
3413 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
3415 e_end=$($LFS getstripe -I 2 $name |
3416 awk '/lcme_extent.e_end:/ { print $2 }')
3417 [ "$e_end" = "EOF" ] ||
3418 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3420 local size=$(stat $name | awk '/Size:/ { print $2 }')
3421 [ $size -eq $((4096 * $bcount)) ] ||
3422 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
3424 cat $name > /dev/null || error "(7.7) cannot read $name"
3426 echo "dummy" >> $name || error "(7.8) cannot write $name"
3428 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3430 touch $name || error "(7.10) cannot touch $name"
3432 rm -f $name || error "(7.11) cannot unlink $name"
3435 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3437 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3438 echo "Check $name, it contains f1's second OST-object in each COMP"
3440 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3442 pattern=$($LFS getstripe -L -I 1 $name)
3443 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3444 error "(8.2.1) expect pattern flag hole, but got $pattern"
3446 pattern=$($LFS getstripe -L -I 2 $name)
3447 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3448 error "(8.2.2) expect pattern flag hole, but got $pattern"
# Component 1 is reported as 8.3.1 and component 2 as 8.3.2, so a failure
# identifies which component had the wrong stripe count.
3450 stripes=$($LFS getstripe -c -I 1 $name)
3451 [ $stripes -eq 2 ] ||
3452 error "(8.3.1) expect 2 stripes, but got $stripes"
3454 stripes=$($LFS getstripe -c -I 2 $name)
3455 [ $stripes -eq 2 ] ||
3456 error "(8.3.2) expect 2 stripes, but got $stripes"
3458 e_start=$($LFS getstripe -I 1 $name |
3459 awk '/lcme_extent.e_start:/ { print $2 }')
3460 [ $e_start -eq 0 ] ||
3461 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3463 e_end=$($LFS getstripe -I 1 $name |
3464 awk '/lcme_extent.e_end:/ { print $2 }')
3465 [ $e_end -eq 2097152 ] ||
3466 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3468 e_start=$($LFS getstripe -I 2 $name |
3469 awk '/lcme_extent.e_start:/ { print $2 }')
3470 [ $e_start -eq 2097152 ] ||
3471 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3473 e_end=$($LFS getstripe -I 2 $name |
3474 awk '/lcme_extent.e_end:/ { print $2 }')
3475 [ "$e_end" = "EOF" ] ||
3476 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3478 size=$(stat $name | awk '/Size:/ { print $2 }')
3479 [ $size -eq $((4096 * $bcount)) ] ||
3480 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
# A plain sequential read must fail; dd with conv=sync,noerror keeps going
# and its "Input/output error" lines are counted.
3482 cat $name > /dev/null && error "(8.7) normal read $name should fail"
3484 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3485 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3487 # The first stripe in each COMP was lost
3488 [ $failures -eq 512 ] ||
3489 error "(8.8) expect 512 IO failures, but get $failures"
3491 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3492 [ $size -eq $((4096 * $bcount)) ] ||
3493 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
3495 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3496 error "(8.10) write to the LOV EA hole should fail"
3498 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3499 error "(8.11) write to normal stripe should NOT fail"
3501 echo "foo" >> $name && error "(8.12) append write $name should fail"
3503 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3505 touch $name || error "(8.14) cannot touch $name"
3507 rm -f $name || error "(8.15) cannot unlink $name"
3510 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3512 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3513 echo "Check $name, it contains f2's first stripe in each COMP"
3515 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3517 pattern=$($LFS getstripe -L -I 1 $name)
3518 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3519 error "(9.2.1) expect pattern flag hole, but got $pattern"
3521 pattern=$($LFS getstripe -L -I 2 $name)
3522 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3523 error "(9.2.2) expect pattern flag hole, but got $pattern"
# Component 1 is reported as 9.3.1 and component 2 as 9.3.2.
3525 stripes=$($LFS getstripe -c -I 1 $name)
3526 [ $stripes -eq 2 ] ||
3527 error "(9.3.1) expect 2 stripes, but got $stripes"
3529 stripes=$($LFS getstripe -c -I 2 $name)
3530 [ $stripes -eq 2 ] ||
3531 error "(9.3.2) expect 2 stripes, but got $stripes"
3533 e_start=$($LFS getstripe -I 1 $name |
3534 awk '/lcme_extent.e_start:/ { print $2 }')
3535 [ $e_start -eq 0 ] ||
3536 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3538 e_end=$($LFS getstripe -I 1 $name |
3539 awk '/lcme_extent.e_end:/ { print $2 }')
3540 [ $e_end -eq 2097152 ] ||
3541 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3543 e_start=$($LFS getstripe -I 2 $name |
3544 awk '/lcme_extent.e_start:/ { print $2 }')
3545 [ $e_start -eq 2097152 ] ||
3546 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3548 e_end=$($LFS getstripe -I 2 $name |
3549 awk '/lcme_extent.e_end:/ { print $2 }')
3550 [ "$e_end" = "EOF" ] ||
3551 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3553 size=$(stat $name | awk '/Size:/ { print $2 }')
3554 # The second stripe in COMP was lost, so we do not know there
3555 # have ever been some data before. 'stat' will regard it as
3556 # no data on the lost stripe.
3558 [ $size -eq $((4096 * $bcount)) ] ||
3559 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3561 cat $name > /dev/null &&
3562 error "(9.7) normal read $name should fail"
# The check requires 512 failures; the message reports the same number.
3564 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3565 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3566 [ $failures -eq 512 ] ||
3567 error "(9.8) expect 512 IO failures, but get $failures"
3569 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3570 # The second stripe in COMP was lost, so we do not know there
3571 # have ever been some data before. Since 'dd' skip failure,
3572 # it will regard the lost stripe contains data.
3574 [ $size -eq $((4096 * $bcount)) ] ||
3575 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
3577 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3578 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3580 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3581 error "(9.11) write to normal stripe should NOT fail"
3583 echo "foo" >> $name &&
3584 error "(9.12) append write $name should fail"
3586 chown $RUNAS_ID:$RUNAS_GID $name ||
3587 error "(9.13) cannot chown on $name"
3589 touch $name || error "(9.14) cannot touch $name"
3591 rm -f $name || error "(9.15) cannot unlink $name"
3593 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# test_21 body: start LFSCK on MDT0000 without a -t type selection and check
# that both the namespace and the layout component report 'scanning-phase1',
# then stop LFSCK again.
# Skipped when the MDS version is older than 2.5.59.
3596 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
3597 skip "ignore the test if MDS is older than 2.5.59" && return
3599 check_mount_and_prep
# NOTE(review): the 100 files together with "-s 1" presumably keep the scan
# in phase 1 long enough for the status checks below - confirm.
3600 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
3602 echo "Start all LFSCK components by default (-s 1)"
3603 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3604 error "Fail to start LFSCK"
# $SHOW_NAMESPACE and $SHOW_LAYOUT are defined outside this section; the
# second field of their output line starting with "status" is compared.
3606 echo "namespace LFSCK should be in 'scanning-phase1' status"
3607 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3608 [ "$STATUS" == "scanning-phase1" ] ||
3609 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3611 echo "layout LFSCK should be in 'scanning-phase1' status"
3612 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3613 [ "$STATUS" == "scanning-phase1" ] ||
3614 error "Expect layout 'scanning-phase1', but got '$STATUS'"
# lfsck_stop is likewise issued without a type selection.
3616 echo "Stop all LFSCK components by default"
3617 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3618 error "Fail to stop LFSCK"
3620 run_test 21 "run all LFSCK components by default"
# test_22a body: create a directory whose ".." entry is made to reference a
# different parent through a fail_loc, remove that other parent, then run
# namespace LFSCK and verify that exactly one unmatched pair was repaired and
# that the directory can be listed. Skipped with fewer than 2 MDSes.
3623 [ $MDSCOUNT -lt 2 ] &&
3624 skip "We need at least 2 MDSes for this test" && return
# The inner double quotes are escaped so that ".." is printed with its quotes;
# unescaped they ended and restarted the string and were dropped.
3627 echo "The parent_A references the child directory via some name entry,"
3628 echo "but the child directory back references another parent_B via its"
3629 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
3630 echo "LFSCK will repair the child directory's \"..\" name entry."
3633 check_mount_and_prep
# guard and foo are both created on MDT index 1.
3635 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3636 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3638 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3639 echo "The dummy's dotdot name entry references the guard."
3640 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
# foo/dummy is created on MDT index 0 while the fail_loc is armed.
3641 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3642 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3643 error "(3) Fail to mkdir on MDT0"
3644 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing guard leaves dummy's ".." pointing at a parent that is gone.
3646 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3648 echo "Trigger namespace LFSCK to repair unmatched pairs"
3649 $START_NAMESPACE -A -r ||
3650 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 is presumably the step number reported by
# wait_all_targets_blocked on failure (steps here jump from 5 to 7) - confirm.
3652 wait_all_targets_blocked namespace completed 6
3654 local repaired=$($SHOW_NAMESPACE |
3655 awk '/^unmatched_pairs_repaired/ { print $2 }')
3656 [ $repaired -eq 1 ] ||
3657 error "(7) Fail to repair unmatched pairs: $repaired"
3659 echo "'ls' should success after namespace LFSCK repairing"
3660 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3661 error "(8) ls should success."
3663 run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b body: like test_22a, but the other parent (guard) is kept, so the
# child's ".." entry and linkEA reference an existing directory that has no
# matching name entry. fid2path must not resolve the child before namespace
# LFSCK and must resolve it afterwards. Skipped with fewer than 2 MDSes.
3666 [ $MDSCOUNT -lt 2 ] &&
3667 skip "We need at least 2 MDSes for this test" && return
# The inner double quotes are escaped so that ".." is printed with its quotes;
# unescaped they ended and restarted the string and were dropped.
3670 echo "The parent_A references the child directory via the name entry_B,"
3671 echo "but the child directory back references another parent_C via its"
3672 echo "\"..\" name entry. The parent_C exists, but there is no the name"
3673 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
3674 echo "the child directory's \"..\" name entry and its linkEA."
3677 check_mount_and_prep
# guard and foo are both created on MDT index 1.
3679 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3680 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3682 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3683 echo "and bad linkEA. The dummy's dotdot name entry references the"
3684 echo "guard. The dummy's linkEA references n non-exist name entry."
3685 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
# foo/dummy is created on MDT index 0 while the fail_loc is armed.
3686 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3687 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3688 error "(3) Fail to mkdir on MDT0"
3689 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Before the repair, fid2path must not return the real path of dummy.
3691 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3692 echo "fid2path should NOT work on the dummy's FID $dummyfid"
3693 local dummyname=$($LFS fid2path $DIR $dummyfid)
3694 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3695 error "(4) fid2path works unexpectedly."
3697 echo "Trigger namespace LFSCK to repair unmatched pairs"
3698 $START_NAMESPACE -A -r ||
3699 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 is presumably the step number reported by
# wait_all_targets_blocked on failure (steps here jump from 5 to 7) - confirm.
3701 wait_all_targets_blocked namespace completed 6
3703 local repaired=$($SHOW_NAMESPACE |
3704 awk '/^unmatched_pairs_repaired/ { print $2 }')
3705 [ $repaired -eq 1 ] ||
3706 error "(7) Fail to repair unmatched pairs: $repaired"
3708 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
# dummyname is already declared local above; it is only re-assigned here.
3709 dummyname=$($LFS fid2path $DIR $dummyfid)
3710 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3711 error "(8) fid2path does not work"
3713 run_test 22b "LFSCK can repair unmatched pairs (2)"
# test_23a body: leave a dangling name entry (d0/d1 removed while a fail_loc
# is armed on mds2), run namespace LFSCK once without -C and once with -C,
# and check dangling_repaired plus the result of 'ls' after each run.
# Skipped with fewer than 2 MDSes.
3716 [ $MDSCOUNT -lt 2 ] &&
3717 skip "We need at least 2 MDSes for this test" && return
3720 echo "The name entry is there, but the MDT-object for such name "
3721 echo "entry does not exist. The namespace LFSCK should find out "
3722 echo "and repair the inconsistency as required."
3725 check_mount_and_prep
# d0 is created on MDT index 0 and its child d1 on MDT index 1.
3727 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3728 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3730 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3731 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
3732 do_facet mds2 $LCTL set_param fail_loc=0x1620
3733 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3734 do_facet mds2 $LCTL set_param fail_loc=0
3736 echo "'ls' should fail because of dangling name entry"
3737 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
# First run: no -C. The counter must be 1, yet 'ls' is still expected to fail
# (see the echo text further down).
3739 echo "Trigger namespace LFSCK to find out dangling name entry"
3740 $START_NAMESPACE -A -r ||
3741 error "(5) Fail to start LFSCK for namespace"
3743 wait_all_targets_blocked namespace completed 6
3745 local repaired=$($SHOW_NAMESPACE |
3746 awk '/^dangling_repaired/ { print $2 }')
3747 [ $repaired -eq 1 ] ||
3748 error "(7) Fail to repair dangling name entry: $repaired"
3750 echo "'ls' should fail because not re-create MDT-object by default"
3751 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
# Second run: -C is added; afterwards 'ls' must succeed.
3753 echo "Trigger namespace LFSCK again to repair dangling name entry"
3754 $START_NAMESPACE -A -r -C ||
3755 error "(9) Fail to start LFSCK for namespace"
3757 wait_all_targets_blocked namespace completed 10
3759 repaired=$($SHOW_NAMESPACE |
3760 awk '/^dangling_repaired/ { print $2 }')
3761 [ $repaired -eq 1 ] ||
3762 error "(11) Fail to repair dangling name entry: $repaired"
3764 echo "'ls' should success after namespace LFSCK repairing"
3765 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3767 run_test 23a "LFSCK can repair dangling name entry (1)"
# test_23b body: make the hard link d0/foo (to f0) reference f1's object via a
# fail_loc, delete f1 so that foo dangles, then run namespace LFSCK with -C
# and verify one dangling repair, one multiple-linked repair, and that foo
# yields f0's content ("dummy") again.
3771 echo "The objectA has multiple hard links, one of them corresponding"
3772 echo "to the name entry_B. But there is something wrong for the name"
3773 echo "entry_B and cause entry_B to references non-exist object_C."
3774 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3775 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3776 echo "comes to the second-stage scanning, it will find that the"
3777 echo "former re-creating object_C is not proper, and will try to"
3778 echo "replace the object_C with the real object_A."
3781 check_mount_and_prep
3783 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3784 $LFS path2fid $DIR/$tdir/d0
# t0..t9 are filler files; they are unlinked again before f1 is removed (see
# the LU-7429 comment below).
3786 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3788 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3789 $LFS path2fid $DIR/$tdir/d0/f0
3791 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3792 $LFS path2fid $DIR/$tdir/d0/f1
# First ':'-separated field of each FID, compared as the FID sequence.
3794 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3795 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
3797 if [ "$SEQ0" != "$SEQ1" ]; then
3798 # To guarantee that the f0 and f1 are in the same FID seq
3799 rm -f $DIR/$tdir/d0/f0 ||
3800 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3801 echo "dummy" > $DIR/$tdir/d0/f0 ||
3802 error "(3.2) Fail to touch on MDT0"
3803 $LFS path2fid $DIR/$tdir/d0/f0
# Second ':'-separated field of f1's FID, normalised to decimal with
# printf %d before it is passed as fail_val.
3806 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3807 OID=$(printf %d $OID)
3809 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3810 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3811 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3812 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3813 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3815 # If there is creation after the dangling injection, it may re-use
3816 # the just released local object (inode) that is referenced by the
3817 # dangling name entry. It will fail the dangling injection.
3818 # So before deleting the target object for the dangling name entry,
3819 # remove some other objects to avoid the target object being reused
3820 # by some potential creations. LU-7429
3821 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3823 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3825 echo "'ls' should fail because of dangling name entry"
3826 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3827 error "(6) ls should fail."
# LFSCK runs on $MDT_DEV only (no -A) with -C, and is polled until its status
# is 'completed'.
3829 echo "Trigger namespace LFSCK to find out dangling name entry"
3830 $START_NAMESPACE -r -C ||
3831 error "(7) Fail to start LFSCK for namespace"
3833 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3834 mdd.${MDT_DEV}.lfsck_namespace |
3835 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3837 error "(8) unexpected status"
3840 local repaired=$($SHOW_NAMESPACE |
3841 awk '/^dangling_repaired/ { print $2 }')
3842 [ $repaired -eq 1 ] ||
3843 error "(9) Fail to repair dangling name entry: $repaired"
3845 repaired=$($SHOW_NAMESPACE |
3846 awk '/^multiple_linked_repaired/ { print $2 }')
3847 [ $repaired -eq 1 ] ||
3848 error "(10) Fail to drop the former created object: $repaired"
# foo must now return the content that was written to f0.
3850 local data=$(cat $DIR/$tdir/d0/foo)
3851 [ "$data" == "dummy" ] ||
3852 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3854 run_test 23b "LFSCK can repair dangling name entry (2)"
# Body of the test registered below as "run_test 23c": a dangling name entry
# whose re-created object is modified before LFSCK finishes.
# Flow: inject a dangling entry "foo", start a delayed namespace LFSCK, wait
# until foo is reported with size 0, append to it from the client, let LFSCK
# complete, then check that one dangling repair was counted and that foo does
# not read back as f0's "dummy" content.
# NOTE(review): the function header and the closing fi/} lines are not
# visible in this listing.
3858 echo "The objectA has multiple hard links, one of them corresponding"
3859 echo "to the name entry_B. But there is something wrong for the name"
3860 echo "entry_B and cause entry_B to references non-exist object_C."
3861 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3862 echo "as dangling, and re-create the lost object_C. And then others"
3863 echo "modified the re-created object_C. When the LFSCK comes to the"
3864 echo "second-stage scanning, it will find that the former re-creating"
3865 echo "object_C maybe wrong and try to replace the object_C with the"
3866 echo "real object_A. But because object_C has been modified, so the"
3867 echo "LFSCK cannot replace it."
3870 start_full_debug_logging
3872 check_mount_and_prep
3874 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3875 $LFS path2fid $DIR/$tdir/d0
# Filler files; they are unlinked again right before f1 is removed
# (see the LU-7429 note further down).
3877 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3879 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3880 $LFS path2fid $DIR/$tdir/d0/f0
3882 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3883 $LFS path2fid $DIR/$tdir/d0/f1
# First ':'-separated field of each FID; f0 is re-created when they differ.
3885 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3886 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
3888 if [ "$SEQ0" != "$SEQ1" ]; then
3889 # To guarantee that the f0 and f1 are in the same FID seq
3890 rm -f $DIR/$tdir/d0/f0 ||
3891 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3892 echo "dummy" > $DIR/$tdir/d0/f0 ||
3893 error "(3.2) Fail to touch on MDT0"
3894 $LFS path2fid $DIR/$tdir/d0/f0
# Second ':'-separated field of f1's FID, converted to decimal by printf;
# it is handed to the failure stub as fail_val.
3897 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3898 OID=$(printf %d $OID)
3900 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3901 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3902 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3903 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3904 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3906 # If there is creation after the dangling injection, it may re-use
3907 # the just released local object (inode) that is referenced by the
3908 # dangling name entry. It will fail the dangling injection.
3909 # So before deleting the target object for the dangling name entry,
3910 # remove some other objects to avoid the target object being reused
3911 # by some potential creations. LU-7429
3912 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3914 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3916 echo "'ls' should fail because of dangling name entry"
3917 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3918 error "(6) ls should fail."
# Arm the delay stub (fail_val=10) before LFSCK starts; it is cleared again
# after the client write below.
3920 #define OBD_FAIL_LFSCK_DELAY3 0x1602
3921 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3923 echo "Trigger namespace LFSCK to find out dangling name entry"
3924 $START_NAMESPACE -r -C ||
3925 error "(7) Fail to start LFSCK for namespace"
# Expect stat on the client to report Size 0 for foo (timeout argument:
# $LTIME); presumably wait_update_facet polls until the value matches.
3927 wait_update_facet client "stat $DIR/$tdir/d0/foo |
3928 awk '/Size/ { print \\\$2 }'" "0" $LTIME || {
3929 stat $DIR/$tdir/d0/foo
3931 error "(8) unexpected size"
# Modify foo from the client while the delay stub is still armed.
3934 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
3935 cancel_lru_locks osc
3937 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3938 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3939 mdd.${MDT_DEV}.lfsck_namespace |
3940 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3942 error "(10) unexpected status"
3945 stop_full_debug_logging
3947 local repaired=$($SHOW_NAMESPACE |
3948 awk '/^dangling_repaired/ { print $2 }')
3949 [ $repaired -eq 1 ] ||
3950 error "(11) Fail to repair dangling name entry: $repaired"
# foo must NOT read back as f0's content.
3952 local data=$(cat $DIR/$tdir/d0/foo)
3953 [ "$data" != "dummy" ] ||
3954 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
3956 run_test 23c "LFSCK can repair dangling name entry (3)"
# Body of the test registered below as "run_test 24": two MDT-objects hold
# the same linkEA entry while the name entry references only one of them.
# Skipped when MDSCOUNT is below 2. After the namespace LFSCK it expects
# multiple_referenced_repaired == 1 and an entry named "$cfid-$pfid-D-0"
# under .lustre/lost+found/MDT0000/.
# NOTE(review): the function header and closing lines are not visible in
# this listing.
3959 [ $MDSCOUNT -lt 2 ] &&
3960 skip "We need at least 2 MDSes for this test" && return
3963 echo "Two MDT-objects back reference the same name entry via their"
3964 echo "each own linkEA entry, but the name entry only references one"
3965 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
3966 echo "for the MDT-object that is not recognized. If such MDT-object"
3967 echo "has no other linkEA entry after the removing, then the LFSCK"
3968 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
3971 check_mount_and_prep
3973 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3975 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
3976 $LFS path2fid $DIR/$tdir/d0/guard
3978 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
3979 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid (used in the expected orphan name below) is read from guard's or
# dummy's FID, selected by the MDT backend fstype.
# NOTE(review): the else/fi lines of this conditional, and any declaration
# of pfid, are not visible in this listing.
3982 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
3983 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
3985 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
3988 touch $DIR/$tdir/d0/guard/foo ||
3989 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
3991 echo "Inject failure stub on MDT0 to simulate the case that"
3992 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
3993 echo "that references $DIR/$tdir/d0/guard/foo."
3994 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
3995 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
3996 echo "there with the same linkEA entry as another MDT-object"
3997 echo "$DIR/$tdir/d0/guard/foo has"
# While the stub is armed: create dummy/foo, record its FID as cfid, then
# rmdir it again.
3999 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
4000 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
4001 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
4002 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
4003 $LFS path2fid $DIR/$tdir/d0/dummy/foo
4004 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
4005 rmdir $DIR/$tdir/d0/dummy/foo ||
4006 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
4007 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4009 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4010 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4011 error "(6) stat successfully unexpectedly"
4013 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4014 $START_NAMESPACE -A -r ||
4015 error "(7) Fail to start LFSCK for namespace"
# The trailing 8 is presumably the error label used on failure (the labels
# here jump from 7 to 9) - confirm against wait_all_targets_blocked.
4017 wait_all_targets_blocked namespace completed 8
4019 local repaired=$($SHOW_NAMESPACE |
4020 awk '/^multiple_referenced_repaired/ { print $2 }')
4021 [ $repaired -eq 1 ] ||
4022 error "(9) Fail to repair multiple referenced name entry: $repaired"
4024 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4025 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4026 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Expected orphan name is built from the child FID and the parent FID.
4028 local cname="$cfid-$pfid-D-0"
4029 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4030 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4032 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Body of the test registered below as "run_test 25": the file type stored in
# a name entry is wrong. Skipped unless the MDT backend fstype is ldiskfs.
# Expects bad_file_type_repaired == 1 after LFSCK and that 'ls' on d0 works.
# NOTE(review): the function header and closing lines are not visible in
# this listing.
4035 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4036 skip "Only support to inject failure on ldiskfs" && return
4039 echo "The file type in the name entry does not match the file type"
4040 echo "claimed by the referenced object. Then the LFSCK will update"
4041 echo "the file type in the name entry."
4044 check_mount_and_prep
4046 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4048 echo "Inject failure stub on MDT0 to simulate the case that"
4049 echo "the file type stored in the name entry is wrong."
# foo is created while the stub is armed; the stub is cleared right after.
4051 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
4052 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4053 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4054 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4056 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4057 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Expect the lfsck_namespace status line on the MDS to read "completed".
4059 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4060 mdd.${MDT_DEV}.lfsck_namespace |
4061 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4063 error "(4) unexpected status"
4066 local repaired=$($SHOW_NAMESPACE |
4067 awk '/^bad_file_type_repaired/ { print $2 }')
4068 [ $repaired -eq 1 ] ||
4069 error "(5) Fail to repair bad file type in name entry: $repaired"
4071 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4073 run_test 25 "LFSCK can repair bad file type in the name entry"
# Body of the test registered below as "run_test 26a": a local name entry
# that the MDT-object's linkEA still refers to is lost.
# Flow: create foo plus a hard link "dummy", unlink foo while the
# NO_NAMEENTRY stub is armed, run the namespace LFSCK, then expect
# lost_dirent_repaired == 1, foo to be listable again, and its FID unchanged.
# The progress message before the LFSCK start says "local" name entry, in
# line with the banner and the run_test title of this test.
# NOTE(review): the function header and closing lines are not visible in
# this listing.
4077 echo "The local name entry back referenced by the MDT-object is lost."
4078 echo "The namespace LFSCK will add the missing local name entry back"
4079 echo "to the normal namespace."
4082 check_mount_and_prep
4084 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4085 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# FID before the damage; compared with the FID after the repair.
4086 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4088 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4089 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4091 echo "Inject failure stub on MDT0 to simulate the case that"
4092 echo "foo's name entry will be removed, but the foo's object"
4093 echo "and its linkEA are kept in the system."
4095 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4096 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4097 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4098 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4100 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4101 error "(5) 'ls' should fail"
4103 echo "Trigger namespace LFSCK to repair the missing local name entry"
4104 $START_NAMESPACE -r -A ||
4105 error "(6) Fail to start LFSCK for namespace"
4107 wait_all_targets_blocked namespace completed 7
4109 local repaired=$($SHOW_NAMESPACE |
4110 awk '/^lost_dirent_repaired/ { print $2 }')
4111 [ $repaired -eq 1 ] ||
4112 error "(8) Fail to repair lost dirent: $repaired"
4114 ls -ail $DIR/$tdir/d0/foo ||
4115 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
4117 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4118 [ "$foofid" == "$foofid2" ] ||
4119 error "(10) foo's FID changed: $foofid, $foofid2"
4121 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Body of the test registered below as "run_test 26b": a remote name entry
# that the MDT-object's linkEA still refers to is lost.
# Skipped when MDSCOUNT is below 2. d0 is created with -i 1 and d0/foo with
# -i 0; foo is removed while the NO_NAMEENTRY stub is armed. After LFSCK it
# expects lost_dirent_repaired == 1, foo listable again, and the same FID.
# NOTE(review): the function header and closing lines are not visible in
# this listing.
4124 [ $MDSCOUNT -lt 2 ] &&
4125 skip "We need at least 2 MDSes for this test" && return
4128 echo "The remote name entry back referenced by the MDT-object is lost."
4129 echo "The namespace LFSCK will add the missing remote name entry back"
4130 echo "to the normal namespace."
4133 check_mount_and_prep
4135 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4136 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# FID before the damage; compared with the FID after the repair.
4137 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4139 echo "Inject failure stub on MDT0 to simulate the case that"
4140 echo "foo's name entry will be removed, but the foo's object"
4141 echo "and its linkEA are kept in the system."
4143 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4144 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4145 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4146 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4148 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4149 error "(4) 'ls' should fail"
4151 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4152 $START_NAMESPACE -r -A ||
4153 error "(5) Fail to start LFSCK for namespace"
4155 wait_all_targets_blocked namespace completed 6
4157 local repaired=$($SHOW_NAMESPACE |
4158 awk '/^lost_dirent_repaired/ { print $2 }')
4159 [ $repaired -eq 1 ] ||
4160 error "(7) Fail to repair lost dirent: $repaired"
4162 ls -ail $DIR/$tdir/d0/foo ||
4163 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
4165 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4166 [ "$foofid" == "$foofid2" ] ||
4167 error "(9) foo's FID changed: $foofid, $foofid2"
4169 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Body of the test registered below as "run_test 27a": the local parent
# directory referenced by an MDT-object's linkEA is lost.
# Flow: create d0/foo and a hard link, unlink both while the NO_NAMEENTRY
# stub is armed, remove d0, run the namespace LFSCK, then expect
# lost_dirent_repaired == 1 and an entry matching "*-P-*" under
# .lustre/lost+found/MDT0000/.
# NOTE(review): the function header and closing lines are not visible in
# this listing.
4173 echo "The local parent referenced by the MDT-object linkEA is lost."
4174 echo "The namespace LFSCK will re-create the lost parent as orphan."
4177 check_mount_and_prep
4179 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4180 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4181 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4182 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4184 echo "Inject failure stub on MDT0 to simulate the case that"
4185 echo "foo's name entry will be removed, but the foo's object"
4186 echo "and its linkEA are kept in the system. And then remove"
4187 echo "another hard link and the parent directory."
4189 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4190 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4191 rm -f $DIR/$tdir/d0/foo ||
4192 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4193 rm -f $DIR/$tdir/d0/dummy ||
4194 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4195 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4197 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
4198 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
4200 echo "Trigger namespace LFSCK to repair the lost parent"
4201 $START_NAMESPACE -r -A ||
4202 error "(6) Fail to start LFSCK for namespace"
4204 wait_all_targets_blocked namespace completed 7
4206 local repaired=$($SHOW_NAMESPACE |
4207 awk '/^lost_dirent_repaired/ { print $2 }')
4208 [ $repaired -eq 1 ] ||
4209 error "(8) Fail to repair lost dirent: $repaired"
4211 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4212 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4213 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4215 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell could expand it against the current working directory first.
# cname is declared local so it does not leak into the caller's scope.
4217 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-P-*")
4218 [ ! -z "$cname" ] ||
4219 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4221 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Body of the test registered below as "run_test 27b": the remote parent
# directory referenced by an MDT-object's linkEA is lost.
# Skipped when MDSCOUNT is below 2. d0 is created with -i 1 and d0/foo with
# -i 0; foo is removed while the NO_NAMEENTRY stub is armed, then d0 is
# removed. After LFSCK it expects lost_dirent_repaired == 1 and an entry
# matching "*-P-*" under .lustre/lost+found/MDT0001/.
# NOTE(review): the function header and closing lines are not visible in
# this listing.
4224 [ $MDSCOUNT -lt 2 ] &&
4225 skip "We need at least 2 MDSes for this test" && return
4228 echo "The remote parent referenced by the MDT-object linkEA is lost."
4229 echo "The namespace LFSCK will re-create the lost parent as orphan."
4232 check_mount_and_prep
4234 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4235 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
4237 $LFS path2fid $DIR/$tdir/d0
4239 echo "Inject failure stub on MDT0 to simulate the case that"
4240 echo "foo's name entry will be removed, but the foo's object"
4241 echo "and its linkEA are kept in the system. And then remove"
4242 echo "the parent directory."
4244 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4245 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4246 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4247 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4249 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4250 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
4252 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4253 $START_NAMESPACE -r -A ||
4254 error "(6) Fail to start LFSCK for namespace"
4256 wait_all_targets_blocked namespace completed 7
4258 local repaired=$($SHOW_NAMESPACE |
4259 awk '/^lost_dirent_repaired/ { print $2 }')
4260 [ $repaired -eq 1 ] ||
4261 error "(8) Fail to repair lost dirent: $repaired"
4263 ls -ail $MOUNT/.lustre/lost+found/
4265 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4266 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4267 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4269 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell could expand it against the current working directory first.
# cname is declared local so it does not leak into the caller's scope.
4271 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-P-*")
4272 [ ! -z "$cname" ] ||
4273 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4275 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Body of the test registered below as "run_test 28": orphan handling is
# skipped for an MDT that failed during the first LFSCK stage.
# Skipped when MDSCOUNT is below 2.
# Round 1: with the BAD_NETWORK stub set on mds2, expects status "partial" on
# mds1 and "completed" on mds2, with lost_dirent_repaired 0 on mds1 and 1 on
# mds2.
# Round 2 (stub cleared): expects lost_dirent_repaired 1 on mds1 and 0 on mds2.
# NOTE(review): the function header and closing lines are not visible in
# this listing; the banner's last sentence also appears to continue on a line
# that is not visible here.
4278 [ $MDSCOUNT -lt 2 ] &&
4279 skip "The test needs at least 2 MDTs" && return
4282 echo "The target name entry is lost. The LFSCK should insert the"
4283 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4284 echo "the MDT (on which the orphan MDT-object resides) has ever"
4285 echo "failed to respond some name entry verification during the"
4286 echo "first stage-scanning, then the LFSCK should skip to handle"
4287 echo "orphan MDT-object on this MDT. But other MDTs should not"
4291 check_mount_and_prep
# Cross-MDT layout: d1 uses -i 0 with children on -i 1, d2 uses -i 1 with
# children on -i 0. mkdir failures are not checked here.
4292 $LFS mkdir -i 0 $DIR/$tdir/d1
4293 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4294 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
4296 $LFS mkdir -i 1 $DIR/$tdir/d2
4297 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4298 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4300 echo "Inject failure stub on MDT0 to simulate the case that"
4301 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4302 echo "and its linkEA are kept in the system. And the case that"
4303 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4304 echo "and its linkEA are kept in the system."
# The stub is armed on both mds1 and mds2 for the two rmdir calls.
4306 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4307 do_facet mds1 $LCTL set_param fail_loc=0x1624
4308 do_facet mds2 $LCTL set_param fail_loc=0x1624
4309 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4310 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4311 do_facet mds1 $LCTL set_param fail_loc=0
4312 do_facet mds2 $LCTL set_param fail_loc=0
4314 cancel_lru_locks mdc
4315 cancel_lru_locks osc
4317 echo "Inject failure, to simulate the MDT0 fail to handle"
4318 echo "MDT1 LFSCK request during the first-stage scanning."
4319 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
4320 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4322 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4323 $START_NAMESPACE -r -A ||
4324 error "(3) Fail to start LFSCK for namespace"
4326 wait_update_facet mds1 "$LCTL get_param -n \
4327 mdd.$(facet_svc mds1).lfsck_namespace |
4328 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4329 error "(4) mds1 is not the expected 'partial'"
4332 wait_update_facet mds2 "$LCTL get_param -n \
4333 mdd.$(facet_svc mds2).lfsck_namespace |
4334 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4335 error "(5) mds2 is not the expected 'completed'"
4338 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Round 1 counters, read per MDS.
4340 local repaired=$(do_facet mds1 $LCTL get_param -n \
4341 mdd.$(facet_svc mds1).lfsck_namespace |
4342 awk '/^lost_dirent_repaired/ { print $2 }')
4343 [ $repaired -eq 0 ] ||
4344 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4346 repaired=$(do_facet mds2 $LCTL get_param -n \
4347 mdd.$(facet_svc mds2).lfsck_namespace |
4348 awk '/^lost_dirent_repaired/ { print $2 }')
4349 [ $repaired -eq 1 ] ||
4350 error "(7) Expect 1 fixed on mds2, but got: $repaired"
4352 echo "Trigger namespace LFSCK on all devices again to cleanup"
4353 $START_NAMESPACE -r -A ||
4354 error "(8) Fail to start LFSCK for namespace"
4356 wait_all_targets_blocked namespace completed 9
# Round 2 counters; "repaired" is re-declared with local here.
4358 local repaired=$(do_facet mds1 $LCTL get_param -n \
4359 mdd.$(facet_svc mds1).lfsck_namespace |
4360 awk '/^lost_dirent_repaired/ { print $2 }')
4361 [ $repaired -eq 1 ] ||
4362 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4364 repaired=$(do_facet mds2 $LCTL get_param -n \
4365 mdd.$(facet_svc mds2).lfsck_namespace |
4366 awk '/^lost_dirent_repaired/ { print $2 }')
4367 [ $repaired -eq 0 ] ||
4368 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4370 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Body of a disabled test: the "run_test 29a" line at the end of this block
# is commented out, so this code is not registered.
# It covers an nlink attribute larger than the known name entries: a hard
# link is created while the MORE_NLINK stub is armed, the test checks that
# stat reports 3 links, runs LFSCK, and expects nlinks_repaired == 1 and a
# link count of 2 afterwards.
# NOTE(review): the function header and closing lines are not visible in
# this listing.
4374 echo "The object's nlink attribute is larger than the object's known"
4375 echo "name entries count. The LFSCK will repair the object's nlink"
4376 echo "attribute to match the known name entries count"
4379 check_mount_and_prep
4381 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4382 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4384 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4385 echo "nlink attribute is larger than its name entries count."
4387 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
4388 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4389 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4390 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4391 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Confirm the injection took effect: stat must report 3 links.
4393 cancel_lru_locks mdc
4394 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4395 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4397 echo "Trigger namespace LFSCK to repair the nlink count"
4398 $START_NAMESPACE -r -A ||
4399 error "(5) Fail to start LFSCK for namespace"
4401 wait_all_targets_blocked namespace completed 6
4403 local repaired=$($SHOW_NAMESPACE |
4404 awk '/^nlinks_repaired/ { print $2 }')
4405 [ $repaired -eq 1 ] ||
4406 error "(7) Fail to repair nlink count: $repaired"
4408 cancel_lru_locks mdc
4409 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4410 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4412 # Disable 29a, we only allow nlink to be updated if the known linkEA
4413 # entries is larger than nlink count.
4415 #run_test 29a "LFSCK can repair bad nlink count (1)"
# Body of the test registered below as "run_test 29b": the nlink attribute is
# smaller than the known name entries.
# A hard link is created while the LESS_NLINK stub is armed; the test checks
# that stat reports 1 link, runs LFSCK, and expects nlinks_repaired == 1 and
# a link count of 2 afterwards.
# NOTE(review): the function header and closing lines are not visible in
# this listing.
4419 echo "The object's nlink attribute is smaller than the object's known"
4420 echo "name entries count. The LFSCK will repair the object's nlink"
4421 echo "attribute to match the known name entries count"
4424 check_mount_and_prep
4426 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4427 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4429 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4430 echo "nlink attribute is smaller than its name entries count."
4432 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
4433 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4434 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4435 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4436 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Confirm the injection took effect: stat must report 1 link.
4438 cancel_lru_locks mdc
4439 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4440 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4442 echo "Trigger namespace LFSCK to repair the nlink count"
4443 $START_NAMESPACE -r -A ||
4444 error "(5) Fail to start LFSCK for namespace"
4446 wait_all_targets_blocked namespace completed 6
4448 local repaired=$($SHOW_NAMESPACE |
4449 awk '/^nlinks_repaired/ { print $2 }')
4450 [ $repaired -eq 1 ] ||
4451 error "(7) Fail to repair nlink count: $repaired"
4453 cancel_lru_locks mdc
4454 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4455 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4457 run_test 29b "LFSCK can repair bad nlink count (2)"
# Body of the test registered below as "run_test 29c": linkEA overflow.
# Flow: create 150 hard links to guard/f0, check (when MDSCOUNT >= 2) that
# migrating guard leaves f0's FID unchanged, remove 100 links and check again,
# run the namespace LFSCK, expect linkea_overflow_cleared == 1 and
# nlinks_repaired == 0, and finally (when MDSCOUNT >= 2) expect a migration
# to change f0's FID.
# NOTE(review): the function header and the closing fi lines of the three
# MDSCOUNT conditionals are not visible in this listing.
4462 echo "The namespace LFSCK will create many hard links to the target"
4463 echo "file as to exceed the linkEA size limitation. Under such case"
4464 echo "the linkEA will be marked as overflow that will prevent the"
4465 echo "target file to be migrated. Then remove some hard links to"
4466 echo "make the left hard links to be held within the linkEA size"
4467 echo "limitation. But before the namespace LFSCK adding all the"
4468 echo "missed linkEA entries back, the overflow mark (timestamp)"
4469 echo "will not be cleared."
4472 check_mount_and_prep
4474 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
4475 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4476 error "(0.2) Fail to mkdir"
4477 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# Reference FID; every later migration check compares against it.
4478 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4480 # define MAX_LINKEA_SIZE 4096
4481 # sizeof(link_ea_header) = 24
4482 # sizeof(link_ea_entry) = 18
4483 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4484 # (sizeof(link_ea_entry) + name_length))
4485 # If the average name length is 12 bytes, then 150 hard links
4486 # is totally enough to overflow the linkEA
4487 echo "Create 150 hard links should succeed although the linkEA overflow"
4488 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4489 error "(2) Fail to hard link"
4491 cancel_lru_locks mdc
# The migrate command itself must succeed; the "should fail" expectation is
# checked through f0 keeping its FID.
4492 if [ $MDSCOUNT -ge 2 ]; then
4493 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4494 error "(3.1) Migrate failure"
4496 echo "The object with linkEA overflow should NOT be migrated"
4497 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4498 [ "$newfid" == "$oldfid" ] ||
4499 error "(3.2) Migrate should fail: $newfid != $oldfid"
4502 # Remove 100 hard links, then the linkEA should have space
4503 # to hold the missed linkEA entries.
4504 echo "Remove 100 hard links to save space for the missed linkEA entries"
4505 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
4507 if [ $MDSCOUNT -ge 2 ]; then
4508 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4509 error "(5.1) Migrate failure"
4511 # The overflow timestamp is still there, so migration will fail.
4512 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4513 [ "$newfid" == "$oldfid" ] ||
4514 error "(5.2) Migrate should fail: $newfid != $oldfid"
4517 # sleep 3 seconds to guarantee that the overflow is recognized
# NOTE(review): no sleep command is visible after the comment above in this
# listing.
4520 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4521 $START_NAMESPACE -r -A ||
4522 error "(6) Fail to start LFSCK for namespace"
4524 wait_all_targets_blocked namespace completed 7
4526 local repaired=$($SHOW_NAMESPACE |
4527 awk '/^linkea_overflow_cleared/ { print $2 }')
4528 [ $repaired -eq 1 ] ||
4529 error "(8) Fail to clear linkea overflow: $repaired"
4531 repaired=$($SHOW_NAMESPACE |
4532 awk '/^nlinks_repaired/ { print $2 }')
4533 [ $repaired -eq 0 ] ||
4534 error "(9) Unexpected nlink repaired: $repaired"
4536 if [ $MDSCOUNT -ge 2 ]; then
4537 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4538 error "(10.1) Migrate failure"
4540 # Migration should succeed after clear the overflow timestamp.
4541 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4542 [ "$newfid" != "$oldfid" ] ||
4543 error "(10.2) Migrate should succeed"
4545 ls -l $DIR/$tdir/foo > /dev/null ||
4546 error "(11) 'ls' failed after migration"
# Cleanup of the files created by this test.
4549 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4550 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4552 run_test 29c "verify linkEA size limitation"
# Body of the test registered below as "run_test 30": orphans placed in the
# backend /lost+found by e2fsck are moved back by the namespace LFSCK.
# Skipped unless the MDT backend fstype is ldiskfs.
# Flow: create foo/f0 and foo/d0 (d0 created while the NO_LINKEA stub is
# armed) with children f1 and d1; remove all four while the NO_NAMEENTRY stub
# is armed; unmount the client, stop the MDS, run e2fsck -y, restart the MDS;
# run LFSCK and expect local_lost_found_moved >= 4; remount and expect f0
# back in place and d0 (with d1 and f1) under .lustre/lost+found/MDT0000/.
# NOTE(review): the function header and closing lines are not visible in
# this listing.
4555 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4556 skip "Only support backend /lost+found for ldiskfs" && return
4559 echo "The namespace LFSCK will move the orphans from backend"
4560 echo "/lost+found directory to normal client visible namespace"
4561 echo "or to global visible .lustre/lost+found/MDTxxxx/ directory"
4564 check_mount_and_prep
4566 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
4567 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
4569 echo "Inject failure stub on MDT0 to simulate the case that"
4570 echo "directory d0 has no linkEA entry, then the LFSCK will"
4571 echo "move it into .lustre/lost+found/MDTxxxx/ later."
4573 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4574 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4575 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4576 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4578 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4579 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4581 echo "Inject failure stub on MDT0 to simulate the case that the"
4582 echo "object's name entry will be removed, but not destroy the"
4583 echo "object. Then backend e2fsck will handle it as orphan and"
4584 echo "add them into the backend /lost+found directory."
4586 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4587 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4588 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4589 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4590 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4591 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4592 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Take the client and the MDS down so that e2fsck can run on the MDT device.
4594 umount_client $MOUNT || error "(10) Fail to stop client!"
4596 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
4599 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
4600 error "(12) Fail to run e2fsck"
4602 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
4603 error "(13) Fail to start MDT0"
4605 echo "Trigger namespace LFSCK to recover backend orphans"
4606 $START_NAMESPACE -r -A ||
4607 error "(14) Fail to start LFSCK for namespace"
4609 wait_all_targets_blocked namespace completed 15
4611 local repaired=$($SHOW_NAMESPACE |
4612 awk '/^local_lost_found_moved/ { print $2 }')
4613 [ $repaired -ge 4 ] ||
4614 error "(16) Fail to recover backend orphans: $repaired"
4616 mount_client $MOUNT || error "(17) Fail to start client!"
4618 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
4620 ls -ail $MOUNT/.lustre/lost+found/
4622 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4623 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4624 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4626 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell could expand it against the current working directory first.
# cname is declared local so it does not leak into the caller's scope.
4628 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-*-D-*")
4629 [ ! -z "$cname" ] || error "(20) d0 is not recovered"
4631 stat ${cname}/d1 || error "(21) d0 is not recovered"
4632 stat ${cname}/f1 || error "(22) f1 is not recovered"
4634 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# Body of the test registered below as "run_test 31a": name entries under a
# striped directory whose name hash does not match their shard.
# Skipped when MDSCOUNT is below 2. $MDSCOUNT sub-directories are created
# while the client-side BAD_NAME_HASH stub is armed with fail_val=0. After
# LFSCK it expects name_hash_repaired >= 1, remounts the client, and checks
# that every d$i can be stat'ed and removed, followed by the striped
# directory itself.
# NOTE(review): the function header, the loop's closing "done" and the other
# closing lines are not visible in this listing.
4637 [ $MDSCOUNT -lt 2 ] &&
4638 skip "The test needs at least 2 MDTs" && return
4641 echo "For the name entry under a striped directory, if the name"
4642 echo "hash does not match the shard, then the LFSCK will repair"
4643 echo "the bad name entry"
4646 check_mount_and_prep
4648 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4649 error "(1) Fail to create striped directory"
4651 echo "Inject failure stub on client to simulate the case that"
4652 echo "some name entry should be inserted into other non-first"
4653 echo "shard, but inserted into the first shard by wrong"
# Set with a plain $LCTL call (no do_facet), unlike the MDS-side stubs.
4655 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4656 $LCTL set_param fail_loc=0x1628 fail_val=0
4657 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4658 error "(2) Fail to create file under striped directory"
4659 $LCTL set_param fail_loc=0 fail_val=0
4661 echo "Trigger namespace LFSCK to repair bad name hash"
4662 $START_NAMESPACE -r -A ||
4663 error "(3) Fail to start LFSCK for namespace"
4665 wait_all_targets_blocked namespace completed 4
4667 local repaired=$($SHOW_NAMESPACE |
4668 awk '/^name_hash_repaired/ { print $2 }')
4669 [ $repaired -ge 1 ] ||
4670 error "(5) Fail to repair bad name hash: $repaired"
4672 umount_client $MOUNT || error "(6) umount failed"
4673 mount_client $MOUNT || error "(7) mount failed"
# Keep the loop index out of the caller's scope.
local i
4675 for ((i = 0; i < $MDSCOUNT; i++)); do
4676 stat $DIR/$tdir/striped_dir/d$i ||
4677 error "(8) Fail to stat d$i after LFSCK"
4678 rmdir $DIR/$tdir/striped_dir/d$i ||
4679 error "(9) Fail to unlink d$i after LFSCK"
4682 rmdir $DIR/$tdir/striped_dir ||
4683 error "(10) Fail to remove the striped directory after LFSCK"
4685 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# Body of the test registered below as "run_test 31b": same scenario as the
# bad-name-hash test above, but the client-side stub is armed with fail_val=1
# and the name_hash_repaired counter is read from mds2.
# Skipped when MDSCOUNT is below 2. After LFSCK it expects
# name_hash_repaired >= 1 on mds2, remounts the client, and checks that every
# d$i can be stat'ed and removed, followed by the striped directory itself.
# NOTE(review): the function header, the loop's closing "done" and the other
# closing lines are not visible in this listing.
4688 [ $MDSCOUNT -lt 2 ] &&
4689 skip "The test needs at least 2 MDTs" && return
4692 echo "For the name entry under a striped directory, if the name"
4693 echo "hash does not match the shard, then the LFSCK will repair"
4694 echo "the bad name entry"
4697 check_mount_and_prep
4699 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4700 error "(1) Fail to create striped directory"
4702 echo "Inject failure stub on client to simulate the case that"
4703 echo "some name entry should be inserted into other non-second"
4704 echo "shard, but inserted into the second shard by wrong"
# Set with a plain $LCTL call (no do_facet), unlike the MDS-side stubs.
4706 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4707 $LCTL set_param fail_loc=0x1628 fail_val=1
4708 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4709 error "(2) Fail to create file under striped directory"
4710 $LCTL set_param fail_loc=0 fail_val=0
4712 echo "Trigger namespace LFSCK to repair bad name hash"
4713 $START_NAMESPACE -r -A ||
4714 error "(3) Fail to start LFSCK for namespace"
4716 wait_all_targets_blocked namespace completed 4
# The counter is read from mds2 here rather than via $SHOW_NAMESPACE.
4718 local repaired=$(do_facet mds2 $LCTL get_param -n \
4719 mdd.$(facet_svc mds2).lfsck_namespace |
4720 awk '/^name_hash_repaired/ { print $2 }')
4721 [ $repaired -ge 1 ] ||
4722 error "(5) Fail to repair bad name hash: $repaired"
4724 umount_client $MOUNT || error "(6) umount failed"
4725 mount_client $MOUNT || error "(7) mount failed"
# Keep the loop index out of the caller's scope.
local i
4727 for ((i = 0; i < $MDSCOUNT; i++)); do
4728 stat $DIR/$tdir/striped_dir/d$i ||
4729 error "(8) Fail to stat d$i after LFSCK"
4730 rmdir $DIR/$tdir/striped_dir/d$i ||
4731 error "(9) Fail to unlink d$i after LFSCK"
4734 rmdir $DIR/$tdir/striped_dir ||
4735 error "(10) Fail to remove the striped directory after LFSCK"
4737 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Test 31c (presumably the body of test_31c; the function header and closing
# brace are not visible in this listing).
# Scenario: the master MDT-object of a striped directory loses its master LMV
# EA and nothing is created under it afterwards; namespace LFSCK is expected to
# re-generate the EA (striped_dirs_repaired == 1).
# Skipped when fewer than 2 MDTs are configured.
4740 [ $MDSCOUNT -lt 2 ] &&
4741 skip "The test needs at least 2 MDTs" && return
4744 echo "For some reason, the master MDT-object of the striped directory"
4745 echo "may lost its master LMV EA. If nobody created files under the"
4746 echo "master directly after the master LMV EA lost, then the LFSCK"
4747 echo "should re-generate the master LMV EA."
# Helper defined elsewhere; presumably ensures the client is mounted and
# prepares $DIR/$tdir -- TODO confirm.
4750 check_mount_and_prep
4752 echo "Inject failure stub on MDT0 to simulate the case that the"
4753 echo "master MDT-object of the striped directory lost the LMV EA."
4755 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
# The stub is armed on $SINGLEMDS only for the duration of the directory
# creation and disarmed right after it.
4756 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4757 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4758 error "(1) Fail to create striped directory"
4759 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4761 echo "Trigger namespace LFSCK to re-generate master LMV EA"
# -r: reset the scan position, -A: run on all targets (lctl lfsck_start).
4762 $START_NAMESPACE -r -A ||
4763 error "(2) Fail to start LFSCK for namespace"
# Helper defined elsewhere; presumably waits for "completed" on all targets,
# with 3 as the step id for its failure message -- TODO confirm.
4765 wait_all_targets_blocked namespace completed 3
# $SHOW_NAMESPACE is defined outside this chunk; presumably it dumps the
# namespace LFSCK status of $SINGLEMDS -- TODO confirm.
4767 local repaired=$($SHOW_NAMESPACE |
4768 awk '/^striped_dirs_repaired/ { print $2 }')
# Exactly one striped directory must have been repaired.
4769 [ $repaired -eq 1 ] ||
4770 error "(4) Fail to re-generate master LMV EA: $repaired"
# Remount the client before inspecting the directory.
4772 umount_client $MOUNT || error "(5) umount failed"
4773 mount_client $MOUNT || error "(6) mount failed"
# Nothing was created under the directory, so its listing must be empty; any
# output from ls is reported as a failed repair.
4775 local empty=$(ls $DIR/$tdir/striped_dir/)
4776 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
4778 rmdir $DIR/$tdir/striped_dir ||
4779 error "(8) Fail to remove the striped directory after LFSCK"
# Hand test 31c and its description to run_test (framework helper).
4781 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Test 31d (presumably the body of test_31d; the function header and closing
# brace are not visible in this listing).
# Scenario: the master LMV EA is lost and a file is then created under the
# broken directory. Namespace LFSCK must NOT re-generate the EA
# (striped_dirs_repaired == 0); the directory is expected to reject new files
# afterwards.
# Skipped when fewer than 2 MDTs are configured.
4784 [ $MDSCOUNT -lt 2 ] &&
4785 skip "The test needs at least 2 MDTs" && return
# NOTE(review): "dirctory" in the messages below is a typo for "directory"; it
# is runtime output and is left unchanged here.
4788 echo "For some reason, the master MDT-object of the striped directory"
4789 echo "may lost its master LMV EA. If somebody created files under the"
4790 echo "master directly after the master LMV EA lost, then the LFSCK"
4791 echo "should NOT re-generate the master LMV EA, instead, it should"
4792 echo "change the broken striped dirctory as read-only to prevent"
4793 echo "further damage"
# Helper defined elsewhere; presumably ensures the client is mounted and
# prepares $DIR/$tdir -- TODO confirm.
4796 check_mount_and_prep
4798 echo "Inject failure stub on MDT0 to simulate the case that the"
4799 echo "master MDT-object of the striped directory lost the LMV EA."
4801 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
# The stub is armed on $SINGLEMDS only while the directory is created.
4802 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4803 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4804 error "(1) Fail to create striped directory"
4805 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# Remount the client, then modify the broken directory BEFORE LFSCK runs; this
# modification is what distinguishes this test from the re-generate case.
4807 umount_client $MOUNT || error "(2) umount failed"
4808 mount_client $MOUNT || error "(3) mount failed"
4810 touch $DIR/$tdir/striped_dir/dummy ||
4811 error "(4) Fail to touch under broken striped directory"
4813 echo "Trigger namespace LFSCK to find out the inconsistency"
# -r: reset the scan position, -A: run on all targets (lctl lfsck_start).
4814 $START_NAMESPACE -r -A ||
4815 error "(5) Fail to start LFSCK for namespace"
# Helper defined elsewhere; presumably waits for "completed" on all targets,
# with 6 as the step id for its failure message -- TODO confirm.
4817 wait_all_targets_blocked namespace completed 6
# $SHOW_NAMESPACE is defined outside this chunk; presumably it dumps the
# namespace LFSCK status of $SINGLEMDS -- TODO confirm.
4819 local repaired=$($SHOW_NAMESPACE |
4820 awk '/^striped_dirs_repaired/ { print $2 }')
# No striped directory may be reported as repaired in this scenario.
4821 [ $repaired -eq 0 ] ||
4822 error "(7) Re-generate master LMV EA unexpected: $repaired"
# The file created before LFSCK must still be visible.
4824 stat $DIR/$tdir/striped_dir/dummy ||
4825 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# Inverted check: creating a new file is expected to FAIL, so success of touch
# is the error path.
4827 touch $DIR/$tdir/striped_dir/foo &&
4828 error "(9) The broken striped directory should be read-only"
# Clear the immutable attribute so that the directory can be removed.
4830 chattr -i $DIR/$tdir/striped_dir ||
4831 error "(10) Fail to chattr on the broken striped directory"
# NOTE(review): no visible line removes "dummy" before this rmdir; confirm that
# rmdir is expected to succeed on the broken directory as-is.
4833 rmdir $DIR/$tdir/striped_dir ||
4834 error "(11) Fail to remove the striped directory after LFSCK"
# Hand test 31d and its description to run_test (framework helper).
4836 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Test 31e (presumably the body of test_31e; the function header and closing
# brace are not visible in this listing).
# Scenario: the slave MDT-object that lives on the same MDT as the master
# loses its slave LMV EA; namespace LFSCK is expected to re-generate it
# (striped_shards_repaired == 1).
# Skipped when fewer than 2 MDTs are configured.
4839 [ $MDSCOUNT -lt 2 ] &&
4840 skip "The test needs at least 2 MDTs" && return
4843 echo "For some reason, the slave MDT-object of the striped directory"
4844 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4845 echo "slave LMV EA."
# Helper defined elsewhere; presumably ensures the client is mounted and
# prepares $DIR/$tdir -- TODO confirm.
4848 check_mount_and_prep
4850 echo "Inject failure stub on MDT0 to simulate the case that the"
4851 echo "slave MDT-object (that resides on the same MDT as the master"
4852 echo "MDT-object resides on) lost the LMV EA."
4854 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
# fail_val=0 presumably selects the shard with index 0, i.e. the one on the
# master's MDT per the echo above -- TODO confirm against the stub.
4855 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
4856 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4857 error "(1) Fail to create striped directory"
# Disarm the failure stub before running LFSCK.
4858 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4860 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
# -r: reset the scan position, -A: run on all targets (lctl lfsck_start).
4861 $START_NAMESPACE -r -A ||
4862 error "(2) Fail to start LFSCK for namespace"
# Helper defined elsewhere; presumably waits for "completed" on all targets,
# with 3 as the step id for its failure message -- TODO confirm.
4864 wait_all_targets_blocked namespace completed 3
# $SHOW_NAMESPACE is defined outside this chunk; presumably it dumps the
# namespace LFSCK status of $SINGLEMDS -- TODO confirm.
4866 local repaired=$($SHOW_NAMESPACE |
4867 awk '/^striped_shards_repaired/ { print $2 }')
# Exactly one shard must have been repaired.
4868 [ $repaired -eq 1 ] ||
4869 error "(4) Fail to re-generate slave LMV EA: $repaired"
4871 rmdir $DIR/$tdir/striped_dir ||
4872 error "(5) Fail to remove the striped directory after LFSCK"
# Hand test 31e and its description to run_test (framework helper).
4874 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Test 31f (presumably the body of test_31f; the function header and closing
# brace are not visible in this listing).
# Scenario: a slave MDT-object on a different MDT than the master loses its
# slave LMV EA; namespace LFSCK is expected to re-generate it, and the repair
# counter is read from mds2 (striped_shards_repaired == 1).
# Skipped when fewer than 2 MDTs are configured.
4877 [ $MDSCOUNT -lt 2 ] &&
4878 skip "The test needs at least 2 MDTs" && return
4881 echo "For some reason, the slave MDT-object of the striped directory"
4882 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4883 echo "slave LMV EA."
# Helper defined elsewhere; presumably ensures the client is mounted and
# prepares $DIR/$tdir -- TODO confirm.
4886 check_mount_and_prep
4888 echo "Inject failure stub on MDT0 to simulate the case that the"
4889 echo "slave MDT-object (that resides on different MDT as the master"
4890 echo "MDT-object resides on) lost the LMV EA."
4892 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
# Same stub as the same-MDT variant of this test, but with fail_val=1;
# presumably this selects the shard with index 1 -- TODO confirm.
4893 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
4894 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4895 error "(1) Fail to create striped directory"
# Disarm the failure stub before running LFSCK.
4896 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4898 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
# -r: reset the scan position, -A: run on all targets (lctl lfsck_start).
4899 $START_NAMESPACE -r -A ||
4900 error "(2) Fail to start LFSCK for namespace"
# Helper defined elsewhere; presumably waits for "completed" on all targets,
# with 3 as the step id for its failure message -- TODO confirm.
4902 wait_all_targets_blocked namespace completed 3
# Read the repair counter from mds2 rather than the MDS that started LFSCK.
4904 local repaired=$(do_facet mds2 $LCTL get_param -n \
4905 mdd.$(facet_svc mds2).lfsck_namespace |
4906 awk '/^striped_shards_repaired/ { print $2 }')
# Exactly one shard must have been repaired.
4907 [ $repaired -eq 1 ] ||
4908 error "(4) Fail to re-generate slave LMV EA: $repaired"
4910 rmdir $DIR/$tdir/striped_dir ||
4911 error "(5) Fail to remove the striped directory after LFSCK"
# Hand test 31f and its description to run_test (framework helper).
4913 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Test 31g (presumably the body of test_31g; the function header and closing
# brace are not visible in this listing).
# Scenario: the stripe index stored in a slave LMV EA is corrupted; namespace
# LFSCK is expected to repair it (striped_shards_repaired == 1) and the
# directory must be usable afterwards.
# Skipped when fewer than 2 MDTs are configured.
4916 [ $MDSCOUNT -lt 2 ] &&
4917 skip "The test needs at least 2 MDTs" && return
4920 echo "For some reason, the stripe index in the slave LMV EA is"
4921 echo "corrupted. The LFSCK should repair the slave LMV EA."
# Helper defined elsewhere; presumably ensures the client is mounted and
# prepares $DIR/$tdir -- TODO confirm.
4924 check_mount_and_prep
4926 echo "Inject failure stub on MDT0 to simulate the case that the"
4927 echo "slave LMV EA on the first shard of the striped directory"
4928 echo "claims the same index as the second shard claims"
4930 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
# The stub is armed on $SINGLEMDS only while the directory is created.
# fail_val=0 presumably selects the first shard, per the echo above.
4931 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
4932 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4933 error "(1) Fail to create striped directory"
4934 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4936 echo "Trigger namespace LFSCK to repair the slave LMV EA"
# -r: reset the scan position, -A: run on all targets (lctl lfsck_start).
4937 $START_NAMESPACE -r -A ||
4938 error "(2) Fail to start LFSCK for namespace"
# Helper defined elsewhere; presumably waits for "completed" on all targets,
# with 3 as the step id for its failure message -- TODO confirm.
4940 wait_all_targets_blocked namespace completed 3
# $SHOW_NAMESPACE is defined outside this chunk; presumably it dumps the
# namespace LFSCK status of $SINGLEMDS -- TODO confirm.
4942 local repaired=$($SHOW_NAMESPACE |
4943 awk '/^striped_shards_repaired/ { print $2 }')
# Exactly one shard must have been repaired.
4944 [ $repaired -eq 1 ] ||
4945 error "(4) Fail to repair slave LMV EA: $repaired"
# Remount the client, then prove the directory works: create and remove a
# file, and finally remove the directory itself.
4947 umount_client $MOUNT || error "(5) umount failed"
4948 mount_client $MOUNT || error "(6) mount failed"
4950 touch $DIR/$tdir/striped_dir/foo ||
4951 error "(7) Fail to touch file after the LFSCK"
4953 rm -f $DIR/$tdir/striped_dir/foo ||
4954 error "(8) Fail to unlink file after the LFSCK"
4956 rmdir $DIR/$tdir/striped_dir ||
4957 error "(9) Fail to remove the striped directory after LFSCK"
# Hand test 31g and its description to run_test (framework helper).
4959 run_test 31g "Repair the corrupted slave LMV EA"
# Test 31h (presumably the body of test_31h; the function header and closing
# brace are not visible in this listing).
# Scenario: a shard's name entry inside the striped directory is corrupted;
# namespace LFSCK is expected to repair it (dirent_repaired == 1) and the
# directory must be usable afterwards.
# Skipped when fewer than 2 MDTs are configured.
4962 [ $MDSCOUNT -lt 2 ] &&
4963 skip "The test needs at least 2 MDTs" && return
4966 echo "For some reason, the shard's name entry in the striped"
4967 echo "directory may be corrupted. The LFSCK should repair the"
4968 echo "bad shard's name entry."
# Helper defined elsewhere; presumably ensures the client is mounted and
# prepares $DIR/$tdir -- TODO confirm.
4971 check_mount_and_prep
4973 echo "Inject failure stub on MDT0 to simulate the case that the"
4974 echo "first shard's name entry in the striped directory claims"
4975 echo "the same index as the second shard's name entry claims."
4977 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
# The stub is armed on $SINGLEMDS only while the directory is created.
# fail_val=0 presumably selects the first shard, per the echo above.
4978 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
4979 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4980 error "(1) Fail to create striped directory"
4981 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4983 echo "Trigger namespace LFSCK to repair the shard's name entry"
# -r: reset the scan position, -A: run on all targets (lctl lfsck_start).
4984 $START_NAMESPACE -r -A ||
4985 error "(2) Fail to start LFSCK for namespace"
# Helper defined elsewhere; presumably waits for "completed" on all targets,
# with 3 as the step id for its failure message -- TODO confirm.
4987 wait_all_targets_blocked namespace completed 3
# $SHOW_NAMESPACE is defined outside this chunk; presumably it dumps the
# namespace LFSCK status of $SINGLEMDS -- TODO confirm.
4989 local repaired=$($SHOW_NAMESPACE |
4990 awk '/^dirent_repaired/ { print $2 }')
# Exactly one directory entry must have been repaired.
4991 [ $repaired -eq 1 ] ||
4992 error "(4) Fail to repair shard's name entry: $repaired"
# Remount the client, then prove the directory works: create and remove a
# file, and finally remove the directory itself.
4994 umount_client $MOUNT || error "(5) umount failed"
4995 mount_client $MOUNT || error "(6) mount failed"
4997 touch $DIR/$tdir/striped_dir/foo ||
4998 error "(7) Fail to touch file after the LFSCK"
5000 rm -f $DIR/$tdir/striped_dir/foo ||
5001 error "(8) Fail to unlink file after the LFSCK"
5003 rmdir $DIR/$tdir/striped_dir ||
5004 error "(9) Fail to remove the striped directory after LFSCK"
# Hand test 31h and its description to run_test (framework helper).
5006 run_test 31h "Repair the corrupted shard's name entry"
# Test 32 (presumably the body of test_32). Checks that layout LFSCK can still
# be stopped after an OST has been shut down while the scan is in progress.
# NOTE(review): the opening lines of this test are not visible in this listing
# (the first visible error id is (2)), so any preparation step is unknown here.
5011 umount_client $MOUNT
5013 #define OBD_FAIL_LFSCK_ASSISTANT_DIRECT 0x162d
# Arm the stub on $SINGLEMDS before starting the scan. The meaning of
# fail_val=3 is defined by the stub and is not visible here.
5014 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
# -r: reset the scan position (lctl lfsck_start).
5015 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
# The scan must still be in its first phase at this point; $SHOW_LAYOUT is
# defined outside this chunk.
5017 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5018 [ "$STATUS" == "scanning-phase1" ] ||
5019 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# Take ost1 down while LFSCK is running; the output of stop is discarded.
5022 stop ost1 > /dev/null || error "(4) Fail to stop OST1!"
# Disarm the failure stub.
5024 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The actual check: stopping LFSCK must succeed although ost1 is down.
5028 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
# Hand test 32 and its description to run_test (framework helper).
5030 run_test 32 "stop LFSCK when some OST failed"
# Test 33 (presumably the body of test_33). Checks that the options given to
# lfsck_start are reported back on the "param" line of the LFSCK status output.
# NOTE(review): the opening lines of this test are not visible in this listing,
# so any preparation step is unknown here.
# Layout LFSCK: dry-run, orphan handling (-o), reset (-r).
5036 $START_LAYOUT --dryrun -o -r ||
5037 error "(1) Fail to start layout LFSCK"
# Helper defined elsewhere; presumably waits for "completed" on all targets,
# with 2 as the step id for its failure message -- TODO confirm.
5038 wait_all_targets_blocked layout completed 2
# "all_targets" is expected although -A was not passed; the test thus expects
# -o to imply it.
5040 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5041 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5042 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# Namespace LFSCK: "-e abort" is expected to show up as "failout" and -A as
# "all_targets" in the reported parameters.
5044 $START_NAMESPACE -e abort -A -r ||
5045 error "(4) Fail to start namespace LFSCK"
5046 wait_all_targets_blocked namespace completed 5
5048 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5049 [ "$PARAMS" == "failout,all_targets" ] ||
5050 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
# Hand test 33 and its description to run_test (framework helper).
# NOTE(review): "paramters" in the description is a typo for "parameters"; it
# is a runtime string and is left unchanged here.
5052 run_test 33 "check LFSCK paramters"
# Undo the overrides made at the top of this script: MDSSIZE, OSTSIZE and
# OSTCOUNT were saved into the SAVED_* variables before the script adjusted
# them for faster formatting (OSTCOUNT is capped at 4 there). Besides the two
# sizes, the OST count is restored as well.
5054 # restore MDS/OST size
5055 MDSSIZE=${SAVED_MDSSIZE}
5056 OSTSIZE=${SAVED_OSTSIZE}
5057 OSTCOUNT=${SAVED_OSTCOUNT}
5059 # finally, clean up the system