3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 #Bug number for excepting test 6380
# Tests that are always excluded: whatever the caller supplied in
# SANITY_LFSCK_EXCEPT, plus tests 4 and 5.
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT 4 5 "
# When SLOW is "no", EXCEPT_SLOW is set (to an empty list here).
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# LUSTRE defaults to the parent of the directory containing this script.
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
# Pull in the shared test framework, then the configuration file; CONFIG is
# assigned the default path under $LUSTRE/tests/cfg when unset or empty.
18 . $LUSTRE/tests/test-framework.sh
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Exit successfully (nothing to run) when require_dsh_mds fails.
23 require_dsh_mds || exit 0
# Remember the caller's sizing values before they are overridden below.
# NOTE(review): presumably restored at the end of the script - not visible here.
27 SAVED_MDSSIZE=${MDSSIZE}
28 SAVED_OSTSIZE=${OSTSIZE}
29 SAVED_OSTCOUNT=${OSTCOUNT}
30 # Use small MDS and OST sizes to speed up formatting.
31 # Do not make MDSSIZE/OSTSIZE too small, since they affect the default journal size.
34 # Many OSTs are not needed; cap the count to reduce the format/start/stop overhead.
35 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
37 # build up a clean test environment.
# The version checks below compare the server's version code against a
# release threshold. The first one reports a skip and cleans up when the MDS
# is older than 2.3.60; the remaining ones append tests to ALWAYS_EXCEPT.
41 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
42 skip "Need MDS version at least 2.3.60" && check_and_cleanup_lustre &&
45 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
46 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
# OST older than 2.5.55: exclude tests 11 through 21.
48 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
49 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
# MDS older than 2.6.50: exclude tests 2d, 2e, 3 and 22 through 31.
51 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
52 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
54 # DNE does not support striped directory on zfs-based backend yet.
# Any MDS backend other than ldiskfs therefore excludes test 31.
55 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
56 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Names of the first MDT and first OST target of the filesystem under test.
60 MDT_DEV="${FSNAME}-MDT0000"
61 OST_DEV="${FSNAME}-OST0000"
# Strip "mds" from $SINGLEMDS and pass the remainder to mdsdevname to get the
# MDT device used when (re)starting the MDS in the tests.
62 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Command prefixes, expanded unquoted by the tests so that extra flags
# (e.g. -r, -A, -s <speed>) can be appended. Each one runs lctl on the
# relevant facet through do_facet.
# START_*: start LFSCK of the given type on the MDT (or on ost1 for the OST
# variant); STOP_LFSCK: stop LFSCK on the MDT.
63 START_NAMESPACE="do_facet $SINGLEMDS \
64 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
65 START_LAYOUT="do_facet $SINGLEMDS \
66 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
67 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
68 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# SHOW_*: print the LFSCK parameter (lfsck_namespace / lfsck_layout) that the
# tests parse with awk for fields such as "status" and "flags".
69 SHOW_NAMESPACE="do_facet $SINGLEMDS \
70 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
71 SHOW_LAYOUT="do_facet $SINGLEMDS \
72 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
73 SHOW_LAYOUT_ON_OST="do_facet ost1 \
74 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount options passed to "start" for the MDS; the NOSCRUB variant adds
# "noscrub" and is used by the tests that start the MDS with OI scrub disabled.
75 MOUNT_OPTS_SCRUB="-o user_xattr"
76 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
85 echo "preparing... $nfiles * $ndirs files will be created $(date)."
86 if [ ! -z $igif ]; then
87 #define OBD_FAIL_FID_IGIF 0x1504
88 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
91 cp $LUSTRE/tests/*.sh $DIR/$tdir/
92 if [ $ndirs -gt 0 ]; then
93 createmany -d $DIR/$tdir/d $ndirs
94 createmany -m $DIR/$tdir/f $ndirs
95 if [ $nfiles -gt 0 ]; then
96 for ((i = 0; i < $ndirs; i++)); do
97 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
98 /dev/null || error "createmany $nfiles"
101 createmany -d $DIR/$tdir/e $ndirs
104 if [ ! -z $igif ]; then
105 touch $DIR/$tdir/dummy
106 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
109 echo "prepared $(date)."
115 #define OBD_FAIL_LFSCK_DELAY1 0x1600
116 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
117 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
119 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
121 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
122 [ "$STATUS" == "scanning-phase1" ] ||
123 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
125 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
127 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
128 [ "$STATUS" == "stopped" ] ||
129 error "(6) Expect 'stopped', but got '$STATUS'"
131 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
133 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
134 [ "$STATUS" == "scanning-phase1" ] ||
135 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
137 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
138 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
139 mdd.${MDT_DEV}.lfsck_namespace |
140 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
142 error "(9) unexpected status"
145 local repaired=$($SHOW_NAMESPACE |
146 awk '/^updated_phase1/ { print $2 }')
147 [ $repaired -eq 0 ] ||
148 error "(10) Expect nothing to be repaired, but got: $repaired"
150 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
151 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
152 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
153 mdd.${MDT_DEV}.lfsck_namespace |
154 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
156 error "(12) unexpected status"
159 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
160 [ $((scanned1 + 1)) -eq $scanned2 ] ||
161 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
163 echo "stopall, should NOT crash LU-3649"
164 stopall || error "(14) Fail to stopall"
166 run_test 0 "Control LFSCK manually"
169 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
170 skip "OI Scrub not implemented for ZFS" && return
174 #define OBD_FAIL_FID_INDIR 0x1501
175 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
176 touch $DIR/$tdir/dummy
178 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
180 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
181 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
182 mdd.${MDT_DEV}.lfsck_namespace |
183 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
185 error "(4) unexpected status"
188 local repaired=$($SHOW_NAMESPACE |
189 awk '/^dirent_repaired/ { print $2 }')
190 # for interop with old server
191 [ -z "$repaired" ] &&
192 repaired=$($SHOW_NAMESPACE |
193 awk '/^updated_phase1/ { print $2 }')
195 [ $repaired -eq 1 ] ||
196 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
198 mount_client $MOUNT || error "(6) Fail to start client!"
200 #define OBD_FAIL_FID_LOOKUP 0x1505
201 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
202 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
204 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
206 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
210 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
211 skip "OI Scrub not implemented for ZFS" && return
215 #define OBD_FAIL_FID_INLMA 0x1502
216 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
217 touch $DIR/$tdir/dummy
219 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
221 #define OBD_FAIL_FID_NOLMA 0x1506
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
223 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
224 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
225 mdd.${MDT_DEV}.lfsck_namespace |
226 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
228 error "(4) unexpected status"
231 local repaired=$($SHOW_NAMESPACE |
232 awk '/^dirent_repaired/ { print $2 }')
233 # for interop with old server
234 [ -z "$repaired" ] &&
235 repaired=$($SHOW_NAMESPACE |
236 awk '/^updated_phase1/ { print $2 }')
238 [ $repaired -eq 1 ] ||
239 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
241 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
242 mount_client $MOUNT || error "(6) Fail to start client!"
244 #define OBD_FAIL_FID_LOOKUP 0x1505
245 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
246 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
248 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
250 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
255 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
256 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
257 touch $DIR/$tdir/dummy
259 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
261 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
262 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
263 mdd.${MDT_DEV}.lfsck_namespace |
264 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
266 error "(4) unexpected status"
269 local repaired=$($SHOW_NAMESPACE |
270 awk '/^linkea_repaired/ { print $2 }')
271 # for interop with old server
272 [ -z "$repaired" ] &&
273 repaired=$($SHOW_NAMESPACE |
274 awk '/^updated_phase2/ { print $2 }')
276 [ $repaired -eq 1 ] ||
277 error "(5) Fail to repair crashed linkEA: $repaired"
279 mount_client $MOUNT || error "(6) Fail to start client!"
281 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
282 error "(7) Fail to stat $DIR/$tdir/dummy"
284 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
285 local dummyname=$($LFS fid2path $DIR $dummyfid)
286 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
287 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
289 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
295 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
297 touch $DIR/$tdir/dummy
299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
301 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
302 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
303 mdd.${MDT_DEV}.lfsck_namespace |
304 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
306 error "(4) unexpected status"
309 local repaired=$($SHOW_NAMESPACE |
310 awk '/^updated_phase2/ { print $2 }')
311 [ $repaired -eq 1 ] ||
312 error "(5) Fail to repair crashed linkEA: $repaired"
314 mount_client $MOUNT || error "(6) Fail to start client!"
316 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
317 error "(7) Fail to stat $DIR/$tdir/dummy"
319 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
320 local dummyname=$($LFS fid2path $DIR $dummyfid)
321 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
322 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
324 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
330 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
331 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
332 touch $DIR/$tdir/dummy
334 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
336 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
337 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
338 mdd.${MDT_DEV}.lfsck_namespace |
339 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
341 error "(4) unexpected status"
344 local repaired=$($SHOW_NAMESPACE |
345 awk '/^updated_phase2/ { print $2 }')
346 [ $repaired -eq 1 ] ||
347 error "(5) Fail to repair crashed linkEA: $repaired"
349 mount_client $MOUNT || error "(6) Fail to start client!"
351 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
352 error "(7) Fail to stat $DIR/$tdir/dummy"
354 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
355 local dummyname=$($LFS fid2path $DIR $dummyfid)
356 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
357 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
359 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
365 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
366 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
367 touch $DIR/$tdir/dummy
369 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
371 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
372 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
373 mdd.${MDT_DEV}.lfsck_namespace |
374 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
376 error "(4) unexpected status"
379 local repaired=$($SHOW_NAMESPACE |
380 awk '/^linkea_repaired/ { print $2 }')
381 [ $repaired -eq 1 ] ||
382 error "(5) Fail to repair crashed linkEA: $repaired"
384 mount_client $MOUNT || error "(6) Fail to start client!"
386 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
387 error "(7) Fail to stat $DIR/$tdir/dummy"
389 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
390 local dummyname=$($LFS fid2path $DIR $dummyfid)
391 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
392 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
394 run_test 2d "LFSCK can recover the missing linkEA entry"
398 [ $MDSCOUNT -lt 2 ] &&
399 skip "We need at least 2 MDSes for this test" && return
403 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
405 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
406 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
407 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
408 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
410 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
411 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
412 mdd.${MDT_DEV}.lfsck_namespace |
413 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
415 error "(4) unexpected status"
418 local repaired=$($SHOW_NAMESPACE |
419 awk '/^linkea_repaired/ { print $2 }')
420 [ $repaired -eq 1 ] ||
421 error "(5) Fail to repair crashed linkEA: $repaired"
423 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
424 local name=$($LFS fid2path $DIR $fid)
425 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
426 error "(6) Fail to repair linkEA: $fid $name"
428 run_test 2e "namespace LFSCK can verify remote object linkEA"
434 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
435 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
436 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
438 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
439 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
440 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
442 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
443 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
444 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
446 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
447 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
448 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
450 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
452 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
453 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
454 mdd.${MDT_DEV}.lfsck_namespace |
455 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
457 error "(10) unexpected status"
460 local checked=$($SHOW_NAMESPACE |
461 awk '/^checked_phase2/ { print $2 }')
462 [ $checked -ge 4 ] ||
463 error "(11) Fail to check multiple-linked object: $checked"
465 local repaired=$($SHOW_NAMESPACE |
466 awk '/^multiple_linked_repaired/ { print $2 }')
467 [ $repaired -ge 2 ] ||
468 error "(12) Fail to repair multiple-linked object: $repaired"
470 run_test 3 "LFSCK can verify multiple-linked objects"
474 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
475 skip "OI Scrub not implemented for ZFS" && return
478 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
479 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
481 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
482 echo "start $SINGLEMDS with disabling OI scrub"
483 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
484 error "(2) Fail to start MDS!"
486 #define OBD_FAIL_LFSCK_DELAY2 0x1601
487 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
488 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
489 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
490 mdd.${MDT_DEV}.lfsck_namespace |
491 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
493 error "(5) unexpected status"
496 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
497 [ "$STATUS" == "scanning-phase1" ] ||
498 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
500 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
501 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
502 mdd.${MDT_DEV}.lfsck_namespace |
503 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
505 error "(7) unexpected status"
508 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
509 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
511 local repaired=$($SHOW_NAMESPACE |
512 awk '/^dirent_repaired/ { print $2 }')
513 # for interop with old server
514 [ -z "$repaired" ] &&
515 repaired=$($SHOW_NAMESPACE |
516 awk '/^updated_phase1/ { print $2 }')
518 [ $repaired -ge 9 ] ||
519 error "(9) Fail to re-generate FID-in-dirent: $repaired"
521 mount_client $MOUNT || error "(10) Fail to start client!"
523 #define OBD_FAIL_FID_LOOKUP 0x1505
524 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
525 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
526 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
528 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
532 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
533 skip "OI Scrub not implemented for ZFS" && return
536 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
537 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
539 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
540 echo "start $SINGLEMDS with disabling OI scrub"
541 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
542 error "(2) Fail to start MDS!"
544 #define OBD_FAIL_LFSCK_DELAY2 0x1601
545 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
546 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
547 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
548 mdd.${MDT_DEV}.lfsck_namespace |
549 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
551 error "(5) unexpected status"
554 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
555 [ "$STATUS" == "scanning-phase1" ] ||
556 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
558 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
559 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
560 mdd.${MDT_DEV}.lfsck_namespace |
561 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
563 error "(7) unexpected status"
566 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
567 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
569 local repaired=$($SHOW_NAMESPACE |
570 awk '/^dirent_repaired/ { print $2 }')
571 # for interop with old server
572 [ -z "$repaired" ] &&
573 repaired=$($SHOW_NAMESPACE |
574 awk '/^updated_phase1/ { print $2 }')
576 [ $repaired -ge 2 ] ||
577 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
579 mount_client $MOUNT || error "(10) Fail to start client!"
581 #define OBD_FAIL_FID_LOOKUP 0x1505
582 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
583 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
585 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
587 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
588 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
589 local dummyname=$($LFS fid2path $DIR $dummyfid)
590 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
591 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
593 run_test 5 "LFSCK can handle IGIF object upgrading"
598 #define OBD_FAIL_LFSCK_DELAY1 0x1600
599 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
600 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
602 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
603 [ "$STATUS" == "scanning-phase1" ] ||
604 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
606 # Sleep 3 sec to guarantee that LFSCK has processed at least one object
608 # Fail the LFSCK to guarantee there is at least one checkpoint
609 #define OBD_FAIL_LFSCK_FATAL1 0x1608
610 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
611 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
612 mdd.${MDT_DEV}.lfsck_namespace |
613 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
615 error "(4) unexpected status"
618 local POS0=$($SHOW_NAMESPACE |
619 awk '/^last_checkpoint_position/ { print $2 }' |
622 #define OBD_FAIL_LFSCK_DELAY1 0x1600
623 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
624 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
626 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
627 [ "$STATUS" == "scanning-phase1" ] ||
628 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
630 local POS1=$($SHOW_NAMESPACE |
631 awk '/^latest_start_position/ { print $2 }' |
633 [[ $POS0 -lt $POS1 ]] ||
634 error "(7) Expect larger than: $POS0, but got $POS1"
636 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
637 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
638 mdd.${MDT_DEV}.lfsck_namespace |
639 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
641 error "(8) unexpected status"
644 run_test 6a "LFSCK resumes from last checkpoint (1)"
649 #define OBD_FAIL_LFSCK_DELAY2 0x1601
650 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
651 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
653 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
654 [ "$STATUS" == "scanning-phase1" ] ||
655 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
657 # Sleep 5 sec to guarantee that LFSCK has reached the directory scanning
659 # Fail the LFSCK to guarantee there is at least one checkpoint
660 #define OBD_FAIL_LFSCK_FATAL2 0x1609
661 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
662 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
663 mdd.${MDT_DEV}.lfsck_namespace |
664 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
666 error "(4) unexpected status"
669 local O_POS0=$($SHOW_NAMESPACE |
670 awk '/^last_checkpoint_position/ { print $2 }' |
673 local D_POS0=$($SHOW_NAMESPACE |
674 awk '/^last_checkpoint_position/ { print $4 }')
676 #define OBD_FAIL_LFSCK_DELAY2 0x1601
677 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
678 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
680 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
681 [ "$STATUS" == "scanning-phase1" ] ||
682 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
684 local O_POS1=$($SHOW_NAMESPACE |
685 awk '/^latest_start_position/ { print $2 }' |
687 local D_POS1=$($SHOW_NAMESPACE |
688 awk '/^latest_start_position/ { print $4 }')
690 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
691 [[ $O_POS0 -lt $O_POS1 ]] ||
692 error "(7.1) $O_POS1 is not larger than $O_POS0"
694 [[ $D_POS0 -lt $D_POS1 ]] ||
695 error "(7.2) $D_POS1 is not larger than $D_POS0"
698 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
699 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
700 mdd.${MDT_DEV}.lfsck_namespace |
701 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
703 error "(8) unexpected status"
706 run_test 6b "LFSCK resumes from last checkpoint (2)"
713 #define OBD_FAIL_LFSCK_DELAY2 0x1601
714 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
715 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
717 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
718 [ "$STATUS" == "scanning-phase1" ] ||
719 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
721 # Sleep 3 sec to guarantee that LFSCK has processed at least one object
723 echo "stop $SINGLEMDS"
724 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
726 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
727 echo "start $SINGLEMDS"
728 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
729 error "(5) Fail to start MDS!"
731 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
732 mdd.${MDT_DEV}.lfsck_namespace |
733 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
735 error "(6) unexpected status"
738 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
744 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
745 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
746 for ((i = 0; i < 20; i++)); do
747 touch $DIR/$tdir/dummy${i}
750 #define OBD_FAIL_LFSCK_DELAY3 0x1602
751 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
752 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
753 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
754 mdd.${MDT_DEV}.lfsck_namespace |
755 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
757 error "(4) unexpected status"
761 echo "stop $SINGLEMDS"
762 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
764 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
765 echo "start $SINGLEMDS"
766 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
767 error "(6) Fail to start MDS!"
769 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
770 mdd.${MDT_DEV}.lfsck_namespace |
771 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
773 error "(7) unexpected status"
776 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
781 formatall > /dev/null
787 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
788 [ "$STATUS" == "init" ] ||
789 error "(2) Expect 'init', but got '$STATUS'"
791 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
792 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
793 mkdir $DIR/$tdir/crashed
795 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
796 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
797 for ((i = 0; i < 5; i++)); do
798 touch $DIR/$tdir/dummy${i}
801 umount_client $MOUNT || error "(3) Fail to stop client!"
803 #define OBD_FAIL_LFSCK_DELAY2 0x1601
804 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
805 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
807 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
808 [ "$STATUS" == "scanning-phase1" ] ||
809 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
811 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
813 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
814 [ "$STATUS" == "stopped" ] ||
815 error "(7) Expect 'stopped', but got '$STATUS'"
817 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
819 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
820 [ "$STATUS" == "scanning-phase1" ] ||
821 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
823 #define OBD_FAIL_LFSCK_FATAL2 0x1609
824 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
825 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
826 mdd.${MDT_DEV}.lfsck_namespace |
827 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
829 error "(10) unexpected status"
832 #define OBD_FAIL_LFSCK_DELAY1 0x1600
833 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
834 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
836 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
837 [ "$STATUS" == "scanning-phase1" ] ||
838 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
840 #define OBD_FAIL_LFSCK_CRASH 0x160a
841 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
844 echo "stop $SINGLEMDS"
845 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
847 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
848 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
850 echo "start $SINGLEMDS"
851 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
852 error "(14) Fail to start MDS!"
854 local timeout=$(max_recovery_time)
857 while [ $timer -lt $timeout ]; do
858 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
859 mdt.${MDT_DEV}.recovery_status |
860 awk '/^status/ { print \\\$2 }'")
861 [ "$STATUS" != "RECOVERING" ] && break;
866 [ $timer != $timeout ] ||
867 error "(14.1) recovery timeout"
869 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
870 [ "$STATUS" == "crashed" ] ||
871 error "(15) Expect 'crashed', but got '$STATUS'"
873 #define OBD_FAIL_LFSCK_DELAY2 0x1601
874 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
875 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
877 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
878 [ "$STATUS" == "scanning-phase1" ] ||
879 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
881 echo "stop $SINGLEMDS"
882 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
884 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
885 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
887 echo "start $SINGLEMDS"
888 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
889 error "(19) Fail to start MDS!"
892 while [ $timer -lt $timeout ]; do
893 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
894 mdt.${MDT_DEV}.recovery_status |
895 awk '/^status/ { print \\\$2 }'")
896 [ "$STATUS" != "RECOVERING" ] && break;
901 [ $timer != $timeout ] ||
902 error "(19.1) recovery timeout"
904 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
905 [ "$STATUS" == "paused" ] ||
906 error "(20) Expect 'paused', but got '$STATUS'"
908 #define OBD_FAIL_LFSCK_DELAY3 0x1602
909 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
911 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
912 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
913 mdd.${MDT_DEV}.lfsck_namespace |
914 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
916 error "(22) unexpected status"
919 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
920 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
921 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
923 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
924 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
925 mdd.${MDT_DEV}.lfsck_namespace |
926 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
928 error "(24) unexpected status"
931 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
932 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
934 run_test 8 "LFSCK state machine"
937 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
938 skip "Testing on UP system, the speed may be inaccurate."
943 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
944 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
945 createmany -o $DIR/$tdir/lfsck/f 5000
947 local BASE_SPEED1=100
949 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
952 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
953 [ "$STATUS" == "scanning-phase1" ] ||
954 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
956 local SPEED=$($SHOW_LAYOUT |
957 awk '/^average_speed_phase1/ { print $2 }')
959 # The measured run time may be inaccurate, normally by less than 2 seconds.
960 # We allow a further 20% margin for scheduling error.
962 # MAX_MARGIN = 1.2 = 12 / 10
963 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
964 RUN_TIME1 * 12 / 10))
965 [ $SPEED -lt $MAX_SPEED ] ||
966 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
969 local BASE_SPEED2=300
971 do_facet $SINGLEMDS \
972 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
975 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
976 # MIN_MARGIN = 0.8 = 8 / 10
977 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
978 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
979 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
980 [ $SPEED -gt $MIN_SPEED ] || {
981 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
982 error_ignore LU-5624 \
983 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
986 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
990 # MAX_MARGIN = 1.2 = 12 / 10
991 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
992 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
993 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
994 [ $SPEED -lt $MAX_SPEED ] ||
995 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
997 do_facet $SINGLEMDS \
998 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1000 wait_update_facet $SINGLEMDS \
1001 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1002 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1003 error "(7) Failed to get expected 'completed'"
1005 run_test 9a "LFSCK speed control (1)"
# Test 9b body: speed control for the second scanning phase of namespace LFSCK.
# The test is skipped when /proc/cpuinfo has no line matching "processor.*: 1",
# because the measured speed may be inaccurate on a uniprocessor system.
1008 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1009 skip "Testing on UP system, the speed may be inaccurate."
# Populate 50 directories and 50 files, then 50 files in each directory,
# while fail_loc 0x1604 (LINKEA_MORE) is set on the MDS.
1015 echo "Preparing another 50 * 50 files (with error) at $(date)."
1016 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1017 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1018 createmany -d $DIR/$tdir/d 50
1019 createmany -m $DIR/$tdir/f 50
1020 for ((i = 0; i < 50; i++)); do
1021 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# First run: with fail_loc 0x160c (NO_DOUBLESCAN) set, start namespace LFSCK
# with -r and wait for the reported status to become "stopped".
1024 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1025 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1026 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1027 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1028 mdd.${MDT_DEV}.lfsck_namespace |
1029 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1031 error "(5) unexpected status"
1034 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1035 echo "Prepared at $(date)."
# Second run: restart namespace LFSCK limited to BASE_SPEED1 (-s) and expect
# it to be in "scanning-phase2".
1037 local BASE_SPEED1=50
1039 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1042 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1043 [ "$STATUS" == "scanning-phase2" ] ||
1044 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
# Read the reported phase-2 average speed and compare it with an upper bound.
1046 local SPEED=$($SHOW_NAMESPACE |
1047 awk '/^average_speed_phase2/ { print $2 }')
1048 # There may be time error, normally it should be less than 2 seconds.
1049 # We allow another 20% schedule error.
1051 # MAX_MARGIN = 1.2 = 12 / 10
# NOTE(review): confirm RUN_TIME1 and TIME_DIFF are assigned before this
# arithmetic expression is evaluated.
1052 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1053 RUN_TIME1 * 12 / 10))
1054 [ $SPEED -lt $MAX_SPEED ] ||
1055 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1057 # adjust speed limit
1058 local BASE_SPEED2=150
1060 do_facet $SINGLEMDS \
1061 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# After the limit change the expected average is the time-weighted mix of
# both limits; the lower bound applies an 80% margin, the upper bound 120%.
1064 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1065 # MIN_MARGIN = 0.8 = 8 / 10
1066 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1067 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1068 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# A too-slow result on a non-ldiskfs MDT is reported through error_ignore
# with ticket LU-5624.
1069 [ $SPEED -gt $MIN_SPEED ] || {
1070 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1071 error_ignore LU-5624 \
1072 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1075 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1079 # MAX_MARGIN = 1.2 = 12 / 10
1080 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1081 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1082 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1083 [ $SPEED -lt $MAX_SPEED ] ||
1084 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
# Set the speed limit to 0 and wait for the scan to report "completed".
1086 do_facet $SINGLEMDS \
1087 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1088 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1089 mdd.${MDT_DEV}.lfsck_namespace |
1090 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1092 error "(11) unexpected status"
# Register the test with the framework.
1095 run_test 9b "LFSCK speed control (2)"
# Test 10 body: the file system must stay usable while namespace LFSCK is in
# its first scanning phase. Skipped when the MDT backend is not ldiskfs.
1099 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1100 skip "lookup(..)/linkea on ZFS issue" && return
# Even-numbered entries are created under fail_loc 0x1603 (LINKEA_CRASH).
1104 echo "Preparing more files with error at $(date)."
1105 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1106 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1108 for ((i = 0; i < 1000; i = $((i+2)))); do
1109 mkdir -p $DIR/$tdir/d${i}
1110 touch $DIR/$tdir/f${i}
1111 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd-numbered entries are created under fail_loc 0x1604 (LINKEA_MORE).
1114 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1115 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1117 for ((i = 1; i < 1000; i = $((i+2)))); do
1118 mkdir -p $DIR/$tdir/d${i}
1119 touch $DIR/$tdir/f${i}
1120 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1123 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1124 echo "Prepared at $(date)."
# Add a hard link to f200, then remount the client before scanning.
1126 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
1128 umount_client $MOUNT
1129 mount_client $MOUNT || error "(3) Fail to start client!"
# Start a speed-limited scan (-s 100) so that it is still in phase 1 while the
# operations below run.
1131 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1134 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1135 [ "$STATUS" == "scanning-phase1" ] ||
1136 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Exercise list, create, mkdir, unlink, recursive remove, rename, hard link
# and symlink while the scan is running; each one must succeed.
1138 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1140 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1142 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1144 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1146 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1148 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1150 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1152 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1153 error "(14) Fail to softlink!"
# The scan is expected to still be in phase 1 after those operations.
1155 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1156 [ "$STATUS" == "scanning-phase1" ] ||
1157 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set the speed limit to 0 and wait for the scan to report "completed".
1159 do_facet $SINGLEMDS \
1160 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1161 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1162 mdd.${MDT_DEV}.lfsck_namespace |
1163 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1165 error "(16) unexpected status"
# Register the test with the framework.
1168 run_test 10 "System is available during LFSCK scanning"
# Remove the LAST_ID file (and the d0/0 entry) below <mntpt>/O/<idx> on one
# OST backend, so that a later layout LFSCK run has something to rebuild.
#
# Arguments:
#   $1 - OST number; the facet used is "ost<N>"
#   $2 - name of the directory below <mntpt>/O/ that holds LAST_ID
# Returns:
#   0 on success
#   1 if an argument is missing or mount_fstype failed
#   2 if unmount_fstype failed
#   3 if the remote rm failed (the unmount is still attempted first)
ost_remove_lastid() {
	local ost=$1
	local idx=$2
	local facet="ost${ost}"
	local objdir
	local rc=0

	if [[ -z "$ost" || -z "$idx" ]]; then
		echo "usage: ost_remove_lastid <ost_number> <idx>" >&2
		return 1
	fi

	echo "remove LAST_ID on ost${ost}: idx=${idx}"

	# step 1: local mount
	mount_fstype "$facet" || return 1

	# step 2: remove the specified LAST_ID
	# Remember a failure instead of returning at once, so that the
	# unmount below is still attempted.
	objdir="$(facet_mntpt "$facet")/O/${idx}"
	do_facet "$facet" rm -fv "${objdir}/LAST_ID" "${objdir}/d0/0" || rc=3

	# step 3: umount
	unmount_fstype "$facet" || return 2

	return $rc
}
# Test 11a body: layout LFSCK on an OST rebuilds a LAST_ID that was removed.
# Files are created with a single stripe on OST index 0.
1187 check_mount_and_prep
1188 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1189 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Remove LAST_ID for idx 0 on ost1, then start ost1 with $MOUNT_OPTS_NOSCRUB.
1194 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1196 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1197 error "(2) Fail to start ost1"
# fail_loc 0x160e (DELAY4) with fail_val=3 is set on ost1 before the scan
# starts; the test then expects to observe the "crashed_lastid" flag.
1199 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1200 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1202 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1203 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1205 wait_update_facet ost1 "$LCTL get_param -n \
1206 obdfilter.${OST_DEV}.lfsck_layout |
1207 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1209 error "(5) unexpected status"
# Clear the injected delay and wait for the scan to report "completed".
1212 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1214 wait_update_facet ost1 "$LCTL get_param -n \
1215 obdfilter.${OST_DEV}.lfsck_layout |
1216 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1218 error "(6) unexpected status"
# After completion the flags field must be empty again.
1221 echo "the LAST_ID(s) should have been rebuilt"
1222 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1223 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
# Register the test with the framework.
1225 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b body: layout LFSCK on an OST rebuilds an on-disk LAST_ID that is
# behind the value last reported by the running OST.
1228 check_mount_and_prep
1229 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1231 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1232 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1233 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than the count returned by precreated_ost_obj_count
# (presumably the number of objects already precreated - verify).
1235 local count=$(precreated_ost_obj_count 0 0)
1237 createmany -o $DIR/$tdir/f $((count + 32))
# Record the last_id reported for the MDT's current sequence as lastid1.
1239 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1240 local seq=$(do_facet mds1 $LCTL get_param -n \
1241 osp.${proc_path}.prealloc_last_seq)
1242 local lastid1=$(do_facet ost1 "lctl get_param -n \
1243 obdfilter.${ost1_svc}.last_id" | grep $seq |
1244 awk -F: '{ print $2 }')
# Restart ost1 with fail_loc 0x215 (OST_ENOSPC) set.
1246 umount_client $MOUNT
1247 stop ost1 || error "(1) Fail to stop ost1"
1249 #define OBD_FAIL_OST_ENOSPC 0x215
1250 do_facet ost1 $LCTL set_param fail_loc=0x215
1252 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1253 error "(2) Fail to start ost1"
# Poll (at most 60 iterations) until last_id for the sequence is readable.
1255 for ((i = 0; i < 60; i++)); do
1256 lastid2=$(do_facet ost1 "lctl get_param -n \
1257 obdfilter.${ost1_svc}.last_id" | grep $seq |
1258 awk -F: '{ print $2 }')
1259 [ ! -z $lastid2 ] && break;
1263 echo "the on-disk LAST_ID should be smaller than the expected one"
1264 [ $lastid1 -gt $lastid2 ] ||
1265 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1267 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1268 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1270 wait_update_facet ost1 "$LCTL get_param -n \
1271 obdfilter.${OST_DEV}.lfsck_layout |
1272 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1274 error "(6) unexpected status"
# Restart ost1 once more and expect last_id to match lastid1 again.
1277 stop ost1 || error "(7) Fail to stop ost1"
1279 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1280 error "(8) Fail to start ost1"
1282 echo "the on-disk LAST_ID should have been rebuilt"
1283 wait_update_facet ost1 "$LCTL get_param -n \
1284 obdfilter.${ost1_svc}.last_id | grep $seq |
1285 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1286 do_facet ost1 $LCTL get_param -n \
1287 obdfilter.${ost1_svc}.last_id
1288 error "(9) expect lastid1 $seq:$lastid1"
# Clear the fault injection and stop all targets.
1291 do_facet ost1 $LCTL set_param fail_loc=0
1292 stopall || error "(10) Fail to stopall"
# Register the test with the framework.
1294 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12 body: one lctl command issued on mds1 with -A starts or stops LFSCK
# on all targets. The sequence start (-s 1), stop, restart (-s 0) is run first
# for namespace LFSCK and then for layout LFSCK. Needs at least two MDSes.
1297 [ $MDSCOUNT -lt 2 ] &&
1298 skip "We need at least 2 MDSes for test_12" && return
# Create one directory per MDT (index k - 1) with 100 files in each.
1300 check_mount_and_prep
1301 for k in $(seq $MDSCOUNT); do
1302 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1303 createmany -o $DIR/$tdir/${k}/f 100 ||
1304 error "(0) Fail to create 100 files."
1307 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1308 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1309 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1311 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1312 for k in $(seq $MDSCOUNT); do
1313 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1314 mdd.$(facet_svc mds${k}).lfsck_namespace |
1315 awk '/^status/ { print $2 }')
1316 [ "$STATUS" == "scanning-phase1" ] ||
1317 error "(3) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1320 echo "Stop namespace LFSCK on all targets by single lctl command."
1321 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1322 error "(4) Fail to stop LFSCK on all devices!"
1324 echo "All the LFSCK targets should be in 'stopped' status."
1325 for k in $(seq $MDSCOUNT); do
1326 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1327 mdd.$(facet_svc mds${k}).lfsck_namespace |
1328 awk '/^status/ { print $2 }')
1329 [ "$STATUS" == "stopped" ] ||
1330 error "(5) MDS${k} Expect 'stopped', but got '$STATUS'"
1333 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1334 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1335 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1337 echo "All the LFSCK targets should be in 'completed' status."
1338 for k in $(seq $MDSCOUNT); do
1339 wait_update_facet mds${k} "$LCTL get_param -n \
1340 mdd.$(facet_svc mds${k}).lfsck_namespace |
1341 awk '/^status/ { print \\\$2 }'" "completed" 8 ||
1342 error "(7) MDS${k} is not the expected 'completed'"
# Layout LFSCK part; full debug logging is on from here until the end.
1345 start_full_debug_logging
1347 echo "Start layout LFSCK on all targets by single command (-s 1)."
1348 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1349 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1351 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1352 for k in $(seq $MDSCOUNT); do
1353 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1354 mdd.$(facet_svc mds${k}).lfsck_layout |
1355 awk '/^status/ { print $2 }')
1356 [ "$STATUS" == "scanning-phase1" ] ||
1357 error "(9) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1360 echo "Stop layout LFSCK on all targets by single lctl command."
1361 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1362 error "(10) Fail to stop LFSCK on all devices!"
1364 echo "All the LFSCK targets should be in 'stopped' status."
1365 for k in $(seq $MDSCOUNT); do
1366 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1367 mdd.$(facet_svc mds${k}).lfsck_layout |
1368 awk '/^status/ { print $2 }')
1369 [ "$STATUS" == "stopped" ] ||
1370 error "(11) MDS${k} Expect 'stopped', but got '$STATUS'"
# For layout LFSCK the OSTs are checked for "stopped" as well.
1373 for k in $(seq $OSTCOUNT); do
1374 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1375 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1376 awk '/^status/ { print $2 }')
1377 [ "$STATUS" == "stopped" ] ||
1378 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1381 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1382 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1383 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1385 echo "All the LFSCK targets should be in 'completed' status."
1386 for k in $(seq $MDSCOUNT); do
1387 # The LFSCK status query internal is 30 seconds. For the case
1388 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1389 # time to guarantee the status sync up.
1390 wait_update_facet mds${k} "$LCTL get_param -n \
1391 mdd.$(facet_svc mds${k}).lfsck_layout |
1392 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1393 error "(14) MDS${k} is not the expected 'completed'"
1396 stop_full_debug_logging
# Register the test with the framework.
1398 run_test 12 "single command to trigger LFSCK on all devices"
# Test 13 body: layout LFSCK repairs a bad lmm_oi in the layout EA.
1402 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1403 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1404 echo "MDT-object FID."
1407 check_mount_and_prep
# Create 32 files while fail_loc 0x160f (BAD_LMMOI) is set on the MDS.
1409 echo "Inject failure stub to simulate bad lmm_oi"
1410 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1411 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1412 createmany -o $DIR/$tdir/f 32
1413 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1415 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1416 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1418 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1419 mdd.${MDT_DEV}.lfsck_layout |
1420 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1422 error "(2) unexpected status"
# All 32 files must be counted in the "repaired_others" field.
1425 local repaired=$($SHOW_LAYOUT |
1426 awk '/^repaired_others/ { print $2 }')
1427 [ $repaired -eq 32 ] ||
1428 error "(3) Fail to repair crashed lmm_oi: $repaired"
# Register the test with the framework.
1430 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14 body: layout LFSCK detects dangling references (MDT-objects whose
# OST-object is missing) and, when started with -c, repairs them.
1434 echo "The OST-object referenced by the MDT-object should be there;"
1435 echo "otherwise, the LFSCK should re-create the missing OST-object."
1438 check_mount_and_prep
1439 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# With fail_loc 0x1610 (DANGLING) set on ost1, create count + 31 files plus
# the "guard" file.
1441 echo "Inject failure stub to simulate dangling referenced MDT-object"
1442 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1443 do_facet ost1 $LCTL set_param fail_loc=0x1610
1444 local count=$(precreated_ost_obj_count 0 0)
1446 createmany -o $DIR/$tdir/f $((count + 31))
1447 touch $DIR/$tdir/guard
1448 do_facet ost1 $LCTL set_param fail_loc=0
1450 start_full_debug_logging
1452 # exhaust other pre-created dangling cases
1453 count=$(precreated_ost_obj_count 0 0)
1454 createmany -o $DIR/$tdir/a $count ||
1455 error "(0) Fail to create $count files."
1457 echo "'ls' should fail because of dangling referenced MDT-object"
1458 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First scan (no -c): at least 32 dangling references must be counted, and
# stat on the guard file must still fail afterwards.
1460 echo "Trigger layout LFSCK to find out dangling reference"
1461 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1463 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1464 mdd.${MDT_DEV}.lfsck_layout |
1465 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1467 error "(3) unexpected status"
1470 local repaired=$($SHOW_LAYOUT |
1471 awk '/^repaired_dangling/ { print $2 }')
1472 [ $repaired -ge 32 ] ||
1473 error "(4) Fail to repair dangling reference: $repaired"
1475 echo "'stat' should fail because of not repair dangling by default"
1476 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second scan with -c: afterwards stat on the guard file must report size 0.
1478 echo "Trigger layout LFSCK to repair dangling reference"
1479 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1481 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1482 mdd.${MDT_DEV}.lfsck_layout |
1483 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1485 error "(7) unexpected status"
1488 # There may be some async LFSCK updates in processing, wait for
1489 # a while until the target reparation has been done. LU-4970.
1491 echo "'stat' should success after layout LFSCK repairing"
1492 wait_update_facet client "stat $DIR/$tdir/guard |
1493 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1494 stat $DIR/$tdir/guard
1496 error "(8) unexpected size"
1499 repaired=$($SHOW_LAYOUT |
1500 awk '/^repaired_dangling/ { print $2 }')
1501 [ $repaired -ge 32 ] ||
1502 error "(9) Fail to repair dangling reference: $repaired"
1504 stop_full_debug_logging
# Register the test with the framework.
1506 run_test 14 "LFSCK can repair MDT-object with dangling reference"
# Test 15a body: layout LFSCK repairs an OST-object whose back pointer refers
# to a non-existent MDT-object.
1510 echo "If the OST-object referenced by the MDT-object back points"
1511 echo "to some non-exist MDT-object, then the LFSCK should repair"
1512 echo "the OST-object to back point to the right MDT-object."
1515 check_mount_and_prep
1516 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1518 echo "Inject failure stub to make the OST-object to back point to"
1519 echo "non-exist MDT-object."
1520 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
# Write 1 MiB to f0 and cancel the osc LRU locks while fail_loc 0x1611 is set
# on ost1.
1522 do_facet ost1 $LCTL set_param fail_loc=0x1611
1523 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1524 cancel_lru_locks osc
1525 do_facet ost1 $LCTL set_param fail_loc=0
1527 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1528 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1530 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1531 mdd.${MDT_DEV}.lfsck_layout |
1532 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1534 error "(2) unexpected status"
# Exactly one unmatched pair must be counted as repaired.
1537 local repaired=$($SHOW_LAYOUT |
1538 awk '/^repaired_unmatched_pair/ { print $2 }')
1539 [ $repaired -eq 1 ] ||
1540 error "(3) Fail to repair unmatched pair: $repaired"
# Register the test with the framework.
1542 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b body: layout LFSCK repairs an OST-object whose back pointer refers
# to a different MDT-object that does not reference it.
1546 echo "If the OST-object referenced by the MDT-object back points"
1547 echo "to other MDT-object that doesn't recognize the OST-object,"
1548 echo "then the LFSCK should repair it to back point to the right"
1549 echo "MDT-object (the first one)."
# The "guard" file is written before any fault injection.
1552 check_mount_and_prep
1553 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1554 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1555 cancel_lru_locks osc
1557 echo "Inject failure stub to make the OST-object to back point to"
1558 echo "other MDT-object"
# f0 is written while fail_loc 0x1612 (UNMATCHED_PAIR2) is set on ost1.
1560 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1561 do_facet ost1 $LCTL set_param fail_loc=0x1612
1562 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1563 cancel_lru_locks osc
1564 do_facet ost1 $LCTL set_param fail_loc=0
1566 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1567 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1569 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1570 mdd.${MDT_DEV}.lfsck_layout |
1571 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1573 error "(2) unexpected status"
# Exactly one unmatched pair must be counted as repaired.
1576 local repaired=$($SHOW_LAYOUT |
1577 awk '/^repaired_unmatched_pair/ { print $2 }')
1578 [ $repaired -eq 1 ] ||
1579 error "(3) Fail to repair unmatched pair: $repaired"
# Register the test with the framework.
1581 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c body: layout LFSCK racing with a delayed directory migration must
# not treat the shared layout as a multiple-reference case. Needs at least
# two MDSes.
1584 [ $MDSCOUNT -lt 2 ] &&
1585 skip "We need at least 2 MDSes for this test" && return
1588 echo "According to current metadata migration implementation,"
1589 echo "before the old MDT-object is removed, both the new MDT-object"
1590 echo "and old MDT-object will reference the same LOV layout. Then if"
1591 echo "the layout LFSCK finds the new MDT-object by race, it will"
1592 echo "regard related OST-object(s) as multiple referenced case, and"
1593 echo "will try to create new OST-object(s) for the new MDT-object."
1594 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1595 echo "MDT-object before confirm the multiple referenced case."
# Directory a1 is created on MDT index 1 and holds one 1 MiB file.
1598 check_mount_and_prep
1599 $LFS mkdir -i 1 $DIR/$tdir/a1
1600 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1601 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1602 cancel_lru_locks osc
1604 echo "Inject failure stub on MDT1 to delay the migration"
1606 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1607 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# The migration to MDT index 0 runs in the background so that the LFSCK
# started below overlaps with it.
1608 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1609 $LFS mv -M 0 $DIR/$tdir/a1 &
1612 echo "Trigger layout LFSCK to race with the migration"
1613 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1615 for k in $(seq $MDSCOUNT); do
1616 # The LFSCK status query internal is 30 seconds. For the case
1617 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1618 # time to guarantee the status sync up.
1619 wait_update_facet mds${k} "$LCTL get_param -n \
1620 mdd.$(facet_svc mds${k}).lfsck_layout |
1621 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1622 error "(2) MDS${k} is not the expected 'completed'"
# Expect one repaired unmatched pair and no multiple-reference repair.
1625 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1626 local repaired=$($SHOW_LAYOUT |
1627 awk '/^repaired_unmatched_pair/ { print $2 }')
1628 [ $repaired -eq 1 ] ||
1629 error "(3) Fail to repair unmatched pair: $repaired"
1631 repaired=$($SHOW_LAYOUT |
1632 awk '/^repaired_multiple_referenced/ { print $2 }')
1633 [ $repaired -eq 0 ] ||
1634 error "(4) Unexpectedly repaird multiple references: $repaired"
# Register the test with the framework.
1636 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16 body: layout LFSCK repairs an OST-object whose owner differs from
# the owner stored in the MDT-object.
1640 echo "If the OST-object's owner information does not match the owner"
1641 echo "information stored in the MDT-object, then the LFSCK trust the"
1642 echo "MDT-object and update the OST-object's owner information."
1645 check_mount_and_prep
1646 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1647 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1648 cancel_lru_locks osc
# chown to 1.1 is issued while fail_loc 0x1613 (BAD_OWNER) is set on the MDS.
1650 echo "Inject failure stub to skip OST-object owner changing"
1651 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1652 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1653 chown 1.1 $DIR/$tdir/f0
1654 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1656 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1659 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1661 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1662 mdd.${MDT_DEV}.lfsck_layout |
1663 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1665 error "(2) unexpected status"
# Exactly one inconsistent owner must be counted as repaired.
1668 local repaired=$($SHOW_LAYOUT |
1669 awk '/^repaired_inconsistent_owner/ { print $2 }')
1670 [ $repaired -eq 1 ] ||
1671 error "(3) Fail to repair inconsistent owner: $repaired"
# Register the test with the framework.
1673 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17 body: layout LFSCK repairs the case where two MDT-objects reference
# the same OST-object.
1677 echo "If more than one MDT-objects reference the same OST-object,"
1678 echo "and the OST-object only recognizes one MDT-object, then the"
1679 echo "LFSCK should create new OST-objects for such non-recognized"
1683 check_mount_and_prep
1684 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1686 echo "Inject failure stub to make two MDT-objects to refernce"
1687 echo "the OST-object"
# "guard" (1 MiB) and f0 are both created while fail_loc 0x1614
# (MULTIPLE_REF) is set on the MDS.
1689 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1690 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1692 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1693 cancel_lru_locks osc
1695 createmany -o $DIR/$tdir/f 1
1697 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1699 cancel_lru_locks mdc
1700 cancel_lru_locks osc
# Before repair f0 is expected to report guard's size of 1048576 bytes.
1702 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1703 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1704 [ $size -eq 1048576 ] ||
1705 error "(1) f0 (wrong) size should be 1048576, but got $size"
1707 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1710 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1712 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1713 mdd.${MDT_DEV}.lfsck_layout |
1714 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1716 error "(3) unexpected status"
1719 local repaired=$($SHOW_LAYOUT |
1720 awk '/^repaired_multiple_referenced/ { print $2 }')
1721 [ $repaired -eq 1 ] ||
1722 error "(4) Fail to repair multiple references: $repaired"
# After repair, writing 2 MiB to f0 must leave guard's size unchanged.
1724 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1725 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1726 error "(5) Fail to write f0."
1727 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1728 [ $size -eq 1048576 ] ||
1729 error "(6) guard size should be 1048576, but got $size"
# Register the test with the framework.
1731 run_test 17 "LFSCK can repair multiple references"
# Add the "cache" flag to the local debug mask; the command output is dropped.
1733 $LCTL set_param debug=+cache > /dev/null
# Test 18a body: the MDT-object exists but has lost its layout EA; layout
# LFSCK started with -o must regenerate it from the orphan OST-objects.
1737 echo "The target MDT-object is there, but related stripe information"
1738 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1739 echo "layout EA entries."
# a1/f1: directory on MDT index 0, one stripe, 2 MiB of data.
1742 check_mount_and_prep
1743 $LFS mkdir -i 0 $DIR/$tdir/a1
1744 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1745 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# Field 6 of "ls -il" output is the file size.
1747 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1749 $LFS path2fid $DIR/$tdir/a1/f1
1750 $LFS getstripe $DIR/$tdir/a1/f1
# a2/f2: only with two or more MDSes; directory on MDT index 1, two stripes.
1752 if [ $MDSCOUNT -ge 2 ]; then
1753 $LFS mkdir -i 1 $DIR/$tdir/a2
1754 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1755 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1756 $LFS path2fid $DIR/$tdir/a2/f2
1757 $LFS getstripe $DIR/$tdir/a2/f2
1760 cancel_lru_locks osc
# chown is issued while fail_loc 0x1615 (LOST_STRIPE) is set on the MDS.
1762 echo "Inject failure, to make the MDT-object lost its layout EA"
1763 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1764 do_facet mds1 $LCTL set_param fail_loc=0x1615
1765 chown 1.1 $DIR/$tdir/a1/f1
1767 if [ $MDSCOUNT -ge 2 ]; then
1768 do_facet mds2 $LCTL set_param fail_loc=0x1615
1769 chown 1.1 $DIR/$tdir/a2/f2
1775 do_facet mds1 $LCTL set_param fail_loc=0
1776 if [ $MDSCOUNT -ge 2 ]; then
1777 do_facet mds2 $LCTL set_param fail_loc=0
1780 cancel_lru_locks mdc
1781 cancel_lru_locks osc
1783 echo "The file size should be incorrect since layout EA is lost"
1784 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1785 [ "$cur_size" != "$saved_size" ] ||
1786 error "(1) Expect incorrect file1 size"
1788 if [ $MDSCOUNT -ge 2 ]; then
1789 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1790 [ "$cur_size" != "$saved_size" ] ||
1791 error "(2) Expect incorrect file2 size"
1794 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1795 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
1797 for k in $(seq $MDSCOUNT); do
1798 # The LFSCK status query internal is 30 seconds. For the case
1799 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1800 # time to guarantee the status sync up.
1801 wait_update_facet mds${k} "$LCTL get_param -n \
1802 mdd.$(facet_svc mds${k}).lfsck_layout |
1803 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1804 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well.
1807 for k in $(seq $OSTCOUNT); do
1808 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1809 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1810 awk '/^status/ { print $2 }')
1811 [ "$cur_status" == "completed" ] ||
1812 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counts: 1 on mds1 and 2 on mds2.
1815 local repaired=$(do_facet mds1 $LCTL get_param -n \
1816 mdd.$(facet_svc mds1).lfsck_layout |
1817 awk '/^repaired_orphan/ { print $2 }')
1818 [ $repaired -eq 1 ] ||
1819 error "(6.1) Expect 1 fixed on mds1, but got: $repaired"
1821 if [ $MDSCOUNT -ge 2 ]; then
1822 repaired=$(do_facet mds2 $LCTL get_param -n \
1823 mdd.$(facet_svc mds2).lfsck_layout |
1824 awk '/^repaired_orphan/ { print $2 }')
1825 [ $repaired -eq 2 ] ||
1826 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
1829 $LFS path2fid $DIR/$tdir/a1/f1
1830 $LFS getstripe $DIR/$tdir/a1/f1
1832 if [ $MDSCOUNT -ge 2 ]; then
1833 $LFS path2fid $DIR/$tdir/a2/f2
1834 $LFS getstripe $DIR/$tdir/a2/f2
# Both files must report the size saved before the fault injection.
1837 echo "The file size should be correct after layout LFSCK scanning"
1838 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1839 [ "$cur_size" == "$saved_size" ] ||
1840 error "(7) Expect file1 size $saved_size, but got $cur_size"
1842 if [ $MDSCOUNT -ge 2 ]; then
1843 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1844 [ "$cur_size" == "$saved_size" ] ||
1845 error "(8) Expect file2 size $saved_size, but got $cur_size"
# Register the test with the framework.
1848 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test 18b body: the MDT-object itself is lost; layout LFSCK started with -o
# must re-create it under .lustre/lost+found/MDTxxxx, from where the test
# moves it back into the namespace by its saved FID.
1852 echo "The target MDT-object is lost. The LFSCK should re-create the"
1853 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
1854 echo "can move it back to normal namespace manually."
# a1/f1: directory on MDT index 0, one stripe, 2 MiB; size and FID are saved.
1857 check_mount_and_prep
1858 $LFS mkdir -i 0 $DIR/$tdir/a1
1859 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1860 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1861 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1862 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
1864 $LFS getstripe $DIR/$tdir/a1/f1
# a2/f2: only with two or more MDSes; directory on MDT index 1, two stripes.
1866 if [ $MDSCOUNT -ge 2 ]; then
1867 $LFS mkdir -i 1 $DIR/$tdir/a2
1868 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1869 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1870 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
1872 $LFS getstripe $DIR/$tdir/a2/f2
1875 cancel_lru_locks osc
# The files are removed while fail_loc 0x1616 (LOST_MDTOBJ) is set on the MDS.
1877 echo "Inject failure, to simulate the case of missing the MDT-object"
1878 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1879 do_facet mds1 $LCTL set_param fail_loc=0x1616
1880 rm -f $DIR/$tdir/a1/f1
1882 if [ $MDSCOUNT -ge 2 ]; then
1883 do_facet mds2 $LCTL set_param fail_loc=0x1616
1884 rm -f $DIR/$tdir/a2/f2
1890 do_facet mds1 $LCTL set_param fail_loc=0
1891 if [ $MDSCOUNT -ge 2 ]; then
1892 do_facet mds2 $LCTL set_param fail_loc=0
1895 cancel_lru_locks mdc
1896 cancel_lru_locks osc
1898 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1899 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1901 for k in $(seq $MDSCOUNT); do
1902 # The LFSCK status query internal is 30 seconds. For the case
1903 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1904 # time to guarantee the status sync up.
1905 wait_update_facet mds${k} "$LCTL get_param -n \
1906 mdd.$(facet_svc mds${k}).lfsck_layout |
1907 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1908 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well.
1911 for k in $(seq $OSTCOUNT); do
1912 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1913 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1914 awk '/^status/ { print $2 }')
1915 [ "$cur_status" == "completed" ] ||
1916 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counts: 1 on mds1 and 2 on mds2.
1919 local repaired=$(do_facet mds1 $LCTL get_param -n \
1920 mdd.$(facet_svc mds1).lfsck_layout |
1921 awk '/^repaired_orphan/ { print $2 }')
1922 [ $repaired -eq 1 ] ||
1923 error "(4.1) Expect 1 fixed on mds1, but got: $repaired"
1925 if [ $MDSCOUNT -ge 2 ]; then
1926 repaired=$(do_facet mds2 $LCTL get_param -n \
1927 mdd.$(facet_svc mds2).lfsck_layout |
1928 awk '/^repaired_orphan/ { print $2 }')
1929 [ $repaired -eq 2 ] ||
1930 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# The re-created entries are looked up as "<fid>-R-0" below the per-MDT
# lost+found directory and moved back to their original paths.
1933 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
1934 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
1935 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
1937 if [ $MDSCOUNT -ge 2 ]; then
1938 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
1939 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
1942 $LFS path2fid $DIR/$tdir/a1/f1
1943 $LFS getstripe $DIR/$tdir/a1/f1
1945 if [ $MDSCOUNT -ge 2 ]; then
1946 $LFS path2fid $DIR/$tdir/a2/f2
1947 $LFS getstripe $DIR/$tdir/a2/f2
# Both files must report the size saved before the fault injection.
1950 echo "The file size should be correct after layout LFSCK scanning"
1951 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1952 [ "$cur_size" == "$saved_size" ] ||
1953 error "(7) Expect file1 size $saved_size, but got $cur_size"
1955 if [ $MDSCOUNT -ge 2 ]; then
1956 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1957 [ "$cur_size" == "$saved_size" ] ||
1958 error "(8) Expect file2 size $saved_size, but got $cur_size"
# Register the test with the framework.
1961 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test 18c: the MDT-object is lost and the OST-object has no parent FID.
# Flow: create files while fail_loc 0x1617 is set on ost1, unlink them while
# fail_loc 0x1616 is set on the MDS(es), run the layout LFSCK with -r -o,
# then check the status on every MDT/OST, the repaired_orphan counters and
# the *-N-* entries under .lustre/lost+found.
1965 echo "The target MDT-object is lost, and the OST-object FID is missing."
1966 echo "The LFSCK should re-create the MDT-object with new FID under the "
1967 echo "directory .lustre/lost+found/MDTxxxx."
1970 check_mount_and_prep
1971 $LFS mkdir -i 0 $DIR/$tdir/a1
1972 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1974 echo "Inject failure, to simulate the case of missing parent FID"
1975 #define OBD_FAIL_LFSCK_NOPFID 0x1617
1976 do_facet ost1 $LCTL set_param fail_loc=0x1617
1978 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1979 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDTs, repeat the setup in a directory created on MDT index 1.
1981 if [ $MDSCOUNT -ge 2 ]; then
1982 $LFS mkdir -i 1 $DIR/$tdir/a2
1983 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a2
1984 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1985 $LFS getstripe $DIR/$tdir/a2/f2
1988 cancel_lru_locks osc
1990 echo "Inject failure, to simulate the case of missing the MDT-object"
1991 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1992 do_facet mds1 $LCTL set_param fail_loc=0x1616
1993 rm -f $DIR/$tdir/a1/f1
1995 if [ $MDSCOUNT -ge 2 ]; then
1996 do_facet mds2 $LCTL set_param fail_loc=0x1616
1997 rm -f $DIR/$tdir/a2/f2
# Clear the failure injection on the MDS(es) before starting the LFSCK.
2003 do_facet mds1 $LCTL set_param fail_loc=0
2004 if [ $MDSCOUNT -ge 2 ]; then
2005 do_facet mds2 $LCTL set_param fail_loc=0
2008 cancel_lru_locks mdc
2009 cancel_lru_locks osc
2011 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2012 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2014 for k in $(seq $MDSCOUNT); do
2015 # The LFSCK status query interval is 30 seconds. For the case
2016 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2017 # time to guarantee the status sync up.
2018 wait_update_facet mds${k} "$LCTL get_param -n \
2019 mdd.$(facet_svc mds${k}).lfsck_layout |
2020 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2021 error "(2) MDS${k} is not the expected 'completed'"
2024 for k in $(seq $OSTCOUNT); do
2025 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2026 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2027 awk '/^status/ { print $2 }')
2028 [ "$cur_status" == "completed" ] ||
2029 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2032 if [ $MDSCOUNT -ge 2 ]; then
# Compare the repaired_orphan counter reported by mds1 with $expected.
2038 local repaired=$(do_facet mds1 $LCTL get_param -n \
2039 mdd.$(facet_svc mds1).lfsck_layout |
2040 awk '/^repaired_orphan/ { print $2 }')
2041 [ $repaired -eq $expected ] ||
2042 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2044 if [ $MDSCOUNT -ge 2 ]; then
2045 repaired=$(do_facet mds2 $LCTL get_param -n \
2046 mdd.$(facet_svc mds2).lfsck_layout |
2047 awk '/^repaired_orphan/ { print $2 }')
2048 [ $repaired -eq 0 ] ||
2049 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2052 ls -ail $MOUNT/.lustre/lost+found/
2054 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2055 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
# The -name pattern is quoted so the shell hands it to find unexpanded.
2056 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-N-*")
2058 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2061 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2062 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2063 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Quoted for the same reason as the MDT0001 lookup.
2065 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-N-*")
2066 [ ! -z "$cname" ] ||
2067 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2069 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d: f2 is made to reference the OST-object of f1 (fail_loc 0x1618),
# then f1 is removed so that f2 becomes a dangling reference while the old
# OST-object of f2 still exists. The size of f2 must differ from the saved
# size before the scan; after the layout LFSCK (-r -o -c) the test expects
# repaired_orphan to be 1 and the size of f2 to match the saved size again.
2073 echo "The target MDT-object layout EA slot is occpuied by some new"
2074 echo "created OST-object when repair dangling reference case. Such"
2075 echo "conflict OST-object has never been modified. Then when found"
2076 echo "the orphan OST-object, LFSCK will replace it with the orphan"
2080 check_mount_and_prep
2082 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2083 echo "guard" > $DIR/$tdir/a1/f1
2084 echo "foo" > $DIR/$tdir/a1/f2
# Remember the size of f2 (sixth column of ls -il) for the later comparisons.
2085 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2086 $LFS path2fid $DIR/$tdir/a1/f1
2087 $LFS getstripe $DIR/$tdir/a1/f1
2088 $LFS path2fid $DIR/$tdir/a1/f2
2089 $LFS getstripe $DIR/$tdir/a1/f2
2090 cancel_lru_locks osc
2092 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2093 echo "to reference the same OST-object (which is f1's OST-obejct)."
2094 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2095 echo "dangling reference case, but f2's old OST-object is there."
2098 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2099 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
# owner:group uses the portable colon separator; the dot form is obsolete.
2100 chown 1:1 $DIR/$tdir/a1/f2
2101 rm -f $DIR/$tdir/a1/f1
2104 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2106 echo "stopall to cleanup object cache"
2109 setupall > /dev/null
2111 echo "The file size should be incorrect since dangling referenced"
2112 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2113 [ "$cur_size" != "$saved_size" ] ||
2114 error "(1) Expect incorrect file2 size"
2116 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2117 do_facet $SINGLEMDS $LCTL set_param fail_val=5 fail_loc=0x1602
2119 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2120 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait for mds1 to report scanning-phase2 before clearing the injected delay.
2122 wait_update_facet mds1 "$LCTL get_param -n \
2123 mdd.$(facet_svc mds1).lfsck_layout |
2124 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2125 error "(3.0) MDS1 is not the expected 'scanning-phase2'"
2127 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2129 for k in $(seq $MDSCOUNT); do
2130 # The LFSCK status query interval is 30 seconds. For the case
2131 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2132 # time to guarantee the status sync up.
2133 wait_update_facet mds${k} "$LCTL get_param -n \
2134 mdd.$(facet_svc mds${k}).lfsck_layout |
2135 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2136 error "(3) MDS${k} is not the expected 'completed'"
2139 for k in $(seq $OSTCOUNT); do
2140 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2141 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2142 awk '/^status/ { print $2 }')
2143 [ "$cur_status" == "completed" ] ||
2144 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
2147 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2148 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2149 awk '/^repaired_orphan/ { print $2 }')
2150 [ $repaired -eq 1 ] ||
2151 error "(5) Expect 1 orphan has been fixed, but got: $repaired"
2153 echo "The file size should be correct after layout LFSCK scanning"
2154 cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2155 [ "$cur_size" == "$saved_size" ] ||
2156 error "(6) Expect file2 size $saved_size, but got $cur_size"
2158 echo "The LFSCK should find back the original data."
2159 cat $DIR/$tdir/a1/f2
2160 $LFS path2fid $DIR/$tdir/a1/f2
2161 $LFS getstripe $DIR/$tdir/a1/f2
2163 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e: same setup as the dangling-reference case (fail_loc 0x1618), but
# f2 is appended to while mds1 reports scanning-phase2. The test then expects
# repaired_orphan to be 1 and a *-C-* entry under .lustre/lost+found/MDT0000
# whose size equals the size f2 had before the failure injection.
2167 echo "The target MDT-object layout EA slot is occpuied by some new"
2168 echo "created OST-object when repair dangling reference case. Such"
2169 echo "conflict OST-object has been modified by others. To keep the"
2170 echo "new data, the LFSCK will create a new file to refernece this"
2171 echo "old orphan OST-object."
2174 check_mount_and_prep
2176 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2177 echo "guard" > $DIR/$tdir/a1/f1
2178 echo "foo" > $DIR/$tdir/a1/f2
# Remember the size of f2 (sixth column of ls -il) for the later comparisons.
2179 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2180 $LFS path2fid $DIR/$tdir/a1/f1
2181 $LFS getstripe $DIR/$tdir/a1/f1
2182 $LFS path2fid $DIR/$tdir/a1/f2
2183 $LFS getstripe $DIR/$tdir/a1/f2
2184 cancel_lru_locks osc
2186 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2187 echo "to reference the same OST-object (which is f1's OST-obejct)."
2188 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2189 echo "dangling reference case, but f2's old OST-object is there."
2192 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2193 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
# owner:group uses the portable colon separator; the dot form is obsolete.
2194 chown 1:1 $DIR/$tdir/a1/f2
2195 rm -f $DIR/$tdir/a1/f1
2198 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2200 echo "stopall to cleanup object cache"
2203 setupall > /dev/null
2205 echo "The file size should be incorrect since dangling referenced"
2206 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2207 [ "$cur_size" != "$saved_size" ] ||
2208 error "(1) Expect incorrect file2 size"
2210 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2211 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2213 start_full_debug_logging
2215 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2216 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait for mds1 to report scanning-phase2 before f2 is modified below.
2218 wait_update_facet mds1 "$LCTL get_param -n \
2219 mdd.$(facet_svc mds1).lfsck_layout |
2220 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2221 error "(3) MDS1 is not the expected 'scanning-phase2'"
2223 # to guarantee all updates are synced.
2227 echo "Write new data to f2 to modify the new created OST-object."
2228 echo "dummy" >> $DIR/$tdir/a1/f2
2230 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2232 for k in $(seq $MDSCOUNT); do
2233 # The LFSCK status query interval is 30 seconds. For the case
2234 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2235 # time to guarantee the status sync up.
2236 wait_update_facet mds${k} "$LCTL get_param -n \
2237 mdd.$(facet_svc mds${k}).lfsck_layout |
2238 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2239 error "(4) MDS${k} is not the expected 'completed'"
2242 for k in $(seq $OSTCOUNT); do
2243 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2244 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2245 awk '/^status/ { print $2 }')
2246 [ "$cur_status" == "completed" ] ||
2247 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2250 stop_full_debug_logging
2252 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2253 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2254 awk '/^repaired_orphan/ { print $2 }')
2255 [ $repaired -eq 1 ] ||
2256 error "(6) Expect 1 orphan has been fixed, but got: $repaired"
2258 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2259 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2260 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The -name pattern is quoted so the shell hands it to find unexpanded.
2262 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*")
2263 [ ! -z "$cname" ] ||
2264 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2266 echo "The stub file should keep the original f2 data"
2267 cur_size=$(ls -il $cname | awk '{ print $6 }')
2268 [ "$cur_size" == "$saved_size" ] ||
2269 error "(9) Expect file2 size $saved_size, but got $cur_size"
2272 $LFS path2fid $cname
2273 $LFS getstripe $cname
2275 echo "The f2 should contains new data."
2276 cat $DIR/$tdir/a1/f2
2277 $LFS path2fid $DIR/$tdir/a1/f2
2278 $LFS getstripe $DIR/$tdir/a1/f2
2280 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f: skipped when fewer than 2 OSTs are configured.
# Files are created with stripe count 1 and 2 starting at OST index 0, then
# unlinked while fail_loc 0x1616 is set on the MDS(es). The first layout
# LFSCK runs with fail_loc 0x161c set on mds1 and is expected to end as
# partial on every MDT and on ost1, completed on ost2, with repaired_orphan
# equal to 1. A second run without the injection must complete everywhere
# and report repaired_orphan equal to 2.
2283 [ $OSTCOUNT -lt 2 ] &&
2284 skip "The test needs at least 2 OSTs" && return
2287 echo "The target MDT-object is lost. The LFSCK should re-create the"
2288 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2289 echo "to verify some OST-object(s) during the first stage-scanning,"
2290 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2291 echo "should not be affected."
2294 check_mount_and_prep
2295 $LFS mkdir -i 0 $DIR/$tdir/a1
2296 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2297 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2298 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2299 $LFS mkdir -i 0 $DIR/$tdir/a2
2300 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2301 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2302 $LFS getstripe $DIR/$tdir/a1/f1
2303 $LFS getstripe $DIR/$tdir/a2/f2
# With two or more MDTs, build the same layout in directories on MDT index 1.
2305 if [ $MDSCOUNT -ge 2 ]; then
2306 $LFS mkdir -i 1 $DIR/$tdir/a3
2307 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a3
2308 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2309 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2310 $LFS mkdir -i 1 $DIR/$tdir/a4
2311 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2312 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2313 $LFS getstripe $DIR/$tdir/a3/f3
2314 $LFS getstripe $DIR/$tdir/a4/f4
2317 cancel_lru_locks osc
2319 echo "Inject failure, to simulate the case of missing the MDT-object"
2320 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2321 do_facet mds1 $LCTL set_param fail_loc=0x1616
2322 rm -f $DIR/$tdir/a1/f1
2323 rm -f $DIR/$tdir/a2/f2
2325 if [ $MDSCOUNT -ge 2 ]; then
2326 do_facet mds2 $LCTL set_param fail_loc=0x1616
2327 rm -f $DIR/$tdir/a3/f3
2328 rm -f $DIR/$tdir/a4/f4
2334 do_facet mds1 $LCTL set_param fail_loc=0
2335 if [ $MDSCOUNT -ge 2 ]; then
2336 do_facet mds2 $LCTL set_param fail_loc=0
2339 cancel_lru_locks mdc
2340 cancel_lru_locks osc
2342 echo "Inject failure, to simulate the OST0 fail to handle"
2343 echo "MDT0 LFSCK request during the first-stage scanning."
2344 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2345 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2347 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2348 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2350 for k in $(seq $MDSCOUNT); do
2351 # The LFSCK status query interval is 30 seconds. For the case
2352 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2353 # time to guarantee the status sync up.
2354 wait_update_facet mds${k} "$LCTL get_param -n \
2355 mdd.$(facet_svc mds${k}).lfsck_layout |
2356 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2357 error "(2) MDS${k} is not the expected 'partial'"
2360 wait_update_facet ost1 "$LCTL get_param -n \
2361 obdfilter.$(facet_svc ost1).lfsck_layout |
2362 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2363 error "(3) OST1 is not the expected 'partial'"
2366 wait_update_facet ost2 "$LCTL get_param -n \
2367 obdfilter.$(facet_svc ost2).lfsck_layout |
2368 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2369 error "(4) OST2 is not the expected 'completed'"
2372 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2374 local repaired=$(do_facet mds1 $LCTL get_param -n \
2375 mdd.$(facet_svc mds1).lfsck_layout |
2376 awk '/^repaired_orphan/ { print $2 }')
2377 [ $repaired -eq 1 ] ||
2378 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2380 if [ $MDSCOUNT -ge 2 ]; then
2381 repaired=$(do_facet mds2 $LCTL get_param -n \
2382 mdd.$(facet_svc mds2).lfsck_layout |
2383 awk '/^repaired_orphan/ { print $2 }')
2384 [ $repaired -eq 1 ] ||
2385 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
2388 echo "Trigger layout LFSCK on all devices again to cleanup"
2389 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2391 for k in $(seq $MDSCOUNT); do
2392 # The LFSCK status query interval is 30 seconds. For the case
2393 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2394 # time to guarantee the status sync up.
2395 wait_update_facet mds${k} "$LCTL get_param -n \
2396 mdd.$(facet_svc mds${k}).lfsck_layout |
2397 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2398 error "(8) MDS${k} is not the expected 'completed'"
2401 for k in $(seq $OSTCOUNT); do
# Declared local so the status value does not leak out of the test.
2402 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2403 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2404 awk '/^status/ { print $2 }')
2405 [ "$cur_status" == "completed" ] ||
2406 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
2410 local repaired=$(do_facet mds1 $LCTL get_param -n \
2411 mdd.$(facet_svc mds1).lfsck_layout |
2412 awk '/^repaired_orphan/ { print $2 }')
2413 [ $repaired -eq 2 ] ||
2414 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2416 if [ $MDSCOUNT -ge 2 ]; then
2417 repaired=$(do_facet mds2 $LCTL get_param -n \
2418 mdd.$(facet_svc mds2).lfsck_layout |
2419 awk '/^repaired_orphan/ { print $2 }')
2420 [ $repaired -eq 2 ] ||
2421 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2424 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Adjust the debug setting on the node running this script; the command
# output is discarded.
# NOTE(review): presumably the leading minus removes the cache flag from the
# debug mask - confirm against the lctl documentation.
2426 $LCTL set_param debug=-cache > /dev/null
# Test 19a: writes two files striped on OST index 0, sets lfsck_verify_pfid
# to 1 on OST0000, then sets fail_loc 0x1619 on the node running this script
# and expects reading a0 to fail. fail_loc is reset to 0 afterwards.
2429 check_mount_and_prep
2430 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2432 echo "foo" > $DIR/$tdir/a0
2433 echo "guard" > $DIR/$tdir/a1
2434 cancel_lru_locks osc
2436 echo "Inject failure, then client will offer wrong parent FID when read"
2437 do_facet ost1 $LCTL set_param -n \
2438 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2439 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
# No do_facet here: the fail_loc is set locally, not on a server facet.
2440 $LCTL set_param fail_loc=0x1619
2442 echo "Read RPC with wrong parent FID should be denied"
# A successful cat is the failure case for this test.
2443 cat $DIR/$tdir/a0 && error "(3) Read should be denied!"
2444 $LCTL set_param fail_loc=0
2446 run_test 19a "OST-object inconsistency self detect"
# Test 19b: creates f0 while fail_loc 0x1611 is set on ost1, then reads the
# repaired counter from lfsck_verify_pfid on OST0000 and expects 0. After
# writing 1 to lfsck_verify_pfid, reading f0 must succeed and the repaired
# counter is expected to be 1.
2449 check_mount_and_prep
2450 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2452 echo "Inject failure stub to make the OST-object to back point to"
2453 echo "non-exist MDT-object"
2455 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2456 do_facet ost1 $LCTL set_param fail_loc=0x1611
2457 echo "foo" > $DIR/$tdir/f0
2458 cancel_lru_locks osc
2459 do_facet ost1 $LCTL set_param fail_loc=0
2461 echo "Nothing should be fixed since self detect and repair is disabled"
# The second field of the line starting with repaired is the counter.
2462 local repaired=$(do_facet ost1 $LCTL get_param -n \
2463 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2464 awk '/^repaired/ { print $2 }')
2465 [ $repaired -eq 0 ] ||
2466 error "(1) Expected 0 repaired, but got $repaired"
2468 echo "Read RPC with right parent FID should be accepted,"
2469 echo "and cause parent FID on OST to be fixed"
2471 do_facet ost1 $LCTL set_param -n \
2472 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2473 cat $DIR/$tdir/f0 || error "(2) Read should not be denied!"
# Re-read the counter after the read that is expected to trigger the repair.
2475 repaired=$(do_facet ost1 $LCTL get_param -n \
2476 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2477 awk '/^repaired/ { print $2 }')
2478 [ $repaired -eq 1 ] ||
2479 error "(3) Expected 1 repaired, but got $repaired"
2481 run_test 19b "OST-object inconsistency self repair"
# Test 20: skipped when fewer than 2 OSTs are configured.
# Files f0..f2 (and f3 with more than 2 OSTs) are written with stripe count
# 3 or 2, then unlinked under fail_loc 0x1616 (f0) or 0x161a with different
# fail_val values (f1..f3). The client is unmounted, the layout LFSCK runs
# with fail_loc 0x161b set on ost1, and repaired_orphan on mds1 is expected
# to be 9 (more than 2 OSTs) or 4. After remounting, each ${fidN}-R-0 entry
# under .lustre/lost+found/MDT0000 is checked for the LOV_PATTERN_F_HOLE
# flag, stripe count, size and read/write/chown/touch/unlink behaviour.
2484 [ $OSTCOUNT -lt 2 ] &&
2485 skip "The test needs at least 2 OSTs" && return
2488 echo "The target MDT-object and some of its OST-object are lost."
2489 echo "The LFSCK should find out the left OST-objects and re-create"
2490 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2491 echo "with the partial OST-objects (LOV EA hole)."
2493 echo "New client can access the file with LOV EA hole via normal"
2494 echo "system tools or commands without crash the system."
2496 echo "For old client, even though it cannot access the file with"
2497 echo "LOV EA hole, it should not cause the system crash."
2500 check_mount_and_prep
2501 $LFS mkdir -i 0 $DIR/$tdir/a1
2502 if [ $OSTCOUNT -gt 2 ]; then
2503 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
2506 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
2510 # 256 blocks on the stripe0.
2511 # 1 block on the stripe1 for 2 OSTs case.
2512 # 256 blocks on the stripe1 for other cases.
2513 # 1 block on the stripe2 if OSTs > 2
2514 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2515 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2516 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# The FIDs are saved now because the lost+found names checked later are
# built from them.
2518 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2519 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2520 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2523 $LFS getstripe $DIR/$tdir/a1/f0
2525 $LFS getstripe $DIR/$tdir/a1/f1
2527 $LFS getstripe $DIR/$tdir/a1/f2
2529 if [ $OSTCOUNT -gt 2 ]; then
2530 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
# Declared local like fid0..fid2; local is function scoped, so the value is
# still visible to the f3 checks near the end of the test.
2531 local fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2533 $LFS getstripe $DIR/$tdir/a1/f3
2536 cancel_lru_locks osc
2538 echo "Inject failure..."
2539 echo "To simulate f0 lost MDT-object"
2540 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2541 do_facet mds1 $LCTL set_param fail_loc=0x1616
2542 rm -f $DIR/$tdir/a1/f0
2544 echo "To simulate f1 lost MDT-object and OST-object0"
2545 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
2546 do_facet mds1 $LCTL set_param fail_loc=0x161a
2547 rm -f $DIR/$tdir/a1/f1
2549 echo "To simulate f2 lost MDT-object and OST-object1"
2550 do_facet mds1 $LCTL set_param fail_val=1
2551 rm -f $DIR/$tdir/a1/f2
2553 if [ $OSTCOUNT -gt 2 ]; then
2554 echo "To simulate f3 lost MDT-object and OST-object2"
2555 do_facet mds1 $LCTL set_param fail_val=2
2556 rm -f $DIR/$tdir/a1/f3
2559 umount_client $MOUNT
2562 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2564 echo "Inject failure to slow down the LFSCK on OST0"
2565 #define OBD_FAIL_LFSCK_DELAY5 0x161b
2566 do_facet ost1 $LCTL set_param fail_loc=0x161b
2568 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2569 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2572 do_facet ost1 $LCTL set_param fail_loc=0
2574 for k in $(seq $MDSCOUNT); do
2575 # The LFSCK status query interval is 30 seconds. For the case
2576 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2577 # time to guarantee the status sync up.
2578 wait_update_facet mds${k} "$LCTL get_param -n \
2579 mdd.$(facet_svc mds${k}).lfsck_layout |
2580 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2581 error "(2) MDS${k} is not the expected 'completed'"
2584 for k in $(seq $OSTCOUNT); do
2585 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2586 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2587 awk '/^status/ { print $2 }')
2588 [ "$cur_status" == "completed" ] ||
2589 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2592 local repaired=$(do_facet mds1 $LCTL get_param -n \
2593 mdd.$(facet_svc mds1).lfsck_layout |
2594 awk '/^repaired_orphan/ { print $2 }')
2595 if [ $OSTCOUNT -gt 2 ]; then
2596 [ $repaired -eq 9 ] ||
2597 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
2599 [ $repaired -eq 4 ] ||
2600 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
2603 mount_client $MOUNT || error "(5.0) Fail to start client!"
# Bit tested against the value printed by lfs getstripe -L in the checks below.
2605 LOV_PATTERN_F_HOLE=0x40000000
2608 # ${fid0}-R-0 is the old f0
2610 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
2611 echo "Check $name, which is the old f0"
2613 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
2615 local pattern=0x$($LFS getstripe -L $name)
2616 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2617 error "(5.2) NOT expect pattern flag hole, but got $pattern"
2619 local stripes=$($LFS getstripe -c $name)
2620 if [ $OSTCOUNT -gt 2 ]; then
2621 [ $stripes -eq 3 ] ||
2622 error "(5.3.1) expect the stripe count is 3, but got $stripes"
2624 [ $stripes -eq 2 ] ||
2625 error "(5.3.2) expect the stripe count is 2, but got $stripes"
2628 local size=$(stat $name | awk '/Size:/ { print $2 }')
2629 [ $size -eq $((4096 * $bcount)) ] ||
2630 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
2632 cat $name > /dev/null || error "(5.5) cannot read $name"
2634 echo "dummy" >> $name || error "(5.6) cannot write $name"
2636 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
2638 touch $name || error "(5.8) cannot touch $name"
2640 rm -f $name || error "(5.9) cannot unlink $name"
2643 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
2645 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2646 if [ $OSTCOUNT -gt 2 ]; then
2647 echo "Check $name, it contains the old f1's stripe1 and stripe2"
2649 echo "Check $name, it contains the old f1's stripe1"
2652 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
2654 pattern=0x$($LFS getstripe -L $name)
2655 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2656 error "(6.2) expect pattern flag hole, but got $pattern"
2658 stripes=$($LFS getstripe -c $name)
2659 if [ $OSTCOUNT -gt 2 ]; then
2660 [ $stripes -eq 3 ] ||
2661 error "(6.3.1) expect the stripe count is 3, but got $stripes"
2663 [ $stripes -eq 2 ] ||
2664 error "(6.3.2) expect the stripe count is 2, but got $stripes"
2667 size=$(stat $name | awk '/Size:/ { print $2 }')
2668 [ $size -eq $((4096 * $bcount)) ] ||
2669 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
2671 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# Count the Input/output error lines that dd reports while copying the file.
2673 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2674 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2677 [ $failures -eq 256 ] ||
2678 error "(6.6) expect 256 IO failures, but get $failures"
2680 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2681 [ $size -eq $((4096 * $bcount)) ] ||
2682 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
2684 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
2685 error "(6.8) write to the LOV EA hole should fail"
2687 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
2688 error "(6.9) write to normal stripe should NOT fail"
2690 echo "foo" >> $name && error "(6.10) append write $name should fail"
2692 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
2694 touch $name || error "(6.12) cannot touch $name"
2696 rm -f $name || error "(6.13) cannot unlink $name"
2699 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
2701 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
2702 if [ $OSTCOUNT -gt 2 ]; then
2703 echo "Check $name, it contains the old f2's stripe0 and stripe2"
2705 echo "Check $name, it contains the old f2's stripe0"
2708 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
2710 pattern=0x$($LFS getstripe -L $name)
2711 stripes=$($LFS getstripe -c $name)
2712 size=$(stat $name | awk '/Size:/ { print $2 }')
2713 if [ $OSTCOUNT -gt 2 ]; then
2714 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2715 error "(7.2.1) expect pattern flag hole, but got $pattern"
2717 [ $stripes -eq 3 ] ||
2718 error "(7.3.1) expect the stripe count is 3, but got $stripes"
2720 [ $size -eq $((4096 * $bcount)) ] ||
2721 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
2723 cat $name > /dev/null &&
2724 error "(7.5.1) normal read $name should fail"
2726 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2727 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2729 [ $failures -eq 256 ] ||
2730 error "(7.6) expect 256 IO failures, but get $failures"
2732 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2733 [ $size -eq $((4096 * $bcount)) ] ||
2734 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
2736 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
2737 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
2739 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
2740 error "(7.8.1) write to normal stripe should NOT fail"
2742 echo "foo" >> $name &&
2743 error "(7.8.3) append write $name should fail"
2745 chown $RUNAS_ID:$RUNAS_GID $name ||
2746 error "(7.9.1) cannot chown on $name"
2748 touch $name || error "(7.10.1) cannot touch $name"
2750 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2751 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
2753 [ $stripes -eq 1 ] ||
2754 error "(7.3.2) expect the stripe count is 1, but got $stripes"
2757 [ $size -eq $((4096 * (256 + 0))) ] ||
2758 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
2760 cat $name > /dev/null || error "(7.5.2) cannot read $name"
2762 echo "dummy" >> $name || error "(7.8.2) cannot write $name"
2764 chown $RUNAS_ID:$RUNAS_GID $name ||
2765 error "(7.9.2) cannot chown on $name"
2767 touch $name || error "(7.10.2) cannot touch $name"
2770 rm -f $name || error "(7.11) cannot unlink $name"
# The f3 checks below only apply when more than 2 OSTs are configured.
2772 [ $OSTCOUNT -le 2 ] && return
2775 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
2777 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2778 echo "Check $name, which contains the old f3's stripe0 and stripe1"
2780 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
2782 pattern=0x$($LFS getstripe -L $name)
2783 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2784 error "(8.2) NOT expect pattern flag hole, but got $pattern"
2786 stripes=$($LFS getstripe -c $name)
2787 # LFSCK does not know the old f3 had 3 stripes.
2788 # It only tries to find as much as possible.
2789 # The stripe count depends on the last stripe's offset.
2790 [ $stripes -eq 2 ] ||
2791 error "(8.3) expect the stripe count is 2, but got $stripes"
2793 size=$(stat $name | awk '/Size:/ { print $2 }')
2795 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
2796 error "(8.4) expect the size $((4096 * 512)), but got $size"
2798 cat $name > /dev/null || error "(8.5) cannot read $name"
2800 echo "dummy" >> $name || error "(8.6) cannot write $name"
2802 chown $RUNAS_ID:$RUNAS_GID $name ||
2803 error "(8.7) cannot chown on $name"
2805 touch $name || error "(8.8) cannot touch $name"
2807 rm -f $name || error "(8.9) cannot unlink $name"
2809 run_test 20 "Handle the orphan with dummy LOV EA slot properly"
# Test 21: skipped when the MDS version code is below 2.5.59.
# Creates 100 files, starts the LFSCK on MDT0000 without a -t option, and
# expects both the namespace and the layout component to report
# scanning-phase1. The LFSCK is then stopped on MDT0000.
2812 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
2813 skip "ignore the test if MDS is older than 2.5.59" && return
2815 check_mount_and_prep
2816 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
2818 echo "Start all LFSCK components by default (-s 1)"
# NOTE(review): -s 1 presumably limits the scan speed so that phase1 is still
# observable below - confirm against the lctl lfsck_start documentation.
2819 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
2820 error "Fail to start LFSCK"
2822 echo "namespace LFSCK should be in 'scanning-phase1' status"
2823 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
2824 [ "$STATUS" == "scanning-phase1" ] ||
2825 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
2827 echo "layout LFSCK should be in 'scanning-phase1' status"
2828 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
2829 [ "$STATUS" == "scanning-phase1" ] ||
2830 error "Expect layout 'scanning-phase1', but got '$STATUS'"
2832 echo "Stop all LFSCK components by default"
2833 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
2834 error "Fail to stop LFSCK"
2836 run_test 21 "run all LFSCK components by default"
# Test 22a: skipped when fewer than 2 MDSes are configured.
# Creates guard and foo on MDT index 1, creates foo/dummy on MDT index 0
# while fail_loc 0x161e is set on $SINGLEMDS, then removes guard. The
# namespace LFSCK (-A -r) must complete, report unmatched_pairs_repaired
# equal to 1, and ls on foo/dummy must succeed afterwards.
2839 [ $MDSCOUNT -lt 2 ] &&
2840 skip "We need at least 2 MDSes for this test" && return
2843 echo "The parent_A references the child directory via some name entry,"
2844 echo "but the child directory back references another parent_B via its"
# The inner double quotes are escaped so that they are actually printed.
2845 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
2846 echo "LFSCK will repair the child directory's \"..\" name entry."
2849 check_mount_and_prep
2851 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2852 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2854 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2855 echo "The dummy's dotdot name entry references the guard."
2856 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2857 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2858 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2859 error "(3) Fail to mkdir on MDT0"
2860 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2862 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
2864 echo "Trigger namespace LFSCK to repair unmatched pairs"
2865 $START_NAMESPACE -A -r ||
2866 error "(5) Fail to start LFSCK for namespace"
2868 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2869 mdd.${MDT_DEV}.lfsck_namespace |
2870 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2872 error "(6) unexpected status"
2875 local repaired=$($SHOW_NAMESPACE |
2876 awk '/^unmatched_pairs_repaired/ { print $2 }')
2877 [ $repaired -eq 1 ] ||
2878 error "(7) Fail to repair unmatched pairs: $repaired"
2880 echo "'ls' should success after namespace LFSCK repairing"
2881 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
2882 error "(8) ls should success."
2884 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Test 22b: skipped when fewer than 2 MDSes are configured.
# Creates guard and foo on MDT index 1 and foo/dummy on MDT index 0 while
# fail_loc 0x161e is set on $SINGLEMDS. Before the scan, fid2path on the
# FID of dummy must not return the foo/dummy path. After the namespace
# LFSCK (-A -r) completes with unmatched_pairs_repaired equal to 1, fid2path
# must return that path.
2887 [ $MDSCOUNT -lt 2 ] &&
2888 skip "We need at least 2 MDSes for this test" && return
2891 echo "The parent_A references the child directory via the name entry_B,"
2892 echo "but the child directory back references another parent_C via its"
# The inner double quotes are escaped so that they are actually printed.
2893 echo "\"..\" name entry. The parent_C exists, but there is no the name"
2894 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
2895 echo "the child directory's \"..\" name entry and its linkEA."
2898 check_mount_and_prep
2900 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2901 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2903 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2904 echo "and bad linkEA. The dummy's dotdot name entry references the"
2905 echo "guard. The dummy's linkEA references n non-exist name entry."
2906 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2907 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2908 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2909 error "(3) Fail to mkdir on MDT0"
2910 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Keep the FID so the same object can be resolved again after the repair.
2912 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
2913 echo "fid2path should NOT work on the dummy's FID $dummyfid"
2914 local dummyname=$($LFS fid2path $DIR $dummyfid)
2915 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
2916 error "(4) fid2path works unexpectedly."
2918 echo "Trigger namespace LFSCK to repair unmatched pairs"
2919 $START_NAMESPACE -A -r ||
2920 error "(5) Fail to start LFSCK for namespace"
2922 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2923 mdd.${MDT_DEV}.lfsck_namespace |
2924 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2926 error "(6) unexpected status"
2929 local repaired=$($SHOW_NAMESPACE |
2930 awk '/^unmatched_pairs_repaired/ { print $2 }')
2931 [ $repaired -eq 1 ] ||
2932 error "(7) Fail to repair unmatched pairs: $repaired"
2934 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
2935 local dummyname=$($LFS fid2path $DIR $dummyfid)
2936 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
2937 error "(8) fid2path does not work"
2939 run_test 22b "LFSCK can repair unmatched pairs (2)"
# test_23a: dangling name entry whose MDT-object is gone (remote directory).
#
# d0 is created with "lfs mkdir -i 0" and d0/d1 with "lfs mkdir -i 1"; d1 is
# then removed while fail_loc=0x1620 (OBD_FAIL_LFSCK_DANGLING2) is set on
# mds2.  Two LFSCK passes follow: "-A -r" must report dangling_repaired == 1
# while 'ls' of d1 still fails, and "-A -r -C" must report
# dangling_repaired == 1 with 'ls' succeeding afterwards.
# Skipped when MDSCOUNT < 2.
# NOTE(review): assumes run_test 23a invokes this function by the name
# test_23a -- confirm against test-framework.sh.
test_23a() {
	if [ $MDSCOUNT -lt 2 ]; then
		skip "We need at least 2 MDSes for this test" && return
	fi

	printf '%s\n' \
		"The name entry is there, but the MDT-object for such name " \
		"entry does not exist. The namespace LFSCK should find out " \
		"and repair the inconsistency as required."

	check_mount_and_prep

	if ! $LFS mkdir -i 0 $DIR/$tdir/d0; then
		error "(1) Fail to mkdir d0 on MDT0"
	fi
	if ! $LFS mkdir -i 1 $DIR/$tdir/d0/d1; then
		error "(2) Fail to mkdir d1 on MDT1"
	fi

	echo "Inject failure stub on MDT1 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING2	0x1620
	do_facet mds2 $LCTL set_param fail_loc=0x1620
	if ! rmdir $DIR/$tdir/d0/d1; then
		error "(3) Fail to rmdir d1"
	fi
	do_facet mds2 $LCTL set_param fail_loc=0

	echo "'ls' should fail because of dangling name entry"
	if ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1; then
		error "(4) ls should fail."
	fi

	# First pass: detection only, no -C.
	echo "Trigger namespace LFSCK to find out dangling name entry"
	if ! $START_NAMESPACE -A -r; then
		error "(5) Fail to start LFSCK for namespace"
	fi

	if ! wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32; then
		error "(6) unexpected status"
	fi

	local nfixed

	nfixed=$($SHOW_NAMESPACE |
		 awk '/^dangling_repaired/ { print $2 }')
	if ! [ $nfixed -eq 1 ]; then
		error "(7) Fail to repair dangling name entry: $nfixed"
	fi

	echo "'ls' should fail because not re-create MDT-object by default"
	if ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1; then
		error "(8) ls should fail."
	fi

	# Second pass: same scan with -C added.
	echo "Trigger namespace LFSCK again to repair dangling name entry"
	if ! $START_NAMESPACE -A -r -C; then
		error "(9) Fail to start LFSCK for namespace"
	fi

	if ! wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32; then
		error "(10) unexpected status"
	fi

	nfixed=$($SHOW_NAMESPACE |
		 awk '/^dangling_repaired/ { print $2 }')
	if ! [ $nfixed -eq 1 ]; then
		error "(11) Fail to repair dangling name entry: $nfixed"
	fi

	echo "'ls' should success after namespace LFSCK repairing"
	if ! ls -ail $DIR/$tdir/d0/d1 > /dev/null; then
		error "(12) ls should success."
	fi
}
3003 run_test 23a "LFSCK can repair dangling name entry (1)"
# test_23b: dangling hard-link name entry that LFSCK first re-creates and
# then replaces with the real object.
#
# f0 ("dummy") and f1 ("dead") are written under d0; the hard link foo -> f0
# is made while fail_loc=0x1621 (OBD_FAIL_LFSCK_DANGLING3) is set on
# $SINGLEMDS, then f1 is unlinked.  After "$START_NAMESPACE -r -C" the test
# expects dangling_repaired == 1, multiple_linked_repaired == 1 and foo to
# read back as "dummy".
# NOTE(review): assumes run_test 23b invokes this function by the name
# test_23b -- confirm against test-framework.sh.
test_23b() {
	printf '%s\n' \
		"The objectA has multiple hard links, one of them corresponding" \
		"to the name entry_B. But there is something wrong for the name" \
		"entry_B and cause entry_B to references non-exist object_C." \
		"In the first-stage scanning, the LFSCK will think the entry_B" \
		"as dangling, and re-create the lost object_C. When the LFSCK" \
		"comes to the second-stage scanning, it will find that the" \
		"former re-creating object_C is not proper, and will try to" \
		"replace the object_C with the real object_A."

	check_mount_and_prep

	if ! $LFS mkdir -i 0 $DIR/$tdir/d0; then
		error "(1) Fail to mkdir d0 on MDT0"
	fi
	if ! echo "dummy" > $DIR/$tdir/d0/f0; then
		error "(2) Fail to touch on MDT0"
	fi
	if ! echo "dead" > $DIR/$tdir/d0/f1; then
		error "(3) Fail to touch on MDT0"
	fi

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
	if ! ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo; then
		error "(4) Fail to hard link"
	fi
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	if ! rm -f $DIR/$tdir/d0/f1; then
		error "(5) Fail to unlink $DIR/$tdir/d0/f1"
	fi

	echo "'ls' should fail because of dangling name entry"
	if ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1; then
		error "(6) ls should fail."
	fi

	echo "Trigger namespace LFSCK to find out dangling name entry"
	if ! $START_NAMESPACE -r -C; then
		error "(7) Fail to start LFSCK for namespace"
	fi

	if ! wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32; then
		error "(8) unexpected status"
	fi

	# Two counters are checked in turn with the same scratch variable.
	local nfixed

	nfixed=$($SHOW_NAMESPACE |
		 awk '/^dangling_repaired/ { print $2 }')
	if ! [ $nfixed -eq 1 ]; then
		error "(9) Fail to repair dangling name entry: $nfixed"
	fi

	nfixed=$($SHOW_NAMESPACE |
		 awk '/^multiple_linked_repaired/ { print $2 }')
	if ! [ $nfixed -eq 1 ]; then
		error "(10) Fail to drop the former created object: $nfixed"
	fi

	local content

	content=$(cat $DIR/$tdir/d0/foo)
	if ! [ "$content" == "dummy" ]; then
		error "(11) The $DIR/$tdir/d0/foo is not recovered: $content"
	fi
}
3060 run_test 23b "LFSCK can repair dangling name entry (2)"
# test_23c: like test_23b, but the re-created object is modified before the
# second-stage scan, so LFSCK must NOT replace it.
#
# Setup matches test_23b (f0 "dummy", f1 "dead", hard link foo made under
# fail_loc=0x1621, f1 unlinked).  LFSCK is started with "-r -C" while
# fail_loc=0x1602 / fail_val=10 (OBD_FAIL_LFSCK_DELAY3) is set.  Once foo's
# size is reported as 0 the test appends "data" to it, clears the fail_loc,
# waits for "completed", and expects dangling_repaired == 1 with foo's
# content different from "dummy".
# NOTE(review): assumes run_test 23c invokes this function by the name
# test_23c -- confirm against test-framework.sh.
test_23c() {
	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. And then others"
	echo "modified the re-created object_C. When the LFSCK comes to the"
	echo "second-stage scanning, it will find that the former re-creating"
	echo "object_C maybe wrong and try to replace the object_C with the"
	echo "real object_A. But because object_C has been modified, so the"
	echo "LFSCK cannot replace it."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
	echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
	echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
	ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"

	echo "'ls' should fail because of dangling name entry"
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(6) ls should fail."

	#define OBD_FAIL_LFSCK_DELAY3		0x1602
	do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -r -C ||
		error "(7) Fail to start LFSCK for namespace"

	wait_update_facet client "stat $DIR/$tdir/d0/foo |
		awk '/Size/ { print \\\$2 }'" "0" 32 || {
		# Dump the file that was being polled; this test never creates
		# a "guard" entry, so stat'ing that path told us nothing.
		stat $DIR/$tdir/d0/foo
		error "(8) unexpected size"
	}

	echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
	cancel_lru_locks osc

	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(10) unexpected status"
	}

	local repaired

	repaired=$($SHOW_NAMESPACE |
		   awk '/^dangling_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(11) Fail to repair dangling name entry: $repaired"

	local data

	data=$(cat $DIR/$tdir/d0/foo)
	[ "$data" != "dummy" ] ||
		error "(12) The $DIR/$tdir/d0/foo should not be recovered"
}
3128 run_test 23c "LFSCK can repair dangling name entry (3)"
# test_24: two MDT-objects back-reference the same name entry; LFSCK drops
# the unrecognised linkEA entry and parks the object under lost+found.
#
# d0 is created with "lfs mkdir -i 1", guard/ and dummy/ inside it with plain
# mkdir, and guard/foo with touch.  dummy/foo is created with
# "lfs mkdir -i 0" and removed again while fail_loc=0x1622
# (OBD_FAIL_LFSCK_MUL_REF) is set on $SINGLEMDS.  After
# "$START_NAMESPACE -A -r" the test expects
# multiple_referenced_repaired == 1 and an entry named
# "<cfid>-<pfid>-D-0" under $MOUNT/.lustre/lost+found/MDT0000/.
# pfid is guard's FID when the MDS backend is not ldiskfs, otherwise dummy's.
# Skipped when MDSCOUNT < 2.
# NOTE(review): the else/fi around the two pfid assignments is reconstructed
# from the visible if-line -- confirm against the full file.
# NOTE(review): assumes run_test 24 invokes this function by the name
# test_24 -- confirm against test-framework.sh.
test_24() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "We need at least 2 MDSes for this test" && return

	echo "Two MDT-objects back reference the same name entry via their"
	echo "each own linkEA entry, but the name entry only references one"
	echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
	echo "for the MDT-object that is not recognized. If such MDT-object"
	echo "has no other linkEA entry after the removing, then the LFSCK"
	echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"

	mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
	$LFS path2fid $DIR/$tdir/d0/guard

	mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
	$LFS path2fid $DIR/$tdir/d0/dummy

	# Keep pfid function-local so it cannot leak into later tests.
	local pfid
	if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
		pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
	else
		pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
	fi

	touch $DIR/$tdir/d0/guard/foo ||
		error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
	echo "that references $DIR/$tdir/d0/guard/foo."
	echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
	echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
	echo "there with the same linkEA entry as another MDT-object"
	echo "$DIR/$tdir/d0/guard/foo has"

	#define OBD_FAIL_LFSCK_MUL_REF		0x1622
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
	$LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
		error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
	$LFS path2fid $DIR/$tdir/d0/dummy/foo
	local cfid

	cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
	rmdir $DIR/$tdir/d0/dummy/foo ||
		error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "stat $DIR/$tdir/d0/dummy/foo should fail"
	stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
		error "(6) stat successfully unexpectedly"

	echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
	$START_NAMESPACE -A -r ||
		error "(7) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(8) unexpected status"
	}

	local repaired

	repaired=$($SHOW_NAMESPACE |
		   awk '/^multiple_referenced_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
	error "(9) Fail to repair multiple referenced name entry: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	local cname="$cfid-$pfid-D-0"
	ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
		error "(11) .lustre/lost+found/MDT0000/ should not be empty"
}
3209 run_test 24 "LFSCK can repair multiple-referenced name entry"
# test_25: name entry carries the wrong file type; LFSCK rewrites it.
#
# d0 is created with "lfs mkdir -i 0" and d0/foo is touched while
# fail_loc=0x1623 (OBD_FAIL_LFSCK_BAD_TYPE) is set on $SINGLEMDS.  After
# "$START_NAMESPACE -r" the test expects bad_file_type_repaired == 1 and a
# successful 'ls' of d0.
# Skipped unless facet_fstype reports ldiskfs for $SINGLEMDS.
# NOTE(review): assumes run_test 25 invokes this function by the name
# test_25 -- confirm against test-framework.sh.
test_25() {
	if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
		skip "Only support to inject failure on ldiskfs" && return
	fi

	printf '%s\n' \
		"The file type in the name entry does not match the file type" \
		"claimed by the referenced object. Then the LFSCK will update" \
		"the file type in the name entry."

	check_mount_and_prep

	if ! $LFS mkdir -i 0 $DIR/$tdir/d0; then
		error "(1) Fail to mkdir d0"
	fi

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that" \
		"the file type stored in the name entry is wrong."

	#define OBD_FAIL_LFSCK_BAD_TYPE		0x1623
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
	if ! touch $DIR/$tdir/d0/foo; then
		error "(2) Fail to touch $DIR/$tdir/d0/foo"
	fi
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to repair bad file type in the name entry"
	if ! $START_NAMESPACE -r; then
		error "(3) Fail to start LFSCK for namespace"
	fi

	if ! wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32; then
		error "(4) unexpected status"
	fi

	local nfixed

	nfixed=$($SHOW_NAMESPACE |
		 awk '/^bad_file_type_repaired/ { print $2 }')
	if ! [ $nfixed -eq 1 ]; then
		error "(5) Fail to repair bad file type in name entry: $nfixed"
	fi

	if ! ls -ail $DIR/$tdir/d0; then
		error "(6) Fail to 'ls' the $DIR/$tdir/d0"
	fi
}
3250 run_test 25 "LFSCK can repair bad file type in the name entry"
# test_26a: a local name entry is lost while its MDT-object and linkEA
# survive; LFSCK must put the name entry back.
#
# d0/foo is created and hard-linked as d0/dummy, then foo is unlinked while
# fail_loc=0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set on $SINGLEMDS.  'ls'
# of foo must fail at that point.  After "$START_NAMESPACE -r -A" the test
# expects lost_dirent_repaired == 1, a working 'ls', and path2fid returning
# the same FID as before the unlink.
# NOTE(review): assumes run_test 26a invokes this function by the name
# test_26a -- confirm against test-framework.sh.
test_26a() {
	echo "The local name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing local name entry back"
	echo "to the normal namespace."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	local foofid

	foofid=$($LFS path2fid $DIR/$tdir/d0/foo)

	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Without the "error" call the message string itself was executed as
	# a command, so an unexpectedly successful 'ls' was never reported.
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(5) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(7) unexpected status"
	}

	local repaired

	repaired=$($SHOW_NAMESPACE |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $DIR/$tdir/d0/foo ||
		error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"

	local foofid2

	foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
	[ "$foofid" == "$foofid2" ] ||
		error "(10) foo's FID changed: $foofid, $foofid2"
}
3302 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# test_26b: a remote name entry is lost while its MDT-object and linkEA
# survive; LFSCK must put the name entry back.
#
# d0 is created with "lfs mkdir -i 1" and d0/foo with "lfs mkdir -i 0"; foo
# is removed with rmdir while fail_loc=0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY)
# is set on $SINGLEMDS.  'ls' of foo must fail at that point.  After
# "$START_NAMESPACE -r -A" the test expects lost_dirent_repaired == 1, a
# working 'ls', and path2fid returning the same FID as before.
# Skipped when MDSCOUNT < 2.
# NOTE(review): assumes run_test 26b invokes this function by the name
# test_26b -- confirm against test-framework.sh.
test_26b() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "We need at least 2 MDSes for this test" && return

	echo "The remote name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing remote name entry back"
	echo "to the normal namespace."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
	local foofid

	foofid=$($LFS path2fid $DIR/$tdir/d0/foo)

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Without the "error" call the message string itself was executed as
	# a command, so an unexpectedly successful 'ls' was never reported.
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(4) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(6) unexpected status"
	}

	local repaired

	repaired=$($SHOW_NAMESPACE |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair lost dirent: $repaired"

	ls -ail $DIR/$tdir/d0/foo ||
		error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"

	local foofid2

	foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
	[ "$foofid" == "$foofid2" ] ||
		error "(9) foo's FID changed: $foofid, $foofid2"
}
3354 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# test_27a: the local parent named in an object's linkEA is gone; LFSCK
# re-creates the parent as an orphan.
#
# d0/foo is created and hard-linked as d0/dummy.  Both names are unlinked
# while fail_loc=0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set on $SINGLEMDS,
# then d0 itself is removed and 'ls' of d0 must fail.  After
# "$START_NAMESPACE -r -A" the test expects lost_dirent_repaired == 1 and at
# least one entry matching *-P-* under
# $MOUNT/.lustre/lost+found/MDT0000/.
# NOTE(review): assumes run_test 27a invokes this function by the name
# test_27a -- confirm against test-framework.sh.
test_27a() {
	echo "The local parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "another hard link and the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo ||
		error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	rm -f $DIR/$tdir/d0/dummy ||
		error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
	# Without the "error" call the message string itself was executed as
	# a command, so an unexpectedly successful 'ls' was never reported.
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 &&
		error "(6) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the lost parent"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(7) unexpected status"
	}

	local repaired

	repaired=$($SHOW_NAMESPACE |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	# The pattern is quoted so find receives it verbatim; unquoted, the
	# shell would expand it first if the current directory happened to
	# hold a matching name.
	local cname

	cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
	[ -n "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0000/ should not be empty"
}
3411 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# test_27b: the remote parent named in an object's linkEA is gone; LFSCK
# re-creates the parent as an orphan.
#
# d0 is created with "lfs mkdir -i 1" and d0/foo with "lfs mkdir -i 0".  foo
# is removed with rmdir while fail_loc=0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY)
# is set on $SINGLEMDS, then d0 is removed and 'ls' of d0 must fail.  After
# "$START_NAMESPACE -r -A" the test expects lost_dirent_repaired == 1 and at
# least one entry matching *-P-* under
# $MOUNT/.lustre/lost+found/MDT0001/.
# Skipped when MDSCOUNT < 2.
# NOTE(review): assumes run_test 27b invokes this function by the name
# test_27b -- confirm against test-framework.sh.
test_27b() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "We need at least 2 MDSes for this test" && return

	echo "The remote parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"

	$LFS path2fid $DIR/$tdir/d0

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
	# Without the "error" call the message string itself was executed as
	# a command, so an unexpectedly successful 'ls' was never reported.
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 &&
		error "(5) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(7) unexpected status"
	}

	local repaired

	repaired=$($SHOW_NAMESPACE |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "There should be an orphan under .lustre/lost+found/MDT0001/"
	[ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0001/

	# The pattern is quoted so find receives it verbatim; unquoted, the
	# shell would expand it first if the current directory happened to
	# hold a matching name.
	local cname

	cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
	[ -n "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0001/ should not be empty"
}
3470 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# test_28: orphan handling is skipped on an MDT that failed to answer name
# entry verification during the first-stage scan.
#
# d1 (index 0) holds a1/a2 on index 1; d2 (index 1) holds a1/a2 on index 0.
# d1/a1 and d2/a2 are removed with fail_loc=0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) set on mds1 and mds2.  The first
# "$START_NAMESPACE -r -A" runs with fail_loc=0x161c
# (OBD_FAIL_LFSCK_BAD_NETWORK) on mds2 and must end "partial" on mds1 and
# "completed" on mds2, with lost_dirent_repaired 0 on mds1 and 1 on mds2.
# The second run, with the fail_loc cleared, must end "completed" on every
# MDS with lost_dirent_repaired 1 on mds1 and 0 on mds2.
# Skipped when MDSCOUNT < 2.
# NOTE(review): the last line of the intro text stops at "should not" in the
# visible source; it is reproduced as seen -- check the full file.
# NOTE(review): assumes run_test 28 invokes this function by the name
# test_28 -- confirm against test-framework.sh.
test_28() {
	if [ $MDSCOUNT -lt 2 ]; then
		skip "The test needs at least 2 MDTs" && return
	fi

	printf '%s\n' \
		"The target name entry is lost. The LFSCK should insert the" \
		"orphan MDT-object under .lustre/lost+found/MDTxxxx. But if" \
		"the MDT (on which the orphan MDT-object resides) has ever" \
		"failed to respond some name entry verification during the" \
		"first stage-scanning, then the LFSCK should skip to handle" \
		"orphan MDT-object on this MDT. But other MDTs should not"

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/d1
	$LFS mkdir -i 1 $DIR/$tdir/d1/a1
	$LFS mkdir -i 1 $DIR/$tdir/d1/a2

	$LFS mkdir -i 1 $DIR/$tdir/d2
	$LFS mkdir -i 0 $DIR/$tdir/d2/a1
	$LFS mkdir -i 0 $DIR/$tdir/d2/a2

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that" \
		"d1/a1's name entry will be removed, but the d1/a1's object" \
		"and its linkEA are kept in the system. And the case that" \
		"d2/a2's name entry will be removed, but the d2/a2's object" \
		"and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet mds1 $LCTL set_param fail_loc=0x1624
	do_facet mds2 $LCTL set_param fail_loc=0x1624
	if ! rmdir $DIR/$tdir/d1/a1; then
		error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
	fi
	if ! rmdir $DIR/$tdir/d2/a2; then
		error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
	fi
	do_facet mds1 $LCTL set_param fail_loc=0
	do_facet mds2 $LCTL set_param fail_loc=0

	cancel_lru_locks mdc
	cancel_lru_locks osc

	printf '%s\n' \
		"Inject failure, to simulate the MDT0 fail to handle" \
		"MDT1 LFSCK request during the first-stage scanning."
	#define OBD_FAIL_LFSCK_BAD_NETWORK	0x161c
	do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0

	echo "Trigger namespace LFSCK on all devices to find out orphan object"
	if ! $START_NAMESPACE -r -A; then
		error "(3) Fail to start LFSCK for namespace"
	fi

	if ! wait_update_facet mds1 "$LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "partial" 32; then
		error "(4) mds1 is not the expected 'partial'"
	fi

	if ! wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32; then
		error "(5) mds2 is not the expected 'completed'"
	fi

	do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0

	# Per-MDS lost_dirent_repaired counters after the first run.
	local nfixed

	nfixed=$(do_facet mds1 $LCTL get_param -n \
		 mdd.$(facet_svc mds1).lfsck_namespace |
		 awk '/^lost_dirent_repaired/ { print $2 }')
	if ! [ $nfixed -eq 0 ]; then
		error "(6) Expect 0 fixed on mds1, but got: $nfixed"
	fi

	nfixed=$(do_facet mds2 $LCTL get_param -n \
		 mdd.$(facet_svc mds2).lfsck_namespace |
		 awk '/^lost_dirent_repaired/ { print $2 }')
	if ! [ $nfixed -eq 1 ]; then
		error "(7) Expect 1 fixed on mds2, but got: $nfixed"
	fi

	echo "Trigger namespace LFSCK on all devices again to cleanup"
	if ! $START_NAMESPACE -r -A; then
		error "(8) Fail to start LFSCK for namespace"
	fi

	for k in $(seq $MDSCOUNT); do
		# The original comment gives the LFSCK status query period
		# as 30 seconds; the 32 passed to wait_update_facet leaves
		# room for the status to sync up if some LFSCK_NOTIFY RPCs
		# failed or were lost.
		if ! wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_namespace |
			awk '/^status/ { print \\\$2 }'" "completed" 32; then
			error "(9) MDS${k} is not the expected 'completed'"
		fi
	done

	# Per-MDS lost_dirent_repaired counters after the second run.
	nfixed=$(do_facet mds1 $LCTL get_param -n \
		 mdd.$(facet_svc mds1).lfsck_namespace |
		 awk '/^lost_dirent_repaired/ { print $2 }')
	if ! [ $nfixed -eq 1 ]; then
		error "(10) Expect 1 fixed on mds1, but got: $nfixed"
	fi

	nfixed=$(do_facet mds2 $LCTL get_param -n \
		 mdd.$(facet_svc mds2).lfsck_namespace |
		 awk '/^lost_dirent_repaired/ { print $2 }')
	if ! [ $nfixed -eq 0 ]; then
		error "(11) Expect 0 fixed on mds2, but got: $nfixed"
	fi
}
3573 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# test_29a: nlink is larger than the number of known name entries; LFSCK
# lowers it.
#
# d0/foo is created, then hard-linked as d0/h1 while fail_loc=0x1625
# (OBD_FAIL_LFSCK_MORE_NLINK) is set on $SINGLEMDS.  The test requires
# "stat --format=%h" to report 3 before LFSCK; after
# "$START_NAMESPACE -r -A" it expects nlinks_repaired == 1 and a link count
# of 2.
# NOTE(review): assumes run_test 29a invokes this function by the name
# test_29a -- confirm against test-framework.sh.
test_29a() {
	printf '%s\n' \
		"The object's nlink attribute is larger than the object's known" \
		"name entries count. The LFSCK will repair the object's nlink" \
		"attribute to match the known name entries count"

	check_mount_and_prep

	if ! $LFS mkdir -i 0 $DIR/$tdir/d0; then
		error "(1) Fail to mkdir d0"
	fi
	if ! touch $DIR/$tdir/d0/foo; then
		error "(2) Fail to create foo"
	fi

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that foo's" \
		"nlink attribute is larger than its name entries count."

	#define OBD_FAIL_LFSCK_MORE_NLINK	0x1625
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
	if ! ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1; then
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	fi
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	cancel_lru_locks mdc
	local nlinks

	nlinks=$(stat --format=%h $DIR/$tdir/d0/foo)
	if ! [ $nlinks -eq 3 ]; then
		error "(4) Cannot inject error: $nlinks"
	fi

	echo "Trigger namespace LFSCK to repair the nlink count"
	if ! $START_NAMESPACE -r -A; then
		error "(5) Fail to start LFSCK for namespace"
	fi

	if ! wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32; then
		error "(6) unexpected status"
	fi

	local nfixed

	nfixed=$($SHOW_NAMESPACE |
		 awk '/^nlinks_repaired/ { print $2 }')
	if ! [ $nfixed -eq 1 ]; then
		error "(7) Fail to repair nlink count: $nfixed"
	fi

	cancel_lru_locks mdc
	nlinks=$(stat --format=%h $DIR/$tdir/d0/foo)
	if ! [ $nlinks -eq 2 ]; then
		error "(8) Fail to repair nlink count: $nlinks"
	fi
}
3620 run_test 29a "LFSCK can repair bad nlink count (1)"
# test_29b: nlink is smaller than the number of known name entries; LFSCK
# raises it.
#
# d0/foo is created, then hard-linked as d0/h1 while fail_loc=0x1626
# (OBD_FAIL_LFSCK_LESS_NLINK) is set on $SINGLEMDS.  The test requires
# "stat --format=%h" to report 1 before LFSCK; after
# "$START_NAMESPACE -r -A" it expects nlinks_repaired == 1 and a link count
# of 2.
# NOTE(review): assumes run_test 29b invokes this function by the name
# test_29b -- confirm against test-framework.sh.
test_29b() {
	printf '%s\n' \
		"The object's nlink attribute is smaller than the object's known" \
		"name entries count. The LFSCK will repair the object's nlink" \
		"attribute to match the known name entries count"

	check_mount_and_prep

	if ! $LFS mkdir -i 0 $DIR/$tdir/d0; then
		error "(1) Fail to mkdir d0"
	fi
	if ! touch $DIR/$tdir/d0/foo; then
		error "(2) Fail to create foo"
	fi

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that foo's" \
		"nlink attribute is smaller than its name entries count."

	#define OBD_FAIL_LFSCK_LESS_NLINK	0x1626
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
	if ! ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1; then
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	fi
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	cancel_lru_locks mdc
	local nlinks

	nlinks=$(stat --format=%h $DIR/$tdir/d0/foo)
	if ! [ $nlinks -eq 1 ]; then
		error "(4) Cannot inject error: $nlinks"
	fi

	echo "Trigger namespace LFSCK to repair the nlink count"
	if ! $START_NAMESPACE -r -A; then
		error "(5) Fail to start LFSCK for namespace"
	fi

	if ! wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32; then
		error "(6) unexpected status"
	fi

	local nfixed

	nfixed=$($SHOW_NAMESPACE |
		 awk '/^nlinks_repaired/ { print $2 }')
	if ! [ $nfixed -eq 1 ]; then
		error "(7) Fail to repair nlink count: $nfixed"
	fi

	cancel_lru_locks mdc
	nlinks=$(stat --format=%h $DIR/$tdir/d0/foo)
	if ! [ $nlinks -eq 2 ]; then
		error "(8) Fail to repair nlink count: $nlinks"
	fi
}
3667 run_test 29b "LFSCK can repair bad nlink count (2)"
# test_29c: when hard links exceed what the linkEA can record, LFSCK must
# leave the nlink attribute alone.
#
# d0/foo gets hard link h1 normally and h2 while fail_loc=0x1627
# (OBD_FAIL_LFSCK_LINKEA_OVERFLOW) is set on $SINGLEMDS.  The test requires
# a link count of 3 and exactly 2 lines from "lfs fid2path" before LFSCK.
# The fail_loc stays set until "$START_NAMESPACE -r -A" has completed;
# afterwards nlinks_repaired must be 0 and both counts unchanged.
# NOTE(review): assumes run_test 29c invokes this function by the name
# test_29c -- confirm against test-framework.sh.
test_29c() {
	echo "There are too many hard links to the object, and exceeds the"
	echo "object's linkEA limitation, as to NOT all the known name entries"
	echo "will be recorded in the linkEA. Under such case, LFSCK should"
	echo "skip the nlink verification for this object."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's hard links exceed the object's linkEA limitation."

	#define OBD_FAIL_LFSCK_LINKEA_OVERFLOW	0x1627
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1627
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h2 ||
		error "(4) Fail to hard link to $DIR/$tdir/d0/foo"

	cancel_lru_locks mdc

	local count1

	count1=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ "$count1" -eq 3 ] || error "(5) Stat failure: $count1"

	local foofid

	foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
	$LFS fid2path $DIR $foofid
	local count2

	count2=$($LFS fid2path $DIR $foofid | wc -l)
	# Without the "error" call the message string itself was executed as
	# a command, so a failed injection went unreported.
	[ "$count2" -eq 2 ] || error "(6) Fail to inject error: $count2"

	echo "Trigger namespace LFSCK to repair the nlink count"
	$START_NAMESPACE -r -A ||
		error "(7) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(8) unexpected status"
	}

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0
	local repaired

	repaired=$($SHOW_NAMESPACE |
		   awk '/^nlinks_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(9) Repair nlink count unexpcetedly: $repaired"

	cancel_lru_locks mdc

	count1=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ "$count1" -eq 3 ] || error "(10) Stat failure: $count1"

	count2=$($LFS fid2path $DIR $foofid | wc -l)
	[ "$count2" -eq 2 ] ||
		error "(11) Repaired something unexpectedly: $count2"
}
3728 run_test 29c "Not verify nlink attr if hark links exceed linkEA limitation"
3731 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3732 skip "Only support backend /lost+found for ldiskfs" && return
3735 echo "The namespace LFSCK will move the orphans from backend"
3736 echo "/lost+found directory to normal client visible namespace"
3737 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
3740 check_mount_and_prep
3742 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
3743 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f1"
3745 echo "Inject failure stub on MDT0 to simulate the case that"
3746 echo "directory d0 has no linkEA entry, then the LFSCK will"
3747 echo "move it into .lustre/lost+found/MDTxxxx/ later."
3749 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
3750 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
3751 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
3752 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3754 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
3755 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
3757 echo "Inject failure stub on MDT0 to simulate the case that the"
3758 echo "object's name entry will be removed, but not destroy the"
3759 echo "object. Then backend e2fsck will handle it as orphan and"
3760 echo "add them into the backend /lost+found directory."
3762 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3763 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3764 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
3765 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
3766 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
3767 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
3768 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3770 umount_client $MOUNT || error "(10) Fail to stop client!"
3772 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
3775 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
3776 error "(12) Fail to run e2fsck"
3778 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
3779 error "(13) Fail to start MDT0"
3781 echo "Trigger namespace LFSCK to recover backend orphans"
3782 $START_NAMESPACE -r -A ||
3783 error "(14) Fail to start LFSCK for namespace"
3785 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3786 mdd.${MDT_DEV}.lfsck_namespace |
3787 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3789 error "(15) unexpected status"
3792 local repaired=$($SHOW_NAMESPACE |
3793 awk '/^local_lost_found_moved/ { print $2 }')
3794 [ $repaired -ge 4 ] ||
3795 error "(16) Fail to recover backend orphans: $repaired"
3797 mount_client $MOUNT || error "(17) Fail to start client!"
3799 stat $DIR/$tdir/foo/f0 || "(18) f0 is not recovered"
3801 ls -ail $MOUNT/.lustre/lost+found/
3803 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
3804 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3805 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3807 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# Quote the pattern so that find receives it verbatim; unquoted, the shell
# could expand it against files in the current working directory first.
cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-*-D-*")
3810 [ ! -z "$cname" ] || error "(20) d0 is not recovered"
3812 stat ${cname}/d1 || error "(21) d0 is not recovered"
3813 stat ${cname}/f1 || error "(22) f1 is not recovered"
3815 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test_31a - LFSCK repairs name entries stored under the wrong shard (1).
#
# Creates a striped directory over $MDSCOUNT MDTs, populates it while the
# client-side fail_loc OBD_FAIL_LFSCK_BAD_NAME_HASH (fail_val=0) is set,
# starts a namespace LFSCK and then verifies that:
#   - the LFSCK on $SINGLEMDS reaches the "completed" status,
#   - name_hash_repaired is at least 1,
#   - after a client remount every entry can be stat'ed and removed.
#
# Globals read: MDSCOUNT DIR tdir MOUNT LFS LCTL SINGLEMDS MDT_DEV
#               START_NAMESPACE SHOW_NAMESPACE
# Skipped when fewer than 2 MDTs are configured.
test_31a() {
	# Explicit if/return: the skip must not depend on skip's exit status.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-first"
	echo "shard, but inserted into the first shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=0
	createmany -d "$DIR/$tdir/striped_dir/d" $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the namespace LFSCK state to help diagnose the failure.
		$SHOW_NAMESPACE
		error "(4) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^name_hash_repaired/ { print $2 }')
	# Default to 0 so that a missing counter is reported as a test
	# failure instead of a "[: unary operator expected" shell error.
	[ "${repaired:-0}" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	local i
	for ((i = 0; i < MDSCOUNT; i++)); do
		stat "$DIR/$tdir/striped_dir/d$i" ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir "$DIR/$tdir/striped_dir/d$i" ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test_31b - LFSCK repairs name entries stored under the wrong shard (2).
#
# Same scenario as the fail_val=0 variant, but the client-side fail_loc
# OBD_FAIL_LFSCK_BAD_NAME_HASH is armed with fail_val=1, and both the LFSCK
# status and the name_hash_repaired counter are read from facet mds2.
#
# Globals read: MDSCOUNT DIR tdir MOUNT LFS LCTL START_NAMESPACE
# Skipped when fewer than 2 MDTs are configured.
test_31b() {
	# Explicit if/return: the skip must not depend on skip's exit status.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-second"
	echo "shard, but inserted into the secod shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=1
	createmany -d "$DIR/$tdir/striped_dir/d" $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(4) unexpected status"

	local repaired
	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^name_hash_repaired/ { print $2 }')
	# Default to 0 so that a missing counter is reported as a test
	# failure instead of a "[: unary operator expected" shell error.
	[ "${repaired:-0}" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	local i
	for ((i = 0; i < MDSCOUNT; i++)); do
		stat "$DIR/$tdir/striped_dir/d$i" ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir "$DIR/$tdir/striped_dir/d$i" ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test_31c - LFSCK re-generates a lost master LMV EA.
#
# Creates a striped directory while OBD_FAIL_LFSCK_LOST_MASTER_LMV is armed
# on $SINGLEMDS, leaves the directory untouched, runs a namespace LFSCK and
# verifies that striped_dirs_repaired is exactly 1 and that, after a client
# remount, the directory lists as empty and can be removed.
#
# Globals read: MDSCOUNT DIR tdir MOUNT LFS LCTL SINGLEMDS MDT_DEV
#               START_NAMESPACE SHOW_NAMESPACE
# Skipped when fewer than 2 MDTs are configured.
test_31c() {
	# Explicit if/return: the skip must not depend on skip's exit status.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If nobody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should re-generate the master LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c $MDSCOUNT "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to re-generate master LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the namespace LFSCK state to help diagnose the failure.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_dirs_repaired/ { print $2 }')
	# Default to 0 so that a missing counter is reported as a test
	# failure instead of a "[: unary operator expected" shell error.
	[ "${repaired:-0}" -eq 1 ] ||
		error "(4) Fail to re-generate master LMV EA: $repaired"

	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	# Any listed entry means the directory is not seen as a clean,
	# empty striped directory after the repair.
	local empty=$(ls "$DIR/$tdir/striped_dir/")
	[ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(8) Fail to remove the striped directory after LFSCK"
}
run_test 31c "Re-generate the lost master LMV EA for striped directory"
# test_31d - a striped directory modified after losing its master LMV EA
# must be left unrepaired and become read-only.
#
# Creates a striped directory while OBD_FAIL_LFSCK_LOST_MASTER_LMV is armed
# on $SINGLEMDS, remounts the client, creates a file inside the directory,
# runs a namespace LFSCK and verifies that striped_dirs_repaired stays 0,
# that the existing file is still visible, and that new creations are
# refused until "chattr -i" is applied to the directory.
#
# Globals read: MDSCOUNT DIR tdir MOUNT LFS LCTL SINGLEMDS MDT_DEV
#               START_NAMESPACE SHOW_NAMESPACE
# Skipped when fewer than 2 MDTs are configured.
test_31d() {
	# Explicit if/return: the skip must not depend on skip's exit status.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If somebody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should NOT re-generate the master LMV EA, instead, it should"
	echo "change the broken striped dirctory as read-only to prevent"
	echo "further damage"

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c $MDSCOUNT "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0

	umount_client $MOUNT || error "(2) umount failed"
	mount_client $MOUNT || error "(3) mount failed"

	touch "$DIR/$tdir/striped_dir/dummy" ||
		error "(4) Fail to touch under broken striped directory"

	echo "Trigger namespace LFSCK to find out the inconsistency"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the namespace LFSCK state to help diagnose the failure.
		$SHOW_NAMESPACE
		error "(6) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_dirs_repaired/ { print $2 }')
	# An empty counter must not be mistaken for "zero repairs", so it
	# is rejected explicitly before the numeric comparison.
	[ -n "$repaired" ] && [ "$repaired" -eq 0 ] ||
		error "(7) Re-generate master LMV EA unexpected: $repaired"

	stat "$DIR/$tdir/striped_dir/dummy" ||
		error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"

	# Creation is expected to fail here; success is the error case.
	touch "$DIR/$tdir/striped_dir/foo" &&
		error "(9) The broken striped directory should be read-only"

	chattr -i "$DIR/$tdir/striped_dir" ||
		error "(10) Fail to chattr on the broken striped directory"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(11) Fail to remove the striped directory after LFSCK"
}
run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test_31e - LFSCK re-generates a lost slave LMV EA (1).
#
# Creates a striped directory while OBD_FAIL_LFSCK_LOST_SLAVE_LMV is armed
# on $SINGLEMDS with fail_val=0, runs a namespace LFSCK and verifies that
# striped_shards_repaired reported via $SHOW_NAMESPACE is exactly 1 and
# that the directory can be removed afterwards.
#
# Globals read: MDSCOUNT DIR tdir LFS LCTL SINGLEMDS MDT_DEV
#               START_NAMESPACE SHOW_NAMESPACE
# Skipped when fewer than 2 MDTs are configured.
test_31e() {
	# Explicit if/return: the skip must not depend on skip's exit status.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on the same MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the namespace LFSCK state to help diagnose the failure.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_shards_repaired/ { print $2 }')
	# Default to 0 so that a missing counter is reported as a test
	# failure instead of a "[: unary operator expected" shell error.
	[ "${repaired:-0}" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test_31f - LFSCK re-generates a lost slave LMV EA (2).
#
# Creates a striped directory while OBD_FAIL_LFSCK_LOST_SLAVE_LMV is armed
# on $SINGLEMDS with fail_val=1, runs a namespace LFSCK and verifies, using
# the lfsck_namespace parameter of facet mds2, that the run completes and
# that striped_shards_repaired is exactly 1.
#
# Globals read: MDSCOUNT DIR tdir LFS LCTL SINGLEMDS START_NAMESPACE
# Skipped when fewer than 2 MDTs are configured.
test_31f() {
	# Explicit if/return: the skip must not depend on skip's exit status.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on differnt MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
	$LFS setdirstripe -i 0 -c $MDSCOUNT "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(3) unexpected status"

	local repaired
	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^striped_shards_repaired/ { print $2 }')
	# Default to 0 so that a missing counter is reported as a test
	# failure instead of a "[: unary operator expected" shell error.
	[ "${repaired:-0}" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test_31g - LFSCK repairs a corrupted slave LMV EA.
#
# Creates a striped directory while OBD_FAIL_LFSCK_BAD_SLAVE_LMV is armed
# on $SINGLEMDS with fail_val=0, runs a namespace LFSCK and verifies that
# striped_shards_repaired is exactly 1 and that, after a client remount,
# a file can be created and removed inside the directory before the
# directory itself is removed.
#
# Globals read: MDSCOUNT DIR tdir MOUNT LFS LCTL SINGLEMDS MDT_DEV
#               START_NAMESPACE SHOW_NAMESPACE
# Skipped when fewer than 2 MDTs are configured.
test_31g() {
	# Explicit if/return: the skip must not depend on skip's exit status.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the stripe index in the slave LMV EA is"
	echo "corrupted. The LFSCK should repair the slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave LMV EA on the first shard of the striped directory"
	echo "claims the same index as the second shard claims"

	#define OBD_FAIL_LFSCK_BAD_SLAVE_LMV	0x162b
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the namespace LFSCK state to help diagnose the failure.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_shards_repaired/ { print $2 }')
	# Default to 0 so that a missing counter is reported as a test
	# failure instead of a "[: unary operator expected" shell error.
	[ "${repaired:-0}" -eq 1 ] ||
		error "(4) Fail to repair slave LMV EA: $repaired"

	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch "$DIR/$tdir/striped_dir/foo" ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f "$DIR/$tdir/striped_dir/foo" ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31g "Repair the corrupted slave LMV EA"
# test_31h - LFSCK repairs a corrupted shard name entry.
#
# Creates a striped directory while OBD_FAIL_LFSCK_BAD_SLAVE_NAME is armed
# on $SINGLEMDS with fail_val=0, runs a namespace LFSCK and verifies that
# dirent_repaired is exactly 1 and that, after a client remount, a file
# can be created and removed inside the directory before the directory
# itself is removed.
#
# Globals read: MDSCOUNT DIR tdir MOUNT LFS LCTL SINGLEMDS MDT_DEV
#               START_NAMESPACE SHOW_NAMESPACE
# Skipped when fewer than 2 MDTs are configured.
test_31h() {
	# Explicit if/return: the skip must not depend on skip's exit status.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the shard's name entry in the striped"
	echo "directory may be corrupted. The LFSCK should repair the"
	echo "bad shard's name entry."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "first shard's name entry in the striped directory claims"
	echo "the same index as the second shard's name entry claims."

	#define OBD_FAIL_LFSCK_BAD_SLAVE_NAME	0x162c
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the shard's name entry"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the namespace LFSCK state to help diagnose the failure.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^dirent_repaired/ { print $2 }')
	# Default to 0 so that a missing counter is reported as a test
	# failure instead of a "[: unary operator expected" shell error.
	[ "${repaired:-0}" -eq 1 ] ||
		error "(4) Fail to repair shard's name entry: $repaired"

	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch "$DIR/$tdir/striped_dir/foo" ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f "$DIR/$tdir/striped_dir/foo" ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31h "Repair the corrupted shard's name entry"
# Put back the MDS/OST sizes and the OST count that were saved in the
# SAVED_* variables.
MDSSIZE=$SAVED_MDSSIZE
OSTSIZE=$SAVED_OSTSIZE
OSTCOUNT=$SAVED_OSTCOUNT
# finally, clean up the test system