3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 # Bug numbers for the tests excepted via ALWAYS_EXCEPT, which starts
# out as the value of SANITY_LFSCK_EXCEPT.
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
# When SLOW is "no", EXCEPT_SLOW is set to the empty string.
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# Keep LUSTRE if it is already set and non-empty; otherwise default it to the
# parent of the directory that contains this script.
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
# Source the shared test framework from the Lustre tests directory.
18 . $LUSTRE/tests/test-framework.sh
# Source the configuration file. If CONFIG is unset or empty it is assigned
# (and then sourced as) $LUSTRE/tests/cfg/$NAME.sh.
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Leave the script with status 0 when require_dsh_mds fails.
# NOTE(review): require_dsh_mds is defined outside this view, presumably by
# the sourced test framework -- confirm.
23 require_dsh_mds || exit 0
# Remember the incoming MDSSIZE, OSTSIZE and OSTCOUNT values in SAVED_*
# variables before they are adjusted below.
# NOTE(review): presumably these are restored at the end of the script; the
# restore is not visible here -- confirm.
27 SAVED_MDSSIZE=${MDSSIZE}
28 SAVED_OSTSIZE=${OSTSIZE}
29 SAVED_OSTCOUNT=${OSTCOUNT}
30 # Use small MDS and OST sizes to speed up formatting.
31 # Do not make MDSSIZE/OSTSIZE too small, since they affect the default journal size.
34 # There is no need for many OSTs; cap the count at 4 to reduce the format/start/stop overhead.
35 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
37 # build up a clean test environment.
41 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
42 skip "Need MDS version at least 2.3.60" && check_and_cleanup_lustre &&
# Each check below appends test names to ALWAYS_EXCEPT when the server is too
# old (or uses an unsupported backend) for those tests.

# Test 2c is excepted when the $SINGLEMDS version code is <= that of 2.4.90.
45 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
46 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
# Tests 11-21 are excepted when the ost1 version code is < that of 2.5.55.
48 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
49 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
# Tests 2d, 2e, 3 and 22-31 are excepted when the $SINGLEMDS version code is
# < that of 2.6.50.
51 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
52 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
54 # DNE does not support striped directories on a ZFS-based backend yet, so test 31 is excepted whenever the MDS backend is not ldiskfs.
55 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
56 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Names of the first MDT and first OST targets, derived from FSNAME.
60 MDT_DEV="${FSNAME}-MDT0000"
61 OST_DEV="${FSNAME}-OST0000"
# Device name reported by mdsdevname for $SINGLEMDS with every "mds" substring
# removed from the facet name.
62 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# The variables below hold command lines as plain strings. The tests expand
# them unquoted so that extra options can be appended at the call site
# (for example `$START_NAMESPACE -r`).
#
# START_*: start LFSCK of the given type (namespace or layout) on the MDT, or
# layout LFSCK on the first OST.
63 START_NAMESPACE="do_facet $SINGLEMDS \
64 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
65 START_LAYOUT="do_facet $SINGLEMDS \
66 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
67 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# STOP_LFSCK: stop LFSCK on the MDT.
68 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# SHOW_*: print the lfsck_namespace / lfsck_layout parameter; the tests pipe
# this output through awk to pick out single fields such as "status".
69 SHOW_NAMESPACE="do_facet $SINGLEMDS \
70 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
71 SHOW_LAYOUT="do_facet $SINGLEMDS \
72 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
73 SHOW_LAYOUT_ON_OST="do_facet ost1 \
74 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount options passed to `start` when the tests (re)start the MDS; the
# NOSCRUB variant additionally carries the "noscrub" option.
75 MOUNT_OPTS_SCRUB="-o user_xattr"
76 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
85 echo "preparing... $nfiles * $ndirs files will be created $(date)."
86 if [ ! -z $igif ]; then
87 #define OBD_FAIL_FID_IGIF 0x1504
88 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
91 cp $LUSTRE/tests/*.sh $DIR/$tdir/
92 if [ $ndirs -gt 0 ]; then
93 createmany -d $DIR/$tdir/d $ndirs
94 createmany -m $DIR/$tdir/f $ndirs
95 if [ $nfiles -gt 0 ]; then
96 for ((i = 0; i < $ndirs; i++)); do
97 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
98 /dev/null || error "createmany $nfiles"
101 createmany -d $DIR/$tdir/e $ndirs
104 if [ ! -z $igif ]; then
105 touch $DIR/$tdir/dummy
106 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
109 echo "prepared $(date)."
115 #define OBD_FAIL_LFSCK_DELAY1 0x1600
116 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
117 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
119 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
121 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
122 [ "$STATUS" == "scanning-phase1" ] ||
123 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
125 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
127 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
128 [ "$STATUS" == "stopped" ] ||
129 error "(6) Expect 'stopped', but got '$STATUS'"
131 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
133 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
134 [ "$STATUS" == "scanning-phase1" ] ||
135 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
137 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
138 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
139 mdd.${MDT_DEV}.lfsck_namespace |
140 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
142 error "(9) unexpected status"
145 local repaired=$($SHOW_NAMESPACE |
146 awk '/^updated_phase1/ { print $2 }')
147 [ $repaired -eq 0 ] ||
148 error "(10) Expect nothing to be repaired, but got: $repaired"
150 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
151 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
152 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
153 mdd.${MDT_DEV}.lfsck_namespace |
154 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
156 error "(12) unexpected status"
159 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
160 [ $((scanned1 + 1)) -eq $scanned2 ] ||
161 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
163 echo "stopall, should NOT crash LU-3649"
164 stopall || error "(14) Fail to stopall"
166 run_test 0 "Control LFSCK manually"
169 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
170 skip "OI Scrub not implemented for ZFS" && return
174 #define OBD_FAIL_FID_INDIR 0x1501
175 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
176 touch $DIR/$tdir/dummy
178 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
180 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
181 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
182 mdd.${MDT_DEV}.lfsck_namespace |
183 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
185 error "(4) unexpected status"
188 local repaired=$($SHOW_NAMESPACE |
189 awk '/^dirent_repaired/ { print $2 }')
190 # for interop with old server
191 [ -z "$repaired" ] &&
192 repaired=$($SHOW_NAMESPACE |
193 awk '/^updated_phase1/ { print $2 }')
195 [ $repaired -eq 1 ] ||
196 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
198 mount_client $MOUNT || error "(6) Fail to start client!"
200 #define OBD_FAIL_FID_LOOKUP 0x1505
201 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
202 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
204 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
206 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
210 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
211 skip "OI Scrub not implemented for ZFS" && return
215 #define OBD_FAIL_FID_INLMA 0x1502
216 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
217 touch $DIR/$tdir/dummy
219 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
221 #define OBD_FAIL_FID_NOLMA 0x1506
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
223 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
224 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
225 mdd.${MDT_DEV}.lfsck_namespace |
226 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
228 error "(4) unexpected status"
231 local repaired=$($SHOW_NAMESPACE |
232 awk '/^dirent_repaired/ { print $2 }')
233 # for interop with old server
234 [ -z "$repaired" ] &&
235 repaired=$($SHOW_NAMESPACE |
236 awk '/^updated_phase1/ { print $2 }')
238 [ $repaired -eq 1 ] ||
239 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
241 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
242 mount_client $MOUNT || error "(6) Fail to start client!"
244 #define OBD_FAIL_FID_LOOKUP 0x1505
245 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
246 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
248 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
250 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
255 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
256 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
257 touch $DIR/$tdir/dummy
259 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
261 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
262 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
263 mdd.${MDT_DEV}.lfsck_namespace |
264 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
266 error "(4) unexpected status"
269 local repaired=$($SHOW_NAMESPACE |
270 awk '/^linkea_repaired/ { print $2 }')
271 # for interop with old server
272 [ -z "$repaired" ] &&
273 repaired=$($SHOW_NAMESPACE |
274 awk '/^updated_phase2/ { print $2 }')
276 [ $repaired -eq 1 ] ||
277 error "(5) Fail to repair crashed linkEA: $repaired"
279 mount_client $MOUNT || error "(6) Fail to start client!"
281 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
282 error "(7) Fail to stat $DIR/$tdir/dummy"
284 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
285 local dummyname=$($LFS fid2path $DIR $dummyfid)
286 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
287 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
289 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
295 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
297 touch $DIR/$tdir/dummy
299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
301 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
302 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
303 mdd.${MDT_DEV}.lfsck_namespace |
304 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
306 error "(4) unexpected status"
309 local repaired=$($SHOW_NAMESPACE |
310 awk '/^updated_phase2/ { print $2 }')
311 [ $repaired -eq 1 ] ||
312 error "(5) Fail to repair crashed linkEA: $repaired"
314 mount_client $MOUNT || error "(6) Fail to start client!"
316 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
317 error "(7) Fail to stat $DIR/$tdir/dummy"
319 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
320 local dummyname=$($LFS fid2path $DIR $dummyfid)
321 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
322 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
324 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
330 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
331 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
332 touch $DIR/$tdir/dummy
334 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
336 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
337 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
338 mdd.${MDT_DEV}.lfsck_namespace |
339 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
341 error "(4) unexpected status"
344 local repaired=$($SHOW_NAMESPACE |
345 awk '/^updated_phase2/ { print $2 }')
346 [ $repaired -eq 1 ] ||
347 error "(5) Fail to repair crashed linkEA: $repaired"
349 mount_client $MOUNT || error "(6) Fail to start client!"
351 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
352 error "(7) Fail to stat $DIR/$tdir/dummy"
354 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
355 local dummyname=$($LFS fid2path $DIR $dummyfid)
356 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
357 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
359 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
365 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
366 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
367 touch $DIR/$tdir/dummy
369 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
371 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
372 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
373 mdd.${MDT_DEV}.lfsck_namespace |
374 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
376 error "(4) unexpected status"
379 local repaired=$($SHOW_NAMESPACE |
380 awk '/^linkea_repaired/ { print $2 }')
381 [ $repaired -eq 1 ] ||
382 error "(5) Fail to repair crashed linkEA: $repaired"
384 mount_client $MOUNT || error "(6) Fail to start client!"
386 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
387 error "(7) Fail to stat $DIR/$tdir/dummy"
389 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
390 local dummyname=$($LFS fid2path $DIR $dummyfid)
391 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
392 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
394 run_test 2d "LFSCK can recover the missing linkEA entry"
398 [ $MDSCOUNT -lt 2 ] &&
399 skip "We need at least 2 MDSes for this test" && return
403 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
405 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
406 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
407 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
408 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
410 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
411 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
412 mdd.${MDT_DEV}.lfsck_namespace |
413 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
415 error "(4) unexpected status"
418 local repaired=$($SHOW_NAMESPACE |
419 awk '/^linkea_repaired/ { print $2 }')
420 [ $repaired -eq 1 ] ||
421 error "(5) Fail to repair crashed linkEA: $repaired"
423 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
424 local name=$($LFS fid2path $DIR $fid)
425 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
426 error "(6) Fail to repair linkEA: $fid $name"
428 run_test 2e "namespace LFSCK can verify remote object linkEA"
434 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
435 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
436 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
438 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
439 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
440 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
442 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
443 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
444 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
446 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
447 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
448 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
450 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
452 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
453 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
454 mdd.${MDT_DEV}.lfsck_namespace |
455 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
457 error "(10) unexpected status"
460 local checked=$($SHOW_NAMESPACE |
461 awk '/^checked_phase2/ { print $2 }')
462 [ $checked -ge 4 ] ||
463 error "(11) Fail to check multiple-linked object: $checked"
465 local repaired=$($SHOW_NAMESPACE |
466 awk '/^multiple_linked_repaired/ { print $2 }')
467 [ $repaired -ge 2 ] ||
468 error "(12) Fail to repair multiple-linked object: $repaired"
470 run_test 3 "LFSCK can verify multiple-linked objects"
474 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
475 skip "OI Scrub not implemented for ZFS" && return
478 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
479 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
481 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
482 echo "start $SINGLEMDS with disabling OI scrub"
483 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
484 error "(2) Fail to start MDS!"
486 #define OBD_FAIL_LFSCK_DELAY2 0x1601
487 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
488 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
489 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
490 mdd.${MDT_DEV}.lfsck_namespace |
491 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
493 error "(5) unexpected status"
496 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
497 [ "$STATUS" == "scanning-phase1" ] ||
498 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
500 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
501 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
502 mdd.${MDT_DEV}.lfsck_namespace |
503 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
505 error "(7) unexpected status"
508 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
509 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
511 local repaired=$($SHOW_NAMESPACE |
512 awk '/^dirent_repaired/ { print $2 }')
513 # for interop with old server
514 [ -z "$repaired" ] &&
515 repaired=$($SHOW_NAMESPACE |
516 awk '/^updated_phase1/ { print $2 }')
518 [ $repaired -ge 9 ] ||
519 error "(9) Fail to re-generate FID-in-dirent: $repaired"
521 mount_client $MOUNT || error "(10) Fail to start client!"
523 #define OBD_FAIL_FID_LOOKUP 0x1505
524 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
525 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
526 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
528 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
532 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
533 skip "OI Scrub not implemented for ZFS" && return
536 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
537 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
539 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
540 echo "start $SINGLEMDS with disabling OI scrub"
541 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
542 error "(2) Fail to start MDS!"
544 #define OBD_FAIL_LFSCK_DELAY2 0x1601
545 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
546 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
547 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
548 mdd.${MDT_DEV}.lfsck_namespace |
549 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
551 error "(5) unexpected status"
554 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
555 [ "$STATUS" == "scanning-phase1" ] ||
556 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
558 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
559 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
560 mdd.${MDT_DEV}.lfsck_namespace |
561 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
563 error "(7) unexpected status"
566 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
567 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
569 local repaired=$($SHOW_NAMESPACE |
570 awk '/^dirent_repaired/ { print $2 }')
571 # for interop with old server
572 [ -z "$repaired" ] &&
573 repaired=$($SHOW_NAMESPACE |
574 awk '/^updated_phase1/ { print $2 }')
576 [ $repaired -ge 2 ] ||
577 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
579 mount_client $MOUNT || error "(10) Fail to start client!"
581 #define OBD_FAIL_FID_LOOKUP 0x1505
582 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
583 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
585 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
587 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
588 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
589 local dummyname=$($LFS fid2path $DIR $dummyfid)
590 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
591 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
593 run_test 5 "LFSCK can handle IGIF object upgrading"
598 #define OBD_FAIL_LFSCK_DELAY1 0x1600
599 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
600 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
602 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
603 [ "$STATUS" == "scanning-phase1" ] ||
604 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
606 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
608 # Fail the LFSCK to guarantee there is at least one checkpoint
609 #define OBD_FAIL_LFSCK_FATAL1 0x1608
610 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
611 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
612 mdd.${MDT_DEV}.lfsck_namespace |
613 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
615 error "(4) unexpected status"
618 local POS0=$($SHOW_NAMESPACE |
619 awk '/^last_checkpoint_position/ { print $2 }' |
622 #define OBD_FAIL_LFSCK_DELAY1 0x1600
623 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
624 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
626 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
627 [ "$STATUS" == "scanning-phase1" ] ||
628 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
630 local POS1=$($SHOW_NAMESPACE |
631 awk '/^latest_start_position/ { print $2 }' |
633 [[ $POS0 -lt $POS1 ]] ||
634 error "(7) Expect larger than: $POS0, but got $POS1"
636 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
637 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
638 mdd.${MDT_DEV}.lfsck_namespace |
639 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
641 error "(8) unexpected status"
644 run_test 6a "LFSCK resumes from last checkpoint (1)"
649 #define OBD_FAIL_LFSCK_DELAY2 0x1601
650 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
651 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
653 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
654 [ "$STATUS" == "scanning-phase1" ] ||
655 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
657 # Sleep 5 sec to guarantee that we are in the directory scanning
659 # Fail the LFSCK to guarantee there is at least one checkpoint
660 #define OBD_FAIL_LFSCK_FATAL2 0x1609
661 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
662 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
663 mdd.${MDT_DEV}.lfsck_namespace |
664 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
666 error "(4) unexpected status"
669 local O_POS0=$($SHOW_NAMESPACE |
670 awk '/^last_checkpoint_position/ { print $2 }' |
673 local D_POS0=$($SHOW_NAMESPACE |
674 awk '/^last_checkpoint_position/ { print $4 }')
676 #define OBD_FAIL_LFSCK_DELAY2 0x1601
677 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
678 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
680 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
681 [ "$STATUS" == "scanning-phase1" ] ||
682 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
684 local O_POS1=$($SHOW_NAMESPACE |
685 awk '/^latest_start_position/ { print $2 }' |
687 local D_POS1=$($SHOW_NAMESPACE |
688 awk '/^latest_start_position/ { print $4 }')
690 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
691 [[ $O_POS0 -lt $O_POS1 ]] ||
692 error "(7.1) $O_POS1 is not larger than $O_POS0"
694 [[ $D_POS0 -lt $D_POS1 ]] ||
695 error "(7.2) $D_POS1 is not larger than $D_POS0"
698 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
699 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
700 mdd.${MDT_DEV}.lfsck_namespace |
701 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
703 error "(8) unexpected status"
706 run_test 6b "LFSCK resumes from last checkpoint (2)"
713 #define OBD_FAIL_LFSCK_DELAY2 0x1601
714 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
715 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
717 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
718 [ "$STATUS" == "scanning-phase1" ] ||
719 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
721 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
723 echo "stop $SINGLEMDS"
724 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
726 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
727 echo "start $SINGLEMDS"
728 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
729 error "(5) Fail to start MDS!"
731 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
732 mdd.${MDT_DEV}.lfsck_namespace |
733 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
735 error "(6) unexpected status"
738 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
744 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
745 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
746 for ((i = 0; i < 20; i++)); do
747 touch $DIR/$tdir/dummy${i}
750 #define OBD_FAIL_LFSCK_DELAY3 0x1602
751 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
752 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
753 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
754 mdd.${MDT_DEV}.lfsck_namespace |
755 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
757 error "(4) unexpected status"
761 echo "stop $SINGLEMDS"
762 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
764 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
765 echo "start $SINGLEMDS"
766 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
767 error "(6) Fail to start MDS!"
769 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
770 mdd.${MDT_DEV}.lfsck_namespace |
771 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
773 error "(7) unexpected status"
776 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
781 formatall > /dev/null
787 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
788 [ "$STATUS" == "init" ] ||
789 error "(2) Expect 'init', but got '$STATUS'"
791 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
792 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
793 mkdir $DIR/$tdir/crashed
795 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
796 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
797 for ((i = 0; i < 5; i++)); do
798 touch $DIR/$tdir/dummy${i}
801 umount_client $MOUNT || error "(3) Fail to stop client!"
803 #define OBD_FAIL_LFSCK_DELAY2 0x1601
804 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
805 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
807 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
808 [ "$STATUS" == "scanning-phase1" ] ||
809 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
811 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
813 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
814 [ "$STATUS" == "stopped" ] ||
815 error "(7) Expect 'stopped', but got '$STATUS'"
817 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
819 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
820 [ "$STATUS" == "scanning-phase1" ] ||
821 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
823 #define OBD_FAIL_LFSCK_FATAL2 0x1609
824 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
825 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
826 mdd.${MDT_DEV}.lfsck_namespace |
827 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
829 error "(10) unexpected status"
832 #define OBD_FAIL_LFSCK_DELAY1 0x1600
833 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
834 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
836 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
837 [ "$STATUS" == "scanning-phase1" ] ||
838 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
840 #define OBD_FAIL_LFSCK_CRASH 0x160a
841 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
844 echo "stop $SINGLEMDS"
845 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
847 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
848 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
850 echo "start $SINGLEMDS"
851 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
852 error "(14) Fail to start MDS!"
854 local timeout=$(max_recovery_time)
857 while [ $timer -lt $timeout ]; do
858 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
859 mdt.${MDT_DEV}.recovery_status |
860 awk '/^status/ { print \\\$2 }'")
861 [ "$STATUS" != "RECOVERING" ] && break;
866 [ $timer != $timeout ] ||
867 error "(14.1) recovery timeout"
869 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
870 [ "$STATUS" == "crashed" ] ||
871 error "(15) Expect 'crashed', but got '$STATUS'"
873 #define OBD_FAIL_LFSCK_DELAY2 0x1601
874 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
875 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
877 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
878 [ "$STATUS" == "scanning-phase1" ] ||
879 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
881 echo "stop $SINGLEMDS"
882 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
884 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
885 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
887 echo "start $SINGLEMDS"
888 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
889 error "(19) Fail to start MDS!"
892 while [ $timer -lt $timeout ]; do
893 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
894 mdt.${MDT_DEV}.recovery_status |
895 awk '/^status/ { print \\\$2 }'")
896 [ "$STATUS" != "RECOVERING" ] && break;
901 [ $timer != $timeout ] ||
902 error "(19.1) recovery timeout"
904 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
905 [ "$STATUS" == "paused" ] ||
906 error "(20) Expect 'paused', but got '$STATUS'"
908 #define OBD_FAIL_LFSCK_DELAY3 0x1602
909 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
911 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
912 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
913 mdd.${MDT_DEV}.lfsck_namespace |
914 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
916 error "(22) unexpected status"
919 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
920 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
921 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
923 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
924 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
925 mdd.${MDT_DEV}.lfsck_namespace |
926 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
928 error "(24) unexpected status"
931 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
932 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
934 run_test 8 "LFSCK state machine"
937 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
938 skip "Testing on UP system, the speed may be inaccurate."
942 [[ $server_version -ge $(version_code 2.7.50) ]] ||
943 { skip "Need MDS version >= 2.7.50"; return; }
946 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
947 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
948 createmany -o $DIR/$tdir/lfsck/f 5000
950 local BASE_SPEED1=100
952 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
955 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
956 [ "$STATUS" == "scanning-phase1" ] ||
957 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
959 local SPEED=$($SHOW_LAYOUT |
960 awk '/^average_speed_phase1/ { print $2 }')
962 # There may be a timing error; normally it should be less than 2 seconds.
963 # We allow another 20% for scheduling error.
965 # MAX_MARGIN = 1.2 = 12 / 10
966 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
967 RUN_TIME1 * 12 / 10))
968 [ $SPEED -lt $MAX_SPEED ] ||
969 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
972 local BASE_SPEED2=300
974 do_facet $SINGLEMDS \
975 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
978 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
979 # MIN_MARGIN = 0.8 = 8 / 10
980 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
981 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
982 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
983 [ $SPEED -gt $MIN_SPEED ] || {
984 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
985 error_ignore LU-5624 \
986 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
989 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
993 # MAX_MARGIN = 1.2 = 12 / 10
994 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
995 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
996 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
997 [ $SPEED -lt $MAX_SPEED ] ||
998 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
1000 do_facet $SINGLEMDS \
1001 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1003 wait_update_facet $SINGLEMDS \
1004 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1005 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1006 error "(7) Failed to get expected 'completed'"
1008 run_test 9a "LFSCK speed control (1)"
# test_9b: check that the namespace LFSCK honours its speed limit while in
# scanning-phase2 -- first with the limit passed at start (BASE_SPEED1),
# then after lfsck_speed_limit is changed to BASE_SPEED2.
# Skipped when /proc/cpuinfo has no line matching "processor.*: 1" (reported
# as a UP system) or when $server_version is below 2.7.50.
1011 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1012 skip "Testing on UP system, the speed may be inaccurate."
1016 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1017 { skip "Need MDS version >= 2.7.50"; return; }
1021 echo "Preparing another 50 * 50 files (with error) at $(date)."
# Everything below is created while fail_loc 0x1604 is set on the MDS:
# 50 directories, 50 files, and 50 more files inside each directory.
1022 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1023 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1024 createmany -d $DIR/$tdir/d 50
1025 createmany -m $DIR/$tdir/f 50
1026 for ((i = 0; i < 50; i++)); do
1027 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# First run: namespace LFSCK started with -r while fail_loc 0x160c is set;
# it must report 'stopped' (wait limit 10, presumably seconds).
1030 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1031 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1032 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1033 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1034 mdd.${MDT_DEV}.lfsck_namespace |
1035 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1037 error "(5) unexpected status"
1040 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1041 echo "Prepared at $(date)."
1043 local BASE_SPEED1=50
# Second run: start again with -s $BASE_SPEED1; the status sampled afterwards
# must be 'scanning-phase2' and average_speed_phase2 must stay under MAX_SPEED.
1045 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1048 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1049 [ "$STATUS" == "scanning-phase2" ] ||
1050 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1052 local SPEED=$($SHOW_NAMESPACE |
1053 awk '/^average_speed_phase2/ { print $2 }')
1054 # There may be time error, normally it should be less than 2 seconds.
1055 # We allow another 20% schedule error.
1057 # MAX_MARGIN = 1.2 = 12 / 10
1058 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1059 RUN_TIME1 * 12 / 10))
1060 [ $SPEED -lt $MAX_SPEED ] ||
1061 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1063 # adjust speed limit
1064 local BASE_SPEED2=150
1066 do_facet $SINGLEMDS \
1067 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1070 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1071 # MIN_MARGIN = 0.8 = 8 / 10
1072 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1073 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1074 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# A speed at or below MIN_SPEED is reported through error_ignore (LU-5624)
# when the MDS backend is not ldiskfs.
1075 [ $SPEED -gt $MIN_SPEED ] || {
1076 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1077 error_ignore LU-5624 \
1078 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1081 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1085 # MAX_MARGIN = 1.2 = 12 / 10
1086 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1087 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1088 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1089 [ $SPEED -lt $MAX_SPEED ] ||
1090 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
# Set the speed limit to 0 and wait for the scan to report 'completed'.
1092 do_facet $SINGLEMDS \
1093 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1094 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1095 mdd.${MDT_DEV}.lfsck_namespace |
1096 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1098 error "(11) unexpected status"
1101 run_test 9b "LFSCK speed control (2)"
# test_10: check that common file operations still succeed while the
# namespace LFSCK is in scanning-phase1. Skipped when the MDS backend is not
# ldiskfs.
1105 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1106 skip "lookup(..)/linkea on ZFS issue" && return
1110 echo "Preparing more files with error at $(date)."
# Even-numbered d<i>/f<i> entries are created with fail_loc 0x1603 set.
1111 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1112 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1114 for ((i = 0; i < 1000; i = $((i+2)))); do
1115 mkdir -p $DIR/$tdir/d${i}
1116 touch $DIR/$tdir/f${i}
1117 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd-numbered d<i>/f<i> entries are created with fail_loc 0x1604 set.
1120 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1121 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1123 for ((i = 1; i < 1000; i = $((i+2)))); do
1124 mkdir -p $DIR/$tdir/d${i}
1125 touch $DIR/$tdir/f${i}
1126 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1129 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1130 echo "Prepared at $(date)."
# Add one hard link, then remount the client before starting the scan.
1132 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
1134 umount_client $MOUNT
1135 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the scan with -s 100 so it is still in phase 1 during the checks.
1137 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1140 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1141 [ "$STATUS" == "scanning-phase1" ] ||
1142 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Exercise ls, touch, mkdir, unlink, rm -rf, mv, hard link and symlink while
# the scan runs; each one must succeed.
1144 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1146 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1148 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1150 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1152 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1154 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1156 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1158 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1159 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after all of the operations above.
1161 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1162 [ "$STATUS" == "scanning-phase1" ] ||
1163 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set the speed limit to 0 and wait for the scan to report 'completed'.
1165 do_facet $SINGLEMDS \
1166 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1167 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1168 mdd.${MDT_DEV}.lfsck_namespace |
1169 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1171 error "(16) unexpected status"
1174 run_test 10 "System is available during LFSCK scanning"
# ost_remove_lastid: mount an OST backend locally, delete
# O/<idx>/LAST_ID and O/<idx>/d0/0 under its mount point, then unmount it.
# NOTE(review): $ost and $idx are assigned on lines not visible here;
# the visible caller passes "1 0", so presumably $1 = OST number and
# $2 = idx -- confirm.
# Returns 1 if the local mount fails, 2 if the unmount fails.
1177 ost_remove_lastid() {
1180 local rcmd="do_facet ost${ost}"
1182 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1184 # step 1: local mount
1185 mount_fstype ost${ost} || return 1
1186 # step 2: remove the specified LAST_ID
1187 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1189 unmount_fstype ost${ost} || return 2
# test_11a: remove LAST_ID from ost1, then run the layout LFSCK on the OST
# and check that the 'crashed_lastid' flag appears and is cleared again once
# the scan completes.
1193 check_mount_and_prep
1194 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1195 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Remove LAST_ID via ost_remove_lastid (arguments 1 and 0), then start ost1
# with $MOUNT_OPTS_NOSCRUB.
1200 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1202 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1203 error "(2) Fail to start ost1"
# fail_loc 0x160e with fail_val=3 stays set on ost1 until the
# 'crashed_lastid' flag has been observed.
1205 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1206 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1208 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1209 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1211 wait_update_facet ost1 "$LCTL get_param -n \
1212 obdfilter.${OST_DEV}.lfsck_layout |
1213 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1215 error "(5) unexpected status"
# Clear the fail_loc/fail_val and wait for 'completed'.
1218 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1220 wait_update_facet ost1 "$LCTL get_param -n \
1221 obdfilter.${OST_DEV}.lfsck_layout |
1222 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1224 error "(6) unexpected status"
1227 echo "the LAST_ID(s) should have been rebuilt"
# After completion the 'flags' field must be empty.
1228 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1229 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1231 run_test 11a "LFSCK can rebuild lost last_id"
# test_11b: make the LAST_ID reported by ost1 after a restart fall behind
# the value seen before it, then check that the layout LFSCK on the OST
# brings it back to the earlier value.
1234 check_mount_and_prep
1235 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1237 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1238 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1239 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than the pre-created object count returned by
# precreated_ost_obj_count.
1241 local count=$(precreated_ost_obj_count 0 0)
1243 createmany -o $DIR/$tdir/f $((count + 32))
# lastid1: the last_id value ost1 reports for sequence $seq before restart.
1245 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1246 local seq=$(do_facet mds1 $LCTL get_param -n \
1247 osp.${proc_path}.prealloc_last_seq)
1248 local lastid1=$(do_facet ost1 "lctl get_param -n \
1249 obdfilter.${ost1_svc}.last_id" | grep $seq |
1250 awk -F: '{ print $2 }')
1252 umount_client $MOUNT
1253 stop ost1 || error "(1) Fail to stop ost1"
# ost1 is restarted with fail_loc 0x215 set.
1255 #define OBD_FAIL_OST_ENOSPC 0x215
1256 do_facet ost1 $LCTL set_param fail_loc=0x215
1258 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1259 error "(2) Fail to start ost1"
# Poll (at most 60 iterations) until ost1 reports a last_id for $seq again.
1261 for ((i = 0; i < 60; i++)); do
1262 lastid2=$(do_facet ost1 "lctl get_param -n \
1263 obdfilter.${ost1_svc}.last_id" | grep $seq |
1264 awk -F: '{ print $2 }')
1265 [ ! -z $lastid2 ] && break;
1269 echo "the on-disk LAST_ID should be smaller than the expected one"
1270 [ $lastid1 -gt $lastid2 ] ||
1271 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1273 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1274 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1276 wait_update_facet ost1 "$LCTL get_param -n \
1277 obdfilter.${OST_DEV}.lfsck_layout |
1278 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1280 error "(6) unexpected status"
# Restart ost1 once more so the value read next comes from a fresh start.
1283 stop ost1 || error "(7) Fail to stop ost1"
1285 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1286 error "(8) Fail to start ost1"
1288 echo "the on-disk LAST_ID should have been rebuilt"
# The reported last_id for $seq must become equal to lastid1; on failure the
# raw last_id output is dumped before the error.
1289 wait_update_facet ost1 "$LCTL get_param -n \
1290 obdfilter.${ost1_svc}.last_id | grep $seq |
1291 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1292 do_facet ost1 $LCTL get_param -n \
1293 obdfilter.${ost1_svc}.last_id
1294 error "(9) expect lastid1 $seq:$lastid1"
1297 do_facet ost1 $LCTL set_param fail_loc=0
1298 stopall || error "(10) Fail to stopall"
1300 run_test 11b "LFSCK can rebuild crashed last_id"
# test_12: check that one lctl command issued on mds1 with -A starts and
# stops LFSCK on all targets, first for the namespace type and then for the
# layout type. Skipped when MDSCOUNT is less than 2.
1303 [ $MDSCOUNT -lt 2 ] &&
1304 skip "We need at least 2 MDSes for test_12" && return
1306 check_mount_and_prep
# One directory per MDT (lfs mkdir -i <k-1>), each holding 100 files.
1307 for k in $(seq $MDSCOUNT); do
1308 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1309 createmany -o $DIR/$tdir/${k}/f 100 ||
1310 error "(0) Fail to create 100 files."
# --- namespace LFSCK: start (-s 1), verify, stop, verify, restart (-s 0) ---
1313 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1314 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1315 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1317 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1318 for k in $(seq $MDSCOUNT); do
1319 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1320 mdd.$(facet_svc mds${k}).lfsck_namespace |
1321 awk '/^status/ { print $2 }')
1322 [ "$STATUS" == "scanning-phase1" ] ||
1323 error "(3) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1326 echo "Stop namespace LFSCK on all targets by single lctl command."
1327 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1328 error "(4) Fail to stop LFSCK on all devices!"
1330 echo "All the LFSCK targets should be in 'stopped' status."
1331 for k in $(seq $MDSCOUNT); do
1332 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1333 mdd.$(facet_svc mds${k}).lfsck_namespace |
1334 awk '/^status/ { print $2 }')
1335 [ "$STATUS" == "stopped" ] ||
1336 error "(5) MDS${k} Expect 'stopped', but got '$STATUS'"
1339 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1340 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1341 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1343 echo "All the LFSCK targets should be in 'completed' status."
1344 for k in $(seq $MDSCOUNT); do
1345 wait_update_facet mds${k} "$LCTL get_param -n \
1346 mdd.$(facet_svc mds${k}).lfsck_namespace |
1347 awk '/^status/ { print \\\$2 }'" "completed" 8 ||
1348 error "(7) MDS${k} is not the expected 'completed'"
# --- layout LFSCK: same sequence, run with full debug logging enabled; the
# 'stopped' check additionally covers every OST ---
1351 start_full_debug_logging
1353 echo "Start layout LFSCK on all targets by single command (-s 1)."
1354 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1355 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1357 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1358 for k in $(seq $MDSCOUNT); do
1359 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1360 mdd.$(facet_svc mds${k}).lfsck_layout |
1361 awk '/^status/ { print $2 }')
1362 [ "$STATUS" == "scanning-phase1" ] ||
1363 error "(9) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1366 echo "Stop layout LFSCK on all targets by single lctl command."
1367 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1368 error "(10) Fail to stop LFSCK on all devices!"
1370 echo "All the LFSCK targets should be in 'stopped' status."
1371 for k in $(seq $MDSCOUNT); do
1372 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1373 mdd.$(facet_svc mds${k}).lfsck_layout |
1374 awk '/^status/ { print $2 }')
1375 [ "$STATUS" == "stopped" ] ||
1376 error "(11) MDS${k} Expect 'stopped', but got '$STATUS'"
1379 for k in $(seq $OSTCOUNT); do
1380 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1381 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1382 awk '/^status/ { print $2 }')
1383 [ "$STATUS" == "stopped" ] ||
1384 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1387 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1388 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1389 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1391 echo "All the LFSCK targets should be in 'completed' status."
1392 for k in $(seq $MDSCOUNT); do
1393 # The LFSCK status query interval is 30 seconds. In case some
1394 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough
1395 # to guarantee the status is synced up.
1396 wait_update_facet mds${k} "$LCTL get_param -n \
1397 mdd.$(facet_svc mds${k}).lfsck_layout |
1398 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1399 error "(14) MDS${k} is not the expected 'completed'"
1402 stop_full_debug_logging
1404 run_test 12 "single command to trigger LFSCK on all devices"
# test_13: create 32 files while fail_loc 0x160f is set on the MDS, run the
# layout LFSCK, and require that it reports exactly 32 in 'repaired_others'.
1408 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1409 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1410 echo "MDT-object FID."
1413 check_mount_and_prep
1415 echo "Inject failure stub to simulate bad lmm_oi"
1416 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1417 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1418 createmany -o $DIR/$tdir/f 32
1419 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1421 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1422 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1424 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1425 mdd.${MDT_DEV}.lfsck_layout |
1426 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1428 error "(2) unexpected status"
# One repair is expected for each of the 32 files created above.
1431 local repaired=$($SHOW_LAYOUT |
1432 awk '/^repaired_others/ { print $2 }')
1433 [ $repaired -eq 32 ] ||
1434 error "(3) Fail to repair crashed lmm_oi: $repaired"
1436 run_test 13 "LFSCK can repair crashed lmm_oi"
# test_14: create files while fail_loc 0x1610 is set on ost1, then check
# that a first layout LFSCK run (-r) counts the dangling references without
# making 'stat' work, and that a second run (-r -c) makes 'stat' succeed.
1440 echo "The OST-object referenced by the MDT-object should be there;"
1441 echo "otherwise, the LFSCK should re-create the missing OST-object."
1444 check_mount_and_prep
1445 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1447 echo "Inject failure stub to simulate dangling referenced MDT-object"
1448 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1449 do_facet ost1 $LCTL set_param fail_loc=0x1610
1450 local count=$(precreated_ost_obj_count 0 0)
# With the fail_loc set: create (pre-created count + 31) files plus 'guard'.
1452 createmany -o $DIR/$tdir/f $((count + 31))
1453 touch $DIR/$tdir/guard
1454 do_facet ost1 $LCTL set_param fail_loc=0
1456 start_full_debug_logging
1458 # exhaust other pre-created dangling cases
1459 count=$(precreated_ost_obj_count 0 0)
1460 createmany -o $DIR/$tdir/a $count ||
1461 error "(0) Fail to create $count files."
1463 echo "'ls' should fail because of dangling referenced MDT-object"
1464 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1466 echo "Trigger layout LFSCK to find out dangling reference"
1467 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1469 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1470 mdd.${MDT_DEV}.lfsck_layout |
1471 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1473 error "(3) unexpected status"
# At least 32 dangling references must be counted by the first run.
1476 local repaired=$($SHOW_LAYOUT |
1477 awk '/^repaired_dangling/ { print $2 }')
1478 [ $repaired -ge 32 ] ||
1479 error "(4) Fail to repair dangling reference: $repaired"
1481 echo "'stat' should fail because of not repair dangling by default"
1482 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
1484 echo "Trigger layout LFSCK to repair dangling reference"
1485 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1487 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1488 mdd.${MDT_DEV}.lfsck_layout |
1489 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1491 error "(7) unexpected status"
1494 # There may be some async LFSCK updates in processing, wait for
1495 # a while until the target reparation has been done. LU-4970.
1497 echo "'stat' should success after layout LFSCK repairing"
# Poll 'stat' on the client until it reports Size 0 for 'guard'; on failure
# the raw stat output is printed before the error.
1498 wait_update_facet client "stat $DIR/$tdir/guard |
1499 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1500 stat $DIR/$tdir/guard
1502 error "(8) unexpected size"
1505 repaired=$($SHOW_LAYOUT |
1506 awk '/^repaired_dangling/ { print $2 }')
1507 [ $repaired -ge 32 ] ||
1508 error "(9) Fail to repair dangling reference: $repaired"
1510 stop_full_debug_logging
1512 run_test 14 "LFSCK can repair MDT-object with dangling reference"
# test_15a: write one file while fail_loc 0x1611 is set on ost1, run the
# layout LFSCK, and require exactly 1 'repaired_unmatched_pair'.
1516 echo "If the OST-object referenced by the MDT-object back points"
1517 echo "to some non-exist MDT-object, then the LFSCK should repair"
1518 echo "the OST-object to back point to the right MDT-object."
1521 check_mount_and_prep
1522 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1524 echo "Inject failure stub to make the OST-object to back point to"
1525 echo "non-exist MDT-object."
1526 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1528 do_facet ost1 $LCTL set_param fail_loc=0x1611
# The 1 MiB write and the osc lock cancel both happen before the fail_loc is
# cleared.
1529 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1530 cancel_lru_locks osc
1531 do_facet ost1 $LCTL set_param fail_loc=0
1533 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1534 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1536 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1537 mdd.${MDT_DEV}.lfsck_layout |
1538 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1540 error "(2) unexpected status"
# Only f0 was written under the fail_loc, so one repair is expected.
1543 local repaired=$($SHOW_LAYOUT |
1544 awk '/^repaired_unmatched_pair/ { print $2 }')
1545 [ $repaired -eq 1 ] ||
1546 error "(3) Fail to repair unmatched pair: $repaired"
1548 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# test_15b: write 'guard' normally, then write f0 while fail_loc 0x1612 is
# set on ost1; run the layout LFSCK and require exactly 1
# 'repaired_unmatched_pair'.
1552 echo "If the OST-object referenced by the MDT-object back points"
1553 echo "to other MDT-object that doesn't recognize the OST-object,"
1554 echo "then the LFSCK should repair it to back point to the right"
1555 echo "MDT-object (the first one)."
1558 check_mount_and_prep
1559 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# 'guard' is written before any failure injection.
1560 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1561 cancel_lru_locks osc
1563 echo "Inject failure stub to make the OST-object to back point to"
1564 echo "other MDT-object"
1566 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1567 do_facet ost1 $LCTL set_param fail_loc=0x1612
1568 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1569 cancel_lru_locks osc
1570 do_facet ost1 $LCTL set_param fail_loc=0
1572 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1573 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1575 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1576 mdd.${MDT_DEV}.lfsck_layout |
1577 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1579 error "(2) unexpected status"
# Only f0 was written under the fail_loc, so one repair is expected.
1582 local repaired=$($SHOW_LAYOUT |
1583 awk '/^repaired_unmatched_pair/ { print $2 }')
1584 [ $repaired -eq 1 ] ||
1585 error "(3) Fail to repair unmatched pair: $repaired"
1587 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# test_15c: start a delayed directory migration in the background and run
# the layout LFSCK on all targets while it is in flight; expects 1
# 'repaired_unmatched_pair' and 0 'repaired_multiple_referenced'.
# Skipped when MDSCOUNT < 2, or when the MDS version is >= 2.7.55 (LU-6437).
1590 [ $MDSCOUNT -lt 2 ] &&
1591 skip "We need at least 2 MDSes for this test" && return
1593 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1594 skip "Skip the test after 2.7.55 see LU-6437" && return
1597 echo "According to current metadata migration implementation,"
1598 echo "before the old MDT-object is removed, both the new MDT-object"
1599 echo "and old MDT-object will reference the same LOV layout. Then if"
1600 echo "the layout LFSCK finds the new MDT-object by race, it will"
1601 echo "regard related OST-object(s) as multiple referenced case, and"
1602 echo "will try to create new OST-object(s) for the new MDT-object."
1603 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1604 echo "MDT-object before confirm the multiple referenced case."
1607 check_mount_and_prep
# a1 is created with 'lfs mkdir -i 1' and holds one 1 MiB file.
1608 $LFS mkdir -i 1 $DIR/$tdir/a1
1609 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1610 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1611 cancel_lru_locks osc
1613 echo "Inject failure stub on MDT1 to delay the migration"
1615 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1616 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1617 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
# The migration is started in the background so the LFSCK can run alongside.
1618 $LFS migrate -m 0 $DIR/$tdir/a1 &
1621 echo "Trigger layout LFSCK to race with the migration"
1622 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1624 for k in $(seq $MDSCOUNT); do
1625 # The LFSCK status query interval is 30 seconds. In case some
1626 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough
1627 # to guarantee the status is synced up.
1628 wait_update_facet mds${k} "$LCTL get_param -n \
1629 mdd.$(facet_svc mds${k}).lfsck_layout |
1630 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1631 error "(2) MDS${k} is not the expected 'completed'"
1634 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1635 local repaired=$($SHOW_LAYOUT |
1636 awk '/^repaired_unmatched_pair/ { print $2 }')
1637 [ $repaired -eq 1 ] ||
1638 error "(3) Fail to repair unmatched pair: $repaired"
# No multiple-reference repair may have been made.
1640 repaired=$($SHOW_LAYOUT |
1641 awk '/^repaired_multiple_referenced/ { print $2 }')
1642 [ $repaired -eq 0 ] ||
1643 error "(4) Unexpectedly repaird multiple references: $repaired"
1645 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# test_16: chown a file while fail_loc 0x1613 is set on the MDS, run the
# layout LFSCK, and require exactly 1 'repaired_inconsistent_owner'.
1649 echo "If the OST-object's owner information does not match the owner"
1650 echo "information stored in the MDT-object, then the LFSCK trust the"
1651 echo "MDT-object and update the OST-object's owner information."
1654 check_mount_and_prep
1655 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1656 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1657 cancel_lru_locks osc
1659 echo "Inject failure stub to skip OST-object owner changing"
1660 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1661 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
# The chown runs while the fail_loc is still set.
1662 chown 1.1 $DIR/$tdir/f0
1663 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1665 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1668 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1670 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1671 mdd.${MDT_DEV}.lfsck_layout |
1672 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1674 error "(2) unexpected status"
# Only f0 was chowned under the fail_loc, so one repair is expected.
1677 local repaired=$($SHOW_LAYOUT |
1678 awk '/^repaired_inconsistent_owner/ { print $2 }')
1679 [ $repaired -eq 1 ] ||
1680 error "(3) Fail to repair inconsistent owner: $repaired"
1682 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# test_17: with fail_loc 0x1614 set on the MDS, write 'guard' and create f0;
# f0 is then expected to show guard's size. After the layout LFSCK reports
# 1 'repaired_multiple_referenced', writing f0 must leave guard's size
# unchanged.
1686 echo "If more than one MDT-objects reference the same OST-object,"
1687 echo "and the OST-object only recognizes one MDT-object, then the"
1688 echo "LFSCK should create new OST-objects for such non-recognized"
1692 check_mount_and_prep
1693 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1695 echo "Inject failure stub to make two MDT-objects to refernce"
1696 echo "the OST-object"
1698 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1699 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1701 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1702 cancel_lru_locks osc
# Creates a single file, f0, while the fail_loc is still set.
1704 createmany -o $DIR/$tdir/f 1
1706 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1708 cancel_lru_locks mdc
1709 cancel_lru_locks osc
1711 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
# f0 was never written, yet it must show the 1 MiB written to 'guard'.
1712 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1713 [ $size -eq 1048576 ] ||
1714 error "(1) f0 (wrong) size should be 1048576, but got $size"
1716 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1719 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1721 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1722 mdd.${MDT_DEV}.lfsck_layout |
1723 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1725 error "(3) unexpected status"
1728 local repaired=$($SHOW_LAYOUT |
1729 awk '/^repaired_multiple_referenced/ { print $2 }')
1730 [ $repaired -eq 1 ] ||
1731 error "(4) Fail to repair multiple references: $repaired"
1733 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
# Writing 2 MiB to f0 must not change guard's 1 MiB size.
1734 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1735 error "(5) Fail to write f0."
1736 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1737 [ $size -eq 1048576 ] ||
1738 error "(6) guard size should be 1048576, but got $size"
1740 run_test 17 "LFSCK can repair multiple references"
# Add 'cache' to the debug setting on the node running this script (stdout
# is discarded). NOTE(review): presumably wanted for diagnosing the tests
# that follow -- confirm.
1742 $LCTL set_param debug=+cache > /dev/null
# test_18a: chown files while fail_loc 0x1615 is set on the MDS(es) so their
# reported size becomes wrong, then run the layout LFSCK with -r -o and
# check the 'repaired_orphan' counts and that the original size is reported
# again. The a2/f2 parts run only when MDSCOUNT >= 2.
1746 echo "The target MDT-object is there, but related stripe information"
1747 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1748 echo "layout EA entries."
1751 check_mount_and_prep
1752 $LFS mkdir -i 0 $DIR/$tdir/a1
1753 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1754 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# Column 6 of 'ls -il' is the file size; it is the reference for all the
# size comparisons below (f2 is written with the same dd size as f1).
1756 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1758 $LFS path2fid $DIR/$tdir/a1/f1
1759 $LFS getstripe $DIR/$tdir/a1/f1
1761 if [ $MDSCOUNT -ge 2 ]; then
1762 $LFS mkdir -i 1 $DIR/$tdir/a2
1763 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1764 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1765 $LFS path2fid $DIR/$tdir/a2/f2
1766 $LFS getstripe $DIR/$tdir/a2/f2
1769 cancel_lru_locks osc
1771 echo "Inject failure, to make the MDT-object lost its layout EA"
1772 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1773 do_facet mds1 $LCTL set_param fail_loc=0x1615
# Each chown runs while the fail_loc is set on the corresponding MDS.
1774 chown 1.1 $DIR/$tdir/a1/f1
1776 if [ $MDSCOUNT -ge 2 ]; then
1777 do_facet mds2 $LCTL set_param fail_loc=0x1615
1778 chown 1.1 $DIR/$tdir/a2/f2
1784 do_facet mds1 $LCTL set_param fail_loc=0
1785 if [ $MDSCOUNT -ge 2 ]; then
1786 do_facet mds2 $LCTL set_param fail_loc=0
1789 cancel_lru_locks mdc
1790 cancel_lru_locks osc
1792 echo "The file size should be incorrect since layout EA is lost"
1793 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1794 [ "$cur_size" != "$saved_size" ] ||
1795 error "(1) Expect incorrect file1 size"
1797 if [ $MDSCOUNT -ge 2 ]; then
1798 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1799 [ "$cur_size" != "$saved_size" ] ||
1800 error "(2) Expect incorrect file2 size"
1803 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1804 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
1806 for k in $(seq $MDSCOUNT); do
1807 # The LFSCK status query interval is 30 seconds. In case some
1808 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough
1809 # to guarantee the status is synced up.
1810 wait_update_facet mds${k} "$LCTL get_param -n \
1811 mdd.$(facet_svc mds${k}).lfsck_layout |
1812 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1813 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must report 'completed' as well (checked once, without waiting).
1816 for k in $(seq $OSTCOUNT); do
1817 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1818 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1819 awk '/^status/ { print $2 }')
1820 [ "$cur_status" == "completed" ] ||
1821 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counts: 1 on mds1, 2 on mds2.
1824 local repaired=$(do_facet mds1 $LCTL get_param -n \
1825 mdd.$(facet_svc mds1).lfsck_layout |
1826 awk '/^repaired_orphan/ { print $2 }')
1827 [ $repaired -eq 1 ] ||
1828 error "(6.1) Expect 1 fixed on mds1, but got: $repaired"
1830 if [ $MDSCOUNT -ge 2 ]; then
1831 repaired=$(do_facet mds2 $LCTL get_param -n \
1832 mdd.$(facet_svc mds2).lfsck_layout |
1833 awk '/^repaired_orphan/ { print $2 }')
1834 [ $repaired -eq 2 ] ||
1835 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
1838 $LFS path2fid $DIR/$tdir/a1/f1
1839 $LFS getstripe $DIR/$tdir/a1/f1
1841 if [ $MDSCOUNT -ge 2 ]; then
1842 $LFS path2fid $DIR/$tdir/a2/f2
1843 $LFS getstripe $DIR/$tdir/a2/f2
1846 echo "The file size should be correct after layout LFSCK scanning"
1847 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1848 [ "$cur_size" == "$saved_size" ] ||
1849 error "(7) Expect file1 size $saved_size, but got $cur_size"
1851 if [ $MDSCOUNT -ge 2 ]; then
1852 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1853 [ "$cur_size" == "$saved_size" ] ||
1854 error "(8) Expect file2 size $saved_size, but got $cur_size"
1857 run_test 18a "Find out orphan OST-object and repair it (1)"
# test_18b: remove files while fail_loc 0x1616 is set on the MDS(es), run
# the layout LFSCK with -r -o, then move the entries named <fid>-R-0 from
# .lustre/lost+found/MDTxxxx back into place and check that they report the
# original size. The a2/f2 parts run only when MDSCOUNT >= 2.
1861 echo "The target MDT-object is lost. The LFSCK should re-create the"
1862 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
1863 echo "can move it back to normal namespace manually."
1866 check_mount_and_prep
1867 $LFS mkdir -i 0 $DIR/$tdir/a1
1868 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1869 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# Record size and FID now; the FID forms the lost+found entry name later.
1870 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1871 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
1873 $LFS getstripe $DIR/$tdir/a1/f1
1875 if [ $MDSCOUNT -ge 2 ]; then
1876 $LFS mkdir -i 1 $DIR/$tdir/a2
1877 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1878 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1879 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
1881 $LFS getstripe $DIR/$tdir/a2/f2
1884 cancel_lru_locks osc
1886 echo "Inject failure, to simulate the case of missing the MDT-object"
1887 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1888 do_facet mds1 $LCTL set_param fail_loc=0x1616
# Each rm runs while the fail_loc is set on the corresponding MDS.
1889 rm -f $DIR/$tdir/a1/f1
1891 if [ $MDSCOUNT -ge 2 ]; then
1892 do_facet mds2 $LCTL set_param fail_loc=0x1616
1893 rm -f $DIR/$tdir/a2/f2
1899 do_facet mds1 $LCTL set_param fail_loc=0
1900 if [ $MDSCOUNT -ge 2 ]; then
1901 do_facet mds2 $LCTL set_param fail_loc=0
1904 cancel_lru_locks mdc
1905 cancel_lru_locks osc
1907 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1908 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1910 for k in $(seq $MDSCOUNT); do
1911 # The LFSCK status query interval is 30 seconds. In case some
1912 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough
1913 # to guarantee the status is synced up.
1914 wait_update_facet mds${k} "$LCTL get_param -n \
1915 mdd.$(facet_svc mds${k}).lfsck_layout |
1916 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1917 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report 'completed' as well (checked once, without waiting).
1920 for k in $(seq $OSTCOUNT); do
1921 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1922 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1923 awk '/^status/ { print $2 }')
1924 [ "$cur_status" == "completed" ] ||
1925 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counts: 1 on mds1, 2 on mds2.
1928 local repaired=$(do_facet mds1 $LCTL get_param -n \
1929 mdd.$(facet_svc mds1).lfsck_layout |
1930 awk '/^repaired_orphan/ { print $2 }')
1931 [ $repaired -eq 1 ] ||
1932 error "(4.1) Expect 1 fixed on mds1, but got: $repaired"
1934 if [ $MDSCOUNT -ge 2 ]; then
1935 repaired=$(do_facet mds2 $LCTL get_param -n \
1936 mdd.$(facet_svc mds2).lfsck_layout |
1937 awk '/^repaired_orphan/ { print $2 }')
1938 [ $repaired -eq 2 ] ||
1939 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
1942 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
1943 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
1944 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
1946 if [ $MDSCOUNT -ge 2 ]; then
1947 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
1948 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
1951 $LFS path2fid $DIR/$tdir/a1/f1
1952 $LFS getstripe $DIR/$tdir/a1/f1
1954 if [ $MDSCOUNT -ge 2 ]; then
1955 $LFS path2fid $DIR/$tdir/a2/f2
1956 $LFS getstripe $DIR/$tdir/a2/f2
1959 echo "The file size should be correct after layout LFSCK scanning"
1960 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1961 [ "$cur_size" == "$saved_size" ] ||
1962 error "(7) Expect file1 size $saved_size, but got $cur_size"
1964 if [ $MDSCOUNT -ge 2 ]; then
1965 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1966 [ "$cur_size" == "$saved_size" ] ||
1967 error "(8) Expect file2 size $saved_size, but got $cur_size"
1970 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test 18c body: orphan OST-object whose parent FID is also missing.
# Scenario (see the banner echoed below): the MDT-object is lost and the
# OST-object carries no parent FID, so layout LFSCK is expected to re-create
# the MDT-object with a new FID under .lustre/lost+found/MDTxxxx.
# Verifies: layout LFSCK reaches "completed" on every MDT and OST,
# repaired_orphan equals $expected on mds1 and 0 on mds2, and *-N-* stubs
# appear under lost+found/MDT0000.
1974 echo "The target MDT-object is lost, and the OST-object FID is missing."
1975 echo "The LFSCK should re-create the MDT-object with new FID under the "
1976 echo "directory .lustre/lost+found/MDTxxxx."
1979 check_mount_and_prep
1980 $LFS mkdir -i 0 $DIR/$tdir/a1
1981 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1983 echo "Inject failure, to simulate the case of missing parent FID"
1984 #define OBD_FAIL_LFSCK_NOPFID 0x1617
1985 do_facet ost1 $LCTL set_param fail_loc=0x1617
# The files below are written while the fail_loc above is set on ost1.
1987 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1988 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDTs, repeat the setup in a directory created with -i 1.
1990 if [ $MDSCOUNT -ge 2 ]; then
1991 $LFS mkdir -i 1 $DIR/$tdir/a2
1992 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a2
1993 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1994 $LFS getstripe $DIR/$tdir/a2/f2
1997 cancel_lru_locks osc
1999 echo "Inject failure, to simulate the case of missing the MDT-object"
2000 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2001 do_facet mds1 $LCTL set_param fail_loc=0x1616
2002 rm -f $DIR/$tdir/a1/f1
2004 if [ $MDSCOUNT -ge 2 ]; then
2005 do_facet mds2 $LCTL set_param fail_loc=0x1616
2006 rm -f $DIR/$tdir/a2/f2
# Clear the injected failure on the MDS side before running LFSCK.
2012 do_facet mds1 $LCTL set_param fail_loc=0
2013 if [ $MDSCOUNT -ge 2 ]; then
2014 do_facet mds2 $LCTL set_param fail_loc=0
2017 cancel_lru_locks mdc
2018 cancel_lru_locks osc
2020 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2021 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2023 for k in $(seq $MDSCOUNT); do
2024 # The LFSCK status query interval is 30 seconds. For the case
2025 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2026 # time to guarantee the status sync up.
2027 wait_update_facet mds${k} "$LCTL get_param -n \
2028 mdd.$(facet_svc mds${k}).lfsck_layout |
2029 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2030 error "(2) MDS${k} is not the expected 'completed'"
# The MDTs are polled above; each OST status is sampled once here.
2033 for k in $(seq $OSTCOUNT); do
2034 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2035 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2036 awk '/^status/ { print $2 }')
2037 [ "$cur_status" == "completed" ] ||
2038 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2041 if [ $MDSCOUNT -ge 2 ]; then
2047 local repaired=$(do_facet mds1 $LCTL get_param -n \
2048 mdd.$(facet_svc mds1).lfsck_layout |
2049 awk '/^repaired_orphan/ { print $2 }')
2050 [ $repaired -eq $expected ] ||
2051 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2053 if [ $MDSCOUNT -ge 2 ]; then
2054 repaired=$(do_facet mds2 $LCTL get_param -n \
2055 mdd.$(facet_svc mds2).lfsck_layout |
2056 awk '/^repaired_orphan/ { print $2 }')
2057 [ $repaired -eq 0 ] ||
2058 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2061 ls -ail $MOUNT/.lustre/lost+found/
2063 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2064 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
# The -name pattern is quoted so the shell cannot glob-expand it against the
# current directory before find receives it.
2065 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-N-*")
2067 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2070 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2071 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2072 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Same quoting as above: find, not the shell, must interpret the pattern.
2074 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-N-*")
2075 [ ! -z "$cname" ] ||
2076 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2078 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d body: orphan OST-object whose layout EA slot was re-occupied.
# Scenario (see the banner echoed below): while repairing a dangling
# reference, a new OST-object was created in the slot that belongs to an
# orphan; the new object has never been modified, so LFSCK is expected to
# put the orphan back.
# Verifies: f2's size is wrong before LFSCK, layout LFSCK completes on all
# MDTs and OSTs, exactly one orphan is repaired, and f2's size matches the
# size saved before the failure was injected.
2082 echo "The target MDT-object layout EA slot is occpuied by some new"
2083 echo "created OST-object when repair dangling reference case. Such"
2084 echo "conflict OST-object has never been modified. Then when found"
2085 echo "the orphan OST-object, LFSCK will replace it with the orphan"
2089 check_mount_and_prep
2091 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2092 echo "guard" > $DIR/$tdir/a1/f1
2093 echo "foo" > $DIR/$tdir/a1/f2
# Remember f2's size (6th column of "ls -il") for the before/after checks.
2094 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2095 $LFS path2fid $DIR/$tdir/a1/f1
2096 $LFS getstripe $DIR/$tdir/a1/f1
2097 $LFS path2fid $DIR/$tdir/a1/f2
2098 $LFS getstripe $DIR/$tdir/a1/f2
2099 cancel_lru_locks osc
2101 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2102 echo "to reference the same OST-object (which is f1's OST-obejct)."
2103 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2104 echo "dangling reference case, but f2's old OST-object is there."
2107 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2108 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
# Use the POSIX "owner:group" separator; the dotted form is a legacy
# extension that newer chown implementations warn about.
2109 chown 1:1 $DIR/$tdir/a1/f2
2110 rm -f $DIR/$tdir/a1/f1
2113 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2115 echo "stopall to cleanup object cache"
2118 setupall > /dev/null
2120 echo "The file size should be incorrect since dangling referenced"
2121 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2122 [ "$cur_size" != "$saved_size" ] ||
2123 error "(1) Expect incorrect file2 size"
2125 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2126 do_facet $SINGLEMDS $LCTL set_param fail_val=5 fail_loc=0x1602
2128 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2129 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait until mds1 reports "scanning-phase2" before clearing the delay.
2131 wait_update_facet mds1 "$LCTL get_param -n \
2132 mdd.$(facet_svc mds1).lfsck_layout |
2133 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2134 error "(3.0) MDS1 is not the expected 'scanning-phase2'"
2136 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2138 for k in $(seq $MDSCOUNT); do
2139 # The LFSCK status query interval is 30 seconds. For the case
2140 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2141 # time to guarantee the status sync up.
2142 wait_update_facet mds${k} "$LCTL get_param -n \
2143 mdd.$(facet_svc mds${k}).lfsck_layout |
2144 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2145 error "(3) MDS${k} is not the expected 'completed'"
2148 for k in $(seq $OSTCOUNT); do
2149 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2150 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2151 awk '/^status/ { print $2 }')
2152 [ "$cur_status" == "completed" ] ||
2153 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
2156 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2157 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2158 awk '/^repaired_orphan/ { print $2 }')
2159 [ $repaired -eq 1 ] ||
2160 error "(5) Expect 1 orphan has been fixed, but got: $repaired"
2162 echo "The file size should be correct after layout LFSCK scanning"
2163 cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2164 [ "$cur_size" == "$saved_size" ] ||
2165 error "(6) Expect file2 size $saved_size, but got $cur_size"
2167 echo "The LFSCK should find back the original data."
2168 cat $DIR/$tdir/a1/f2
2169 $LFS path2fid $DIR/$tdir/a1/f2
2170 $LFS getstripe $DIR/$tdir/a1/f2
2172 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e body: orphan OST-object whose layout EA slot was re-occupied by
# an object that HAS been modified since.
# Scenario (see the banner echoed below): to keep the new data, LFSCK is
# expected to create a separate file that references the old orphan
# OST-object instead of replacing the conflicting one.
# Verifies: f2's size is wrong before LFSCK, layout LFSCK completes on all
# MDTs and OSTs, exactly one orphan is repaired, a *-C-* stub exists under
# lost+found/MDT0000, and that stub has the originally saved f2 size.
2176 echo "The target MDT-object layout EA slot is occpuied by some new"
2177 echo "created OST-object when repair dangling reference case. Such"
2178 echo "conflict OST-object has been modified by others. To keep the"
2179 echo "new data, the LFSCK will create a new file to refernece this"
2180 echo "old orphan OST-object."
2183 check_mount_and_prep
2185 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2186 echo "guard" > $DIR/$tdir/a1/f1
2187 echo "foo" > $DIR/$tdir/a1/f2
# Remember f2's size (6th column of "ls -il") for the later comparisons.
2188 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2189 $LFS path2fid $DIR/$tdir/a1/f1
2190 $LFS getstripe $DIR/$tdir/a1/f1
2191 $LFS path2fid $DIR/$tdir/a1/f2
2192 $LFS getstripe $DIR/$tdir/a1/f2
2193 cancel_lru_locks osc
2195 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2196 echo "to reference the same OST-object (which is f1's OST-obejct)."
2197 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2198 echo "dangling reference case, but f2's old OST-object is there."
2201 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2202 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
# Use the POSIX "owner:group" separator; the dotted form is a legacy
# extension that newer chown implementations warn about.
2203 chown 1:1 $DIR/$tdir/a1/f2
2204 rm -f $DIR/$tdir/a1/f1
2207 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2209 echo "stopall to cleanup object cache"
2212 setupall > /dev/null
2214 echo "The file size should be incorrect since dangling referenced"
2215 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2216 [ "$cur_size" != "$saved_size" ] ||
2217 error "(1) Expect incorrect file2 size"
2219 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2220 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2222 start_full_debug_logging
2224 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2225 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait until mds1 reports "scanning-phase2"; f2 is modified only after that.
2227 wait_update_facet mds1 "$LCTL get_param -n \
2228 mdd.$(facet_svc mds1).lfsck_layout |
2229 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2230 error "(3) MDS1 is not the expected 'scanning-phase2'"
2232 # to guarantee all updates are synced.
2236 echo "Write new data to f2 to modify the new created OST-object."
2237 echo "dummy" >> $DIR/$tdir/a1/f2
2239 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2241 for k in $(seq $MDSCOUNT); do
2242 # The LFSCK status query interval is 30 seconds. For the case
2243 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2244 # time to guarantee the status sync up.
2245 wait_update_facet mds${k} "$LCTL get_param -n \
2246 mdd.$(facet_svc mds${k}).lfsck_layout |
2247 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2248 error "(4) MDS${k} is not the expected 'completed'"
2251 for k in $(seq $OSTCOUNT); do
2252 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2253 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2254 awk '/^status/ { print $2 }')
2255 [ "$cur_status" == "completed" ] ||
2256 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2259 stop_full_debug_logging
2261 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2262 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2263 awk '/^repaired_orphan/ { print $2 }')
2264 [ $repaired -eq 1 ] ||
2265 error "(6) Expect 1 orphan has been fixed, but got: $repaired"
2267 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2268 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2269 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The -name pattern is quoted so the shell cannot glob-expand it against the
# current directory before find receives it.
2271 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*")
2272 [ ! -z "$cname" ] ||
2273 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2275 echo "The stub file should keep the original f2 data"
2276 cur_size=$(ls -il $cname | awk '{ print $6 }')
2277 [ "$cur_size" == "$saved_size" ] ||
2278 error "(9) Expect file2 size $saved_size, but got $cur_size"
2281 $LFS path2fid $cname
2282 $LFS getstripe $cname
2284 echo "The f2 should contains new data."
2285 cat $DIR/$tdir/a1/f2
2286 $LFS path2fid $DIR/$tdir/a1/f2
2287 $LFS getstripe $DIR/$tdir/a1/f2
2289 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f body: a failing OST must be skipped when handling orphans.
# Skipped unless at least 2 OSTs are configured.
# Scenario (see the banner echoed below): MDT-objects are lost; OST0 fails
# to handle MDT0's LFSCK request during first-stage scanning, so orphans on
# that OST should be skipped while the other OSTs are handled normally.
# First run: MDTs and ost1 must end "partial", ost2 "completed", and each
# MDT must report repaired_orphan == 1. A second run without the injected
# failure must end "completed" everywhere with repaired_orphan == 2.
2292 [ $OSTCOUNT -lt 2 ] &&
2293 skip "The test needs at least 2 OSTs" && return
2296 echo "The target MDT-object is lost. The LFSCK should re-create the"
2297 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2298 echo "to verify some OST-object(s) during the first stage-scanning,"
2299 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2300 echo "should not be affected."
2303 check_mount_and_prep
# a1: single-stripe files; a2: two-stripe file. Both created with -i 0.
2304 $LFS mkdir -i 0 $DIR/$tdir/a1
2305 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2306 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2307 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2308 $LFS mkdir -i 0 $DIR/$tdir/a2
2309 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2310 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2311 $LFS getstripe $DIR/$tdir/a1/f1
2312 $LFS getstripe $DIR/$tdir/a2/f2
# With two or more MDTs, mirror the same layout in a3/a4 created with -i 1.
2314 if [ $MDSCOUNT -ge 2 ]; then
2315 $LFS mkdir -i 1 $DIR/$tdir/a3
2316 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a3
2317 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2318 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2319 $LFS mkdir -i 1 $DIR/$tdir/a4
2320 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2321 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2322 $LFS getstripe $DIR/$tdir/a3/f3
2323 $LFS getstripe $DIR/$tdir/a4/f4
2326 cancel_lru_locks osc
2328 echo "Inject failure, to simulate the case of missing the MDT-object"
2329 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2330 do_facet mds1 $LCTL set_param fail_loc=0x1616
2331 rm -f $DIR/$tdir/a1/f1
2332 rm -f $DIR/$tdir/a2/f2
2334 if [ $MDSCOUNT -ge 2 ]; then
2335 do_facet mds2 $LCTL set_param fail_loc=0x1616
2336 rm -f $DIR/$tdir/a3/f3
2337 rm -f $DIR/$tdir/a4/f4
# Clear the "lost MDT-object" failure before injecting the network failure.
2343 do_facet mds1 $LCTL set_param fail_loc=0
2344 if [ $MDSCOUNT -ge 2 ]; then
2345 do_facet mds2 $LCTL set_param fail_loc=0
2348 cancel_lru_locks mdc
2349 cancel_lru_locks osc
2351 echo "Inject failure, to simulate the OST0 fail to handle"
2352 echo "MDT0 LFSCK request during the first-stage scanning."
2353 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2354 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2356 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2357 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2359 for k in $(seq $MDSCOUNT); do
2360 # The LFSCK status query interval is 30 seconds. For the case
2361 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2362 # time to guarantee the status sync up.
2363 wait_update_facet mds${k} "$LCTL get_param -n \
2364 mdd.$(facet_svc mds${k}).lfsck_layout |
2365 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2366 error "(2) MDS${k} is not the expected 'partial'"
# ost1 is the OST hit by the injected failure, so it must also end "partial".
2369 wait_update_facet ost1 "$LCTL get_param -n \
2370 obdfilter.$(facet_svc ost1).lfsck_layout |
2371 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2372 error "(3) OST1 is not the expected 'partial'"
# ost2 is expected to be unaffected and to finish normally.
2375 wait_update_facet ost2 "$LCTL get_param -n \
2376 obdfilter.$(facet_svc ost2).lfsck_layout |
2377 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2378 error "(4) OST2 is not the expected 'completed'"
2381 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# NOTE(review): the error messages below print "mds{1}" / "mds{2}" literally;
# this looks like it was meant to read "mds1" / "mds2" - confirm.
2383 local repaired=$(do_facet mds1 $LCTL get_param -n \
2384 mdd.$(facet_svc mds1).lfsck_layout |
2385 awk '/^repaired_orphan/ { print $2 }')
2386 [ $repaired -eq 1 ] ||
2387 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2389 if [ $MDSCOUNT -ge 2 ]; then
2390 repaired=$(do_facet mds2 $LCTL get_param -n \
2391 mdd.$(facet_svc mds2).lfsck_layout |
2392 awk '/^repaired_orphan/ { print $2 }')
2393 [ $repaired -eq 1 ] ||
2394 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second pass, with no failure injected: everything should now complete.
2397 echo "Trigger layout LFSCK on all devices again to cleanup"
2398 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2400 for k in $(seq $MDSCOUNT); do
2401 # The LFSCK status query interval is 30 seconds. For the case
2402 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2403 # time to guarantee the status sync up.
2404 wait_update_facet mds${k} "$LCTL get_param -n \
2405 mdd.$(facet_svc mds${k}).lfsck_layout |
2406 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2407 error "(8) MDS${k} is not the expected 'completed'"
2410 for k in $(seq $OSTCOUNT); do
2411 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2412 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2413 awk '/^status/ { print $2 }')
2414 [ "$cur_status" == "completed" ] ||
2415 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# "repaired" was already declared local above; this re-declares and reassigns.
2419 local repaired=$(do_facet mds1 $LCTL get_param -n \
2420 mdd.$(facet_svc mds1).lfsck_layout |
2421 awk '/^repaired_orphan/ { print $2 }')
2422 [ $repaired -eq 2 ] ||
2423 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2425 if [ $MDSCOUNT -ge 2 ]; then
2426 repaired=$(do_facet mds2 $LCTL get_param -n \
2427 mdd.$(facet_svc mds2).lfsck_layout |
2428 awk '/^repaired_orphan/ { print $2 }')
2429 [ $repaired -eq 2 ] ||
2430 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2433 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Top-level statement executed between tests: adjusts the local debug mask
# with "debug=-cache" (presumably removes the "cache" flag - confirm against
# lctl documentation); its output is discarded.
2435 $LCTL set_param debug=-cache > /dev/null
# Test 19a body: OST-object inconsistency self detection.
# Writes two single-stripe files, turns on lfsck_verify_pfid on OST0000, then
# sets fail_loc 0x1619 on the local node so the client sends a wrong parent
# FID. The read of a0 is then expected to fail.
2438 check_mount_and_prep
2439 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2441 echo "foo" > $DIR/$tdir/a0
2442 echo "guard" > $DIR/$tdir/a1
2443 cancel_lru_locks osc
2445 echo "Inject failure, then client will offer wrong parent FID when read"
2446 do_facet ost1 $LCTL set_param -n \
2447 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2448 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
# No do_facet here: the fail_loc is set on the node running this script.
2449 $LCTL set_param fail_loc=0x1619
2451 echo "Read RPC with wrong parent FID should be denied"
# A successful cat is the failure case for this test.
2452 cat $DIR/$tdir/a0 && error "(3) Read should be denied!"
2453 $LCTL set_param fail_loc=0
2455 run_test 19a "OST-object inconsistency self detect"
# Test 19b body: OST-object inconsistency self repair.
# Creates f0 while fail_loc 0x1611 is set on ost1, so that (per the banner)
# the OST-object points back to a non-existent MDT-object. It first checks
# that the "repaired" counter read from lfsck_verify_pfid is 0, then enables
# lfsck_verify_pfid, reads the file, and expects the counter to become 1.
2458 check_mount_and_prep
2459 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2461 echo "Inject failure stub to make the OST-object to back point to"
2462 echo "non-exist MDT-object"
2464 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2465 do_facet ost1 $LCTL set_param fail_loc=0x1611
2466 echo "foo" > $DIR/$tdir/f0
2467 cancel_lru_locks osc
2468 do_facet ost1 $LCTL set_param fail_loc=0
2470 echo "Nothing should be fixed since self detect and repair is disabled"
# Baseline: the counter is read before lfsck_verify_pfid is switched on.
2471 local repaired=$(do_facet ost1 $LCTL get_param -n \
2472 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2473 awk '/^repaired/ { print $2 }')
2474 [ $repaired -eq 0 ] ||
2475 error "(1) Expected 0 repaired, but got $repaired"
2477 echo "Read RPC with right parent FID should be accepted,"
2478 echo "and cause parent FID on OST to be fixed"
2480 do_facet ost1 $LCTL set_param -n \
2481 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
# This read is the operation expected to trigger the repair.
2482 cat $DIR/$tdir/f0 || error "(2) Read should not be denied!"
2484 repaired=$(do_facet ost1 $LCTL get_param -n \
2485 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2486 awk '/^repaired/ { print $2 }')
2487 [ $repaired -eq 1 ] ||
2488 error "(3) Expected 1 repaired, but got $repaired"
2490 run_test 19b "OST-object inconsistency self repair"
# Test 20 body: orphans re-created with a dummy LOV EA slot ("LOV EA hole").
# Skipped unless at least 2 OSTs are configured.
# Setup: files f0..f2 (plus f3 when OSTCOUNT > 2) are written into a striped
# directory, then removed under injected failures so that f0 loses only its
# MDT-object while f1/f2/f3 also lose OST-object 0/1/2 respectively.
# After layout LFSCK, each re-created ${fidN}-R-0 file under
# .lustre/lost+found/MDT0000 is checked for: the hole flag in its layout
# pattern, its stripe count, its size, and which read/write/chown/touch/rm
# operations succeed or fail.
2493 [ $OSTCOUNT -lt 2 ] &&
2494 skip "The test needs at least 2 OSTs" && return
2497 echo "The target MDT-object and some of its OST-object are lost."
2498 echo "The LFSCK should find out the left OST-objects and re-create"
2499 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2500 echo "with the partial OST-objects (LOV EA hole)."
2502 echo "New client can access the file with LOV EA hole via normal"
2503 echo "system tools or commands without crash the system."
2505 echo "For old client, even though it cannot access the file with"
2506 echo "LOV EA hole, it should not cause the system crash."
2509 check_mount_and_prep
2510 $LFS mkdir -i 0 $DIR/$tdir/a1
# Stripe count is 3 when more than 2 OSTs exist; a 2-stripe setup follows.
2511 if [ $OSTCOUNT -gt 2 ]; then
2512 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
2515 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
2519 # 256 blocks on the stripe0.
2520 # 1 block on the stripe1 for 2 OSTs case.
2521 # 256 blocks on the stripe1 for other cases.
2522 # 1 block on the stripe2 if OSTs > 2
2523 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2524 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2525 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# FIDs are saved now because the lost+found names checked later embed them.
2527 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2528 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2529 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2532 $LFS getstripe $DIR/$tdir/a1/f0
2534 $LFS getstripe $DIR/$tdir/a1/f1
2536 $LFS getstripe $DIR/$tdir/a1/f2
2538 if [ $OSTCOUNT -gt 2 ]; then
2539 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
2540 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2542 $LFS getstripe $DIR/$tdir/a1/f3
2545 cancel_lru_locks osc
2547 echo "Inject failure..."
2548 echo "To simulate f0 lost MDT-object"
2549 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2550 do_facet mds1 $LCTL set_param fail_loc=0x1616
2551 rm -f $DIR/$tdir/a1/f0
2553 echo "To simulate f1 lost MDT-object and OST-object0"
2554 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
2555 do_facet mds1 $LCTL set_param fail_loc=0x161a
2556 rm -f $DIR/$tdir/a1/f1
# fail_loc stays at 0x161a; only fail_val changes for f2 and f3 (per the
# echo text, it selects which OST-object is lost).
2558 echo "To simulate f2 lost MDT-object and OST-object1"
2559 do_facet mds1 $LCTL set_param fail_val=1
2560 rm -f $DIR/$tdir/a1/f2
2562 if [ $OSTCOUNT -gt 2 ]; then
2563 echo "To simulate f3 lost MDT-object and OST-object2"
2564 do_facet mds1 $LCTL set_param fail_val=2
2565 rm -f $DIR/$tdir/a1/f3
# The client is unmounted for the LFSCK run and mounted again before the
# lost+found checks.
2568 umount_client $MOUNT
2571 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2573 echo "Inject failure to slow down the LFSCK on OST0"
2574 #define OBD_FAIL_LFSCK_DELAY5 0x161b
2575 do_facet ost1 $LCTL set_param fail_loc=0x161b
2577 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2578 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2581 do_facet ost1 $LCTL set_param fail_loc=0
2583 for k in $(seq $MDSCOUNT); do
2584 # The LFSCK status query interval is 30 seconds. For the case
2585 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2586 # time to guarantee the status sync up.
2587 wait_update_facet mds${k} "$LCTL get_param -n \
2588 mdd.$(facet_svc mds${k}).lfsck_layout |
2589 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2590 error "(2) MDS${k} is not the expected 'completed'"
2593 for k in $(seq $OSTCOUNT); do
2594 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2595 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2596 awk '/^status/ { print $2 }')
2597 [ "$cur_status" == "completed" ] ||
2598 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan count: 9 with more than 2 OSTs; the check for 4
# is presumably the 2-OST branch - confirm.
2601 local repaired=$(do_facet mds1 $LCTL get_param -n \
2602 mdd.$(facet_svc mds1).lfsck_layout |
2603 awk '/^repaired_orphan/ { print $2 }')
2604 if [ $OSTCOUNT -gt 2 ]; then
2605 [ $repaired -eq 9 ] ||
2606 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
2608 [ $repaired -eq 4 ] ||
2609 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
2612 mount_client $MOUNT || error "(5.0) Fail to start client!"
# Bit tested in the value printed by "lfs getstripe -L" to detect a hole.
2614 LOV_PATTERN_F_HOLE=0x40000000
2617 # ${fid0}-R-0 is the old f0
2619 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
2620 echo "Check $name, which is the old f0"
2622 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
# The getstripe -L output is prefixed with 0x so the arithmetic below reads
# it as a hexadecimal number.
2624 local pattern=0x$($LFS getstripe -L $name)
2625 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2626 error "(5.2) NOT expect pattern flag hole, but got $pattern"
2628 local stripes=$($LFS getstripe -c $name)
2629 if [ $OSTCOUNT -gt 2 ]; then
2630 [ $stripes -eq 3 ] ||
2631 error "(5.3.1) expect the stripe count is 3, but got $stripes"
2633 [ $stripes -eq 2 ] ||
2634 error "(5.3.2) expect the stripe count is 2, but got $stripes"
2637 local size=$(stat $name | awk '/Size:/ { print $2 }')
2638 [ $size -eq $((4096 * $bcount)) ] ||
2639 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
# f0 lost no OST-object, so every ordinary operation must succeed.
2641 cat $name > /dev/null || error "(5.5) cannot read $name"
2643 echo "dummy" >> $name || error "(5.6) cannot write $name"
2645 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
2647 touch $name || error "(5.8) cannot touch $name"
2649 rm -f $name || error "(5.9) cannot unlink $name"
2652 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
2654 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2655 if [ $OSTCOUNT -gt 2 ]; then
2656 echo "Check $name, it contains the old f1's stripe1 and stripe2"
2658 echo "Check $name, it contains the old f1's stripe1"
2661 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
2663 pattern=0x$($LFS getstripe -L $name)
2664 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2665 error "(6.2) expect pattern flag hole, but got $pattern"
2667 stripes=$($LFS getstripe -c $name)
2668 if [ $OSTCOUNT -gt 2 ]; then
2669 [ $stripes -eq 3 ] ||
2670 error "(6.3.1) expect the stripe count is 3, but got $stripes"
2672 [ $stripes -eq 2 ] ||
2673 error "(6.3.2) expect the stripe count is 2, but got $stripes"
2676 size=$(stat $name | awk '/Size:/ { print $2 }')
2677 [ $size -eq $((4096 * $bcount)) ] ||
2678 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
# Stripe0 is the hole here: a plain sequential read must fail.
2680 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# Copy with conv=sync,noerror and count the "Input/output error" lines dd
# reports; 256 are expected, matching the 256 blocks noted for stripe0.
2682 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2683 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2686 [ $failures -eq 256 ] ||
2687 error "(6.6) expect 256 IO failures, but get $failures"
2689 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2690 [ $size -eq $((4096 * $bcount)) ] ||
2691 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Offset 0 falls in the hole (write must fail); seek=300 lands past the
# first 256 blocks, in a stripe that still exists (write must succeed).
2693 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
2694 error "(6.8) write to the LOV EA hole should fail"
2696 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
2697 error "(6.9) write to normal stripe should NOT fail"
2699 echo "foo" >> $name && error "(6.10) append write $name should fail"
2701 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
2703 touch $name || error "(6.12) cannot touch $name"
2705 rm -f $name || error "(6.13) cannot unlink $name"
2708 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
2710 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
2711 if [ $OSTCOUNT -gt 2 ]; then
2712 echo "Check $name, it contains the old f2's stripe0 and stripe2"
2714 echo "Check $name, it contains the old f2's stripe0"
2717 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
2719 pattern=0x$($LFS getstripe -L $name)
2720 stripes=$($LFS getstripe -c $name)
2721 size=$(stat $name | awk '/Size:/ { print $2 }')
# More than 2 OSTs: stripe1 is a hole in the middle of a 3-stripe layout.
2722 if [ $OSTCOUNT -gt 2 ]; then
2723 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2724 error "(7.2.1) expect pattern flag hole, but got $pattern"
2726 [ $stripes -eq 3 ] ||
2727 error "(7.3.1) expect the stripe count is 3, but got $stripes"
2729 [ $size -eq $((4096 * $bcount)) ] ||
2730 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
2732 cat $name > /dev/null &&
2733 error "(7.5.1) normal read $name should fail"
2735 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2736 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2738 [ $failures -eq 256 ] ||
2739 error "(7.6) expect 256 IO failures, but get $failures"
2741 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2742 [ $size -eq $((4096 * $bcount)) ] ||
2743 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
# Reverse of the f1 case: seek=300 now hits the hole, offset 0 is valid.
2745 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
2746 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
2748 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
2749 error "(7.8.1) write to normal stripe should NOT fail"
2751 echo "foo" >> $name &&
2752 error "(7.8.3) append write $name should fail"
2754 chown $RUNAS_ID:$RUNAS_GID $name ||
2755 error "(7.9.1) cannot chown on $name"
2757 touch $name || error "(7.10.1) cannot touch $name"
# Checks labelled x.2 below: no hole flag, a single stripe, 256 blocks
# (presumably the 2-OST branch of the conditional above - confirm).
2759 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2760 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
2762 [ $stripes -eq 1 ] ||
2763 error "(7.3.2) expect the stripe count is 1, but got $stripes"
2766 [ $size -eq $((4096 * (256 + 0))) ] ||
2767 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
2769 cat $name > /dev/null || error "(7.5.2) cannot read $name"
2771 echo "dummy" >> $name || error "(7.8.2) cannot write $name"
2773 chown $RUNAS_ID:$RUNAS_GID $name ||
2774 error "(7.9.2) cannot chown on $name"
2776 touch $name || error "(7.10.2) cannot touch $name"
2779 rm -f $name || error "(7.11) cannot unlink $name"
# f3 only exists when there are more than 2 OSTs; otherwise the test ends.
2781 [ $OSTCOUNT -le 2 ] && return
2784 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
2786 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2787 echo "Check $name, which contains the old f3's stripe0 and stripe1"
2789 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
2791 pattern=0x$($LFS getstripe -L $name)
2792 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2793 error "(8.2) NOT expect pattern flag hole, but got $pattern"
2795 stripes=$($LFS getstripe -c $name)
2796 # LFSCK does not know the old f3 had 3 stripes.
2797 # It only tries to find as much as possible.
2798 # The stripe count depends on the last stripe's offset.
2799 [ $stripes -eq 2 ] ||
2800 error "(8.3) expect the stripe count is 2, but got $stripes"
2802 size=$(stat $name | awk '/Size:/ { print $2 }')
2804 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
2805 error "(8.4) expect the size $((4096 * 512)), but got $size"
2807 cat $name > /dev/null || error "(8.5) cannot read $name"
2809 echo "dummy" >> $name || error "(8.6) cannot write $name"
2811 chown $RUNAS_ID:$RUNAS_GID $name ||
2812 error "(8.7) cannot chown on $name"
2814 touch $name || error "(8.8) cannot touch $name"
2816 rm -f $name || error "(8.9) cannot unlink $name"
2818 run_test 20 "Handle the orphan with dummy LOV EA slot properly"
# Test 21 body: starting LFSCK without "-t" runs all components.
# Skipped when the MDS version is older than 2.5.59.
# Creates 100 files, starts LFSCK on MDT0000 with "-s 1 -r" and no type
# option, then expects both the namespace and the layout component to report
# "scanning-phase1". Finally stops LFSCK, again without naming a type.
2821 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
2822 skip "ignore the test if MDS is older than 2.5.59" && return
2824 check_mount_and_prep
2825 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
# NOTE(review): "-s 1" presumably throttles the scan so it is still in
# phase 1 when the status is sampled below - confirm.
2827 echo "Start all LFSCK components by default (-s 1)"
2828 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
2829 error "Fail to start LFSCK"
2831 echo "namespace LFSCK should be in 'scanning-phase1' status"
2832 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
2833 [ "$STATUS" == "scanning-phase1" ] ||
2834 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
2836 echo "layout LFSCK should be in 'scanning-phase1' status"
2837 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
2838 [ "$STATUS" == "scanning-phase1" ] ||
2839 error "Expect layout 'scanning-phase1', but got '$STATUS'"
2841 echo "Stop all LFSCK components by default"
2842 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
2843 error "Fail to stop LFSCK"
2845 run_test 21 "run all LFSCK components by default"
# Test 22a body: repair a child directory whose ".." entry names a parent
# that no longer exists.
# Skipped unless at least 2 MDSes are configured.
# Creates guard/ and foo/ with -i 1, then creates foo/dummy with -i 0 while
# fail_loc 0x161e is set, so (per the echo text) dummy's ".." refers to
# guard. guard is then removed. Namespace LFSCK must complete, report
# unmatched_pairs_repaired == 1, and leave foo/dummy listable.
2848 [ $MDSCOUNT -lt 2 ] &&
2849 skip "We need at least 2 MDSes for this test" && return
2852 echo "The parent_A references the child directory via some name entry,"
2853 echo "but the child directory back references another parent_B via its"
# The inner quotes are escaped so that ".." is printed inside quotes and
# stays within the echoed string.
2854 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
2855 echo "LFSCK will repair the child directory's \"..\" name entry."
2858 check_mount_and_prep
2860 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2861 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2863 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2864 echo "The dummy's dotdot name entry references the guard."
2865 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2866 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2867 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2868 error "(3) Fail to mkdir on MDT0"
2869 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing guard leaves dummy's ".." pointing at a non-existent parent.
2871 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
2873 echo "Trigger namespace LFSCK to repair unmatched pairs"
2874 $START_NAMESPACE -A -r ||
2875 error "(5) Fail to start LFSCK for namespace"
2877 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2878 mdd.${MDT_DEV}.lfsck_namespace |
2879 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2881 error "(6) unexpected status"
2884 local repaired=$($SHOW_NAMESPACE |
2885 awk '/^unmatched_pairs_repaired/ { print $2 }')
2886 [ $repaired -eq 1 ] ||
2887 error "(7) Fail to repair unmatched pairs: $repaired"
2889 echo "'ls' should success after namespace LFSCK repairing"
2890 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
2891 error "(8) ls should success."
2893 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Test 22b body: repair a child directory whose ".." entry and linkEA name
# an existing parent that holds no matching name entry.
# Skipped unless at least 2 MDSes are configured.
# Creates guard/ and foo/ with -i 1, then creates foo/dummy with -i 0 while
# fail_loc 0x161e is set. Before LFSCK, fid2path on dummy's FID must NOT
# return foo/dummy. After namespace LFSCK completes with
# unmatched_pairs_repaired == 1, fid2path must return it.
2896 [ $MDSCOUNT -lt 2 ] &&
2897 skip "We need at least 2 MDSes for this test" && return
2900 echo "The parent_A references the child directory via the name entry_B,"
2901 echo "but the child directory back references another parent_C via its"
# The inner quotes are escaped so that ".." is printed inside quotes and
# stays within the echoed string.
2902 echo "\"..\" name entry. The parent_C exists, but there is no the name"
2903 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
2904 echo "the child directory's \"..\" name entry and its linkEA."
2907 check_mount_and_prep
2909 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2910 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2912 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2913 echo "and bad linkEA. The dummy's dotdot name entry references the"
2914 echo "guard. The dummy's linkEA references n non-exist name entry."
2915 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2916 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2917 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2918 error "(3) Fail to mkdir on MDT0"
2919 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The FID is captured once and reused for both fid2path checks.
2921 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
2922 echo "fid2path should NOT work on the dummy's FID $dummyfid"
2923 local dummyname=$($LFS fid2path $DIR $dummyfid)
2924 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
2925 error "(4) fid2path works unexpectedly."
2927 echo "Trigger namespace LFSCK to repair unmatched pairs"
2928 $START_NAMESPACE -A -r ||
2929 error "(5) Fail to start LFSCK for namespace"
2931 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2932 mdd.${MDT_DEV}.lfsck_namespace |
2933 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2935 error "(6) unexpected status"
2938 local repaired=$($SHOW_NAMESPACE |
2939 awk '/^unmatched_pairs_repaired/ { print $2 }')
2940 [ $repaired -eq 1 ] ||
2941 error "(7) Fail to repair unmatched pairs: $repaired"
2943 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
2944 local dummyname=$($LFS fid2path $DIR $dummyfid)
2945 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
2946 error "(8) fid2path does not work"
2948 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a body: a name entry whose MDT-object no longer exists (dangling
# name entry) spanning two MDTs.  Skipped unless MDSCOUNT is at least 2.
2951 [ $MDSCOUNT -lt 2 ] &&
2952 skip "We need at least 2 MDSes for this test" && return
# Log the scenario being exercised.
2955 echo "The name entry is there, but the MDT-object for such name "
2956 echo "entry does not exist. The namespace LFSCK should find out "
2957 echo "and repair the inconsistency as required."
2960 check_mount_and_prep
# d0 is created with MDT index 0, d0/d1 with MDT index 1.
2962 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
2963 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
# Remove d1 while fail_loc 0x1620 is set on mds2, then clear the fail_loc.
2965 echo "Inject failure stub on MDT1 to simulate dangling name entry"
2966 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
2967 do_facet mds2 $LCTL set_param fail_loc=0x1620
2968 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
2969 do_facet mds2 $LCTL set_param fail_loc=0
# Listing d0/d1 is expected to fail before any repair.
2971 echo "'ls' should fail because of dangling name entry"
2972 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
# First LFSCK run (-A -r): wait for status "completed" (trailing 32 is
# presumably the wait limit in seconds - confirm against wait_update_facet).
2974 echo "Trigger namespace LFSCK to find out dangling name entry"
2975 $START_NAMESPACE -A -r ||
2976 error "(5) Fail to start LFSCK for namespace"
2978 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2979 mdd.${MDT_DEV}.lfsck_namespace |
2980 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2982 error "(6) unexpected status"
# The dangling_repaired counter must read exactly 1.
2985 local repaired=$($SHOW_NAMESPACE |
2986 awk '/^dangling_repaired/ { print $2 }')
2987 [ $repaired -eq 1 ] ||
2988 error "(7) Fail to repair dangling name entry: $repaired"
# Without -C the object is not re-created, so ls must still fail.
2990 echo "'ls' should fail because not re-create MDT-object by default"
2991 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
# Second LFSCK run adds -C; afterwards the counter must again read 1 and
# listing d0/d1 must succeed.
2993 echo "Trigger namespace LFSCK again to repair dangling name entry"
2994 $START_NAMESPACE -A -r -C ||
2995 error "(9) Fail to start LFSCK for namespace"
2997 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2998 mdd.${MDT_DEV}.lfsck_namespace |
2999 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3001 error "(10) unexpected status"
3004 repaired=$($SHOW_NAMESPACE |
3005 awk '/^dangling_repaired/ { print $2 }')
3006 [ $repaired -eq 1 ] ||
3007 error "(11) Fail to repair dangling name entry: $repaired"
3009 echo "'ls' should success after namespace LFSCK repairing"
3010 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3012 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b body: a hard-link name entry that references a non-existent
# object.  LFSCK run with -C is expected to count one dangling repair and one
# multiple-linked repair, after which the link exposes the original data.
3016 echo "The objectA has multiple hard links, one of them corresponding"
3017 echo "to the name entry_B. But there is something wrong for the name"
3018 echo "entry_B and cause entry_B to references non-exist object_C."
3019 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3020 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3021 echo "comes to the second-stage scanning, it will find that the"
3022 echo "former re-creating object_C is not proper, and will try to"
3023 echo "replace the object_C with the real object_A."
3026 check_mount_and_prep
# f0 holds "dummy" and f1 holds "dead"; both live under d0 (MDT index 0).
3028 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3029 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3030 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
# Hard-link f0 as foo while fail_loc 0x1621 is set, then clear the fail_loc.
3032 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3033 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3034 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
3035 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3036 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3038 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
# Listing foo is expected to fail before the repair.
3040 echo "'ls' should fail because of dangling name entry"
3041 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3042 error "(6) ls should fail."
# Run LFSCK with -r -C and wait for status "completed" (trailing 32 is
# presumably the wait limit in seconds - confirm against wait_update_facet).
3044 echo "Trigger namespace LFSCK to find out dangling name entry"
3045 $START_NAMESPACE -r -C ||
3046 error "(7) Fail to start LFSCK for namespace"
3048 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3049 mdd.${MDT_DEV}.lfsck_namespace |
3050 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3052 error "(8) unexpected status"
# Both counters must read exactly 1.
3055 local repaired=$($SHOW_NAMESPACE |
3056 awk '/^dangling_repaired/ { print $2 }')
3057 [ $repaired -eq 1 ] ||
3058 error "(9) Fail to repair dangling name entry: $repaired"
3060 repaired=$($SHOW_NAMESPACE |
3061 awk '/^multiple_linked_repaired/ { print $2 }')
3062 [ $repaired -eq 1 ] ||
3063 error "(10) Fail to drop the former created object: $repaired"
# foo must now read back the content that was written to f0.
3065 local data=$(cat $DIR/$tdir/d0/foo)
3066 [ "$data" == "dummy" ] ||
3067 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3069 run_test 23b "LFSCK can repair dangling name entry (2)"
# Test 23c body: same setup as the dangling hard-link case, but the
# re-created object is written to while LFSCK is delayed, so LFSCK is
# expected NOT to replace it with the original object.
3073 echo "The objectA has multiple hard links, one of them corresponding"
3074 echo "to the name entry_B. But there is something wrong for the name"
3075 echo "entry_B and cause entry_B to references non-exist object_C."
3076 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3077 echo "as dangling, and re-create the lost object_C. And then others"
3078 echo "modified the re-created object_C. When the LFSCK comes to the"
3079 echo "second-stage scanning, it will find that the former re-creating"
3080 echo "object_C maybe wrong and try to replace the object_C with the"
3081 echo "real object_A. But because object_C has been modified, so the"
3082 echo "LFSCK cannot replace it."
3085 check_mount_and_prep
# f0 holds "dummy" and f1 holds "dead"; both live under d0 (MDT index 0).
3087 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3088 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3089 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
# Hard-link f0 as foo while fail_loc 0x1621 is set, then clear the fail_loc.
3091 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3092 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3093 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
3094 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3095 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3097 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
# Listing foo is expected to fail before the repair.
3099 echo "'ls' should fail because of dangling name entry"
3100 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3101 error "(6) ls should fail."
# Set fail_loc 0x1602 with fail_val=10 before starting LFSCK.
3103 #define OBD_FAIL_LFSCK_DELAY3 0x1602
3104 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3106 echo "Trigger namespace LFSCK to find out dangling name entry"
3107 $START_NAMESPACE -r -C ||
3108 error "(7) Fail to start LFSCK for namespace"
# Poll until stat reports size 0 for foo.  On failure, dump stat output for
# the file that was being polled (this test never creates a "guard" entry,
# so the diagnostic targets d0/foo).
3110 wait_update_facet client "stat $DIR/$tdir/d0/foo |
3111 awk '/Size/ { print \\\$2 }'" "0" 32 || {
3112 stat $DIR/$tdir/d0/foo
3114 error "(8) unexpected size"
# Modify the re-created object, then release the injected delay.
3117 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
3118 cancel_lru_locks osc
3120 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3121 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3122 mdd.${MDT_DEV}.lfsck_namespace |
3123 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3125 error "(10) unexpected status"
# One dangling repair is expected, but foo must not read back "dummy".
3128 local repaired=$($SHOW_NAMESPACE |
3129 awk '/^dangling_repaired/ { print $2 }')
3130 [ $repaired -eq 1 ] ||
3131 error "(11) Fail to repair dangling name entry: $repaired"
3133 local data=$(cat $DIR/$tdir/d0/foo)
3134 [ "$data" != "dummy" ] ||
3135 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
3137 run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24 body: two MDT-objects whose linkEA point at the same name entry.
# After LFSCK, the unreferenced object is expected to show up under
# .lustre/lost+found/MDT0000/.  Skipped unless MDSCOUNT is at least 2.
3140 [ $MDSCOUNT -lt 2 ] &&
3141 skip "We need at least 2 MDSes for this test" && return
# Log the scenario being exercised.
3144 echo "Two MDT-objects back reference the same name entry via their"
3145 echo "each own linkEA entry, but the name entry only references one"
3146 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
3147 echo "for the MDT-object that is not recognized. If such MDT-object"
3148 echo "has no other linkEA entry after the removing, then the LFSCK"
3149 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
3152 check_mount_and_prep
# d0 is created with MDT index 1; guard and dummy are plain mkdirs below it.
# The bare path2fid calls only print the FIDs into the log.
3154 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3156 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
3157 $LFS path2fid $DIR/$tdir/d0/guard
3159 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
3160 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid is later used to build the expected orphan name.
# NOTE(review): the embedded numbering skips 3162, 3165 and 3167 -
# presumably a declaration of pfid plus the else/fi of this conditional;
# confirm against the complete file.
3163 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
3164 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
3166 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
3169 touch $DIR/$tdir/d0/guard/foo ||
3170 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
3172 echo "Inject failure stub on MDT0 to simulate the case that"
3173 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
3174 echo "that references $DIR/$tdir/d0/guard/foo."
3175 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
3176 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
3177 echo "there with the same linkEA entry as another MDT-object"
3178 echo "$DIR/$tdir/d0/guard/foo has"
# With fail_loc 0x1622 set: create dummy/foo (MDT index 0), remember its FID
# in cfid, remove it again, then clear the fail_loc.
3180 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
3181 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
3182 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
3183 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
3184 $LFS path2fid $DIR/$tdir/d0/dummy/foo
3185 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
3186 rmdir $DIR/$tdir/d0/dummy/foo ||
3187 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
3188 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The removed name must no longer be visible through stat.
3190 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
3191 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
3192 error "(6) stat successfully unexpectedly"
# Run LFSCK (-A -r) and wait for status "completed" (trailing 32 is
# presumably the wait limit in seconds - confirm against wait_update_facet).
3194 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
3195 $START_NAMESPACE -A -r ||
3196 error "(7) Fail to start LFSCK for namespace"
3198 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3199 mdd.${MDT_DEV}.lfsck_namespace |
3200 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3202 error "(8) unexpected status"
# Exactly one multiple-referenced repair must be counted.
3205 local repaired=$($SHOW_NAMESPACE |
3206 awk '/^multiple_referenced_repaired/ { print $2 }')
3207 [ $repaired -eq 1 ] ||
3208 error "(9) Fail to repair multiple referenced name entry: $repaired"
# The orphan is looked up by the name "<cfid>-<pfid>-D-0".
3210 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3211 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3212 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3214 local cname="$cfid-$pfid-D-0"
3215 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
3216 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
3218 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25 body: a name entry that records the wrong file type.  Skipped
# unless the $SINGLEMDS backend type is ldiskfs.
3221 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3222 skip "Only support to inject failure on ldiskfs" && return
# Log the scenario being exercised.
3225 echo "The file type in the name entry does not match the file type"
3226 echo "claimed by the referenced object. Then the LFSCK will update"
3227 echo "the file type in the name entry."
3230 check_mount_and_prep
3232 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3234 echo "Inject failure stub on MDT0 to simulate the case that"
3235 echo "the file type stored in the name entry is wrong."
# Create d0/foo while fail_loc 0x1623 is set, then clear the fail_loc.
3237 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
3238 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
3239 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
3240 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run LFSCK (-r) and wait for status "completed" (trailing 32 is presumably
# the wait limit in seconds - confirm against wait_update_facet).
3242 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
3243 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
3245 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3246 mdd.${MDT_DEV}.lfsck_namespace |
3247 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3249 error "(4) unexpected status"
# Exactly one bad-file-type repair must be counted and d0 must be listable.
3252 local repaired=$($SHOW_NAMESPACE |
3253 awk '/^bad_file_type_repaired/ { print $2 }')
3254 [ $repaired -eq 1 ] ||
3255 error "(5) Fail to repair bad file type in name entry: $repaired"
3257 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
3259 run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a body: an object keeps its linkEA but loses its local name entry;
# LFSCK is expected to put the name entry back with the FID unchanged.
3263 echo "The local name entry back referenced by the MDT-object is lost."
3264 echo "The namespace LFSCK will add the missing local name entry back"
3265 echo "to the normal namespace."
3268 check_mount_and_prep
# Create d0/foo, remember its FID and add a second hard link named dummy.
3270 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3271 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3272 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3274 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3275 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3277 echo "Inject failure stub on MDT0 to simulate the case that"
3278 echo "foo's name entry will be removed, but the foo's object"
3279 echo "and its linkEA are kept in the system."
# Unlink foo while fail_loc 0x1624 is set, then clear the fail_loc.
3281 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3282 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3283 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3284 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# foo must be gone now.  error has to be called explicitly: a bare string
# after && would be run as a command name and never fail the test.
3286 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 && error "(5) 'ls' should fail"
# Run LFSCK (-r -A) and wait for status "completed" (trailing 32 is
# presumably the wait limit in seconds - confirm against wait_update_facet).
3288 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3289 $START_NAMESPACE -r -A ||
3290 error "(6) Fail to start LFSCK for namespace"
3292 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3293 mdd.${MDT_DEV}.lfsck_namespace |
3294 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3296 error "(7) unexpected status"
# Exactly one lost dirent must be counted as repaired.
3299 local repaired=$($SHOW_NAMESPACE |
3300 awk '/^lost_dirent_repaired/ { print $2 }')
3301 [ $repaired -eq 1 ] ||
3302 error "(8) Fail to repair lost dirent: $repaired"
# foo must be visible again and still map to the FID recorded earlier.
3304 ls -ail $DIR/$tdir/d0/foo ||
3305 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
3307 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3308 [ "$foofid" == "$foofid2" ] ||
3309 error "(10) foo's FID changed: $foofid, $foofid2"
3311 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b body: an object keeps its linkEA but loses its remote name entry;
# LFSCK is expected to put the name entry back with the FID unchanged.
# Skipped unless MDSCOUNT is at least 2.
3314 [ $MDSCOUNT -lt 2 ] &&
3315 skip "We need at least 2 MDSes for this test" && return
# Log the scenario being exercised.
3318 echo "The remote name entry back referenced by the MDT-object is lost."
3319 echo "The namespace LFSCK will add the missing remote name entry back"
3320 echo "to the normal namespace."
3323 check_mount_and_prep
# d0 is created with MDT index 1 and d0/foo with MDT index 0; remember
# foo's FID for the final comparison.
3325 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3326 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
3327 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3329 echo "Inject failure stub on MDT0 to simulate the case that"
3330 echo "foo's name entry will be removed, but the foo's object"
3331 echo "and its linkEA are kept in the system."
# Remove foo while fail_loc 0x1624 is set, then clear the fail_loc.
3333 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3334 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3335 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3336 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# foo must be gone now.  error has to be called explicitly: a bare string
# after && would be run as a command name and never fail the test.
3338 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 && error "(4) 'ls' should fail"
# Run LFSCK (-r -A) and wait for status "completed" (trailing 32 is
# presumably the wait limit in seconds - confirm against wait_update_facet).
3340 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3341 $START_NAMESPACE -r -A ||
3342 error "(5) Fail to start LFSCK for namespace"
3344 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3345 mdd.${MDT_DEV}.lfsck_namespace |
3346 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3348 error "(6) unexpected status"
# Exactly one lost dirent must be counted as repaired.
3351 local repaired=$($SHOW_NAMESPACE |
3352 awk '/^lost_dirent_repaired/ { print $2 }')
3353 [ $repaired -eq 1 ] ||
3354 error "(7) Fail to repair lost dirent: $repaired"
# foo must be visible again and still map to the FID recorded earlier.
3356 ls -ail $DIR/$tdir/d0/foo ||
3357 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
3359 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3360 [ "$foofid" == "$foofid2" ] ||
3361 error "(9) foo's FID changed: $foofid, $foofid2"
3363 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a body: the local parent directory named in an object's linkEA is
# removed; after LFSCK an entry matching *-P-* is expected under
# .lustre/lost+found/MDT0000/.
3367 echo "The local parent referenced by the MDT-object linkEA is lost."
3368 echo "The namespace LFSCK will re-create the lost parent as orphan."
3371 check_mount_and_prep
# Create d0/foo plus a second hard link named dummy.
3373 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3374 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3375 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3376 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3378 echo "Inject failure stub on MDT0 to simulate the case that"
3379 echo "foo's name entry will be removed, but the foo's object"
3380 echo "and its linkEA are kept in the system. And then remove"
3381 echo "another hard link and the parent directory."
# Unlink both names while fail_loc 0x1624 is set, then clear the fail_loc.
3383 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3384 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3385 rm -f $DIR/$tdir/d0/foo ||
3386 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3387 rm -f $DIR/$tdir/d0/dummy ||
3388 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
3389 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Remove the parent; it must be gone afterwards.  error has to be called
# explicitly: a bare string after && would be run as a command name and
# never fail the test.
3391 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
3392 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
# Run LFSCK (-r -A) and wait for status "completed" (trailing 32 is
# presumably the wait limit in seconds - confirm against wait_update_facet).
3394 echo "Trigger namespace LFSCK to repair the lost parent"
3395 $START_NAMESPACE -r -A ||
3396 error "(6) Fail to start LFSCK for namespace"
3398 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3399 mdd.${MDT_DEV}.lfsck_namespace |
3400 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3402 error "(7) unexpected status"
# Exactly one lost dirent must be counted as repaired.
3405 local repaired=$($SHOW_NAMESPACE |
3406 awk '/^lost_dirent_repaired/ { print $2 }')
3407 [ $repaired -eq 1 ] ||
3408 error "(8) Fail to repair lost dirent: $repaired"
3410 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3411 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3412 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3414 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find receives it verbatim instead of the
# shell expanding it against the current working directory.
3416 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-P-*")
3417 [ ! -z "$cname" ] ||
3418 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
3420 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b body: the remote parent directory named in an object's linkEA is
# removed; after LFSCK an entry matching *-P-* is expected under
# .lustre/lost+found/MDT0001/.  Skipped unless MDSCOUNT is at least 2.
3423 [ $MDSCOUNT -lt 2 ] &&
3424 skip "We need at least 2 MDSes for this test" && return
# Log the scenario being exercised.
3427 echo "The remote parent referenced by the MDT-object linkEA is lost."
3428 echo "The namespace LFSCK will re-create the lost parent as orphan."
3431 check_mount_and_prep
# d0 is created with MDT index 1 and d0/foo with MDT index 0; the bare
# path2fid call only prints d0's FID into the log.
3433 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3434 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
3436 $LFS path2fid $DIR/$tdir/d0
3438 echo "Inject failure stub on MDT0 to simulate the case that"
3439 echo "foo's name entry will be removed, but the foo's object"
3440 echo "and its linkEA are kept in the system. And then remove"
3441 echo "the parent directory."
# Remove foo while fail_loc 0x1624 is set, then clear the fail_loc.
3443 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3444 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3445 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3446 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Remove the parent; it must be gone afterwards.  error has to be called
# explicitly: a bare string after && would be run as a command name and
# never fail the test.
3448 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
3449 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
# Run LFSCK (-r -A) and wait for status "completed" (trailing 32 is
# presumably the wait limit in seconds - confirm against wait_update_facet).
3451 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3452 $START_NAMESPACE -r -A ||
3453 error "(6) Fail to start LFSCK for namespace"
3455 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3456 mdd.${MDT_DEV}.lfsck_namespace |
3457 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3459 error "(7) unexpected status"
# Exactly one lost dirent must be counted as repaired.
3462 local repaired=$($SHOW_NAMESPACE |
3463 awk '/^lost_dirent_repaired/ { print $2 }')
3464 [ $repaired -eq 1 ] ||
3465 error "(8) Fail to repair lost dirent: $repaired"
3467 ls -ail $MOUNT/.lustre/lost+found/
3469 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
3470 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
3471 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
3473 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The pattern is quoted so that find receives it verbatim instead of the
# shell expanding it against the current working directory.
3475 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-P-*")
3476 [ ! -z "$cname" ] ||
3477 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
3479 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28 body: orphan handling when one MDT fails requests during the first
# LFSCK run.  The first run expects mds1 "partial" / mds2 "completed" with
# lost_dirent_repaired 0 on mds1 and 1 on mds2; a second run expects every
# MDS "completed" with 1 on mds1 and 0 on mds2.  Skipped unless MDSCOUNT is
# at least 2.
3482 [ $MDSCOUNT -lt 2 ] &&
3483 skip "The test needs at least 2 MDTs" && return
# Log the scenario being exercised.
3486 echo "The target name entry is lost. The LFSCK should insert the"
3487 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
3488 echo "the MDT (on which the orphan MDT-object resides) has ever"
3489 echo "failed to respond some name entry verification during the"
3490 echo "first stage-scanning, then the LFSCK should skip to handle"
3491 echo "orphan MDT-object on this MDT. But other MDTs should not"
# d1 (MDT index 0) gets children with index 1; d2 (index 1) gets children
# with index 0.
3495 check_mount_and_prep
3496 $LFS mkdir -i 0 $DIR/$tdir/d1
3497 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
3498 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
3500 $LFS mkdir -i 1 $DIR/$tdir/d2
3501 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
3502 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
3504 echo "Inject failure stub on MDT0 to simulate the case that"
3505 echo "d1/a1's name entry will be removed, but the d1/a1's object"
3506 echo "and its linkEA are kept in the system. And the case that"
3507 echo "d2/a2's name entry will be removed, but the d2/a2's object"
3508 echo "and its linkEA are kept in the system."
# Remove d1/a1 and d2/a2 while fail_loc 0x1624 is set on both mds1 and mds2,
# then clear the fail_loc on both.
3510 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3511 do_facet mds1 $LCTL set_param fail_loc=0x1624
3512 do_facet mds2 $LCTL set_param fail_loc=0x1624
3513 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
3514 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
3515 do_facet mds1 $LCTL set_param fail_loc=0
3516 do_facet mds2 $LCTL set_param fail_loc=0
3518 cancel_lru_locks mdc
3519 cancel_lru_locks osc
# Set fail_loc 0x161c on mds2 for the duration of the first LFSCK run.
3521 echo "Inject failure, to simulate the MDT0 fail to handle"
3522 echo "MDT1 LFSCK request during the first-stage scanning."
3523 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
3524 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
3526 echo "Trigger namespace LFSCK on all devices to find out orphan object"
3527 $START_NAMESPACE -r -A ||
3528 error "(3) Fail to start LFSCK for namespace"
# Expected end states of the first run: mds1 "partial", mds2 "completed".
3530 wait_update_facet mds1 "$LCTL get_param -n \
3531 mdd.$(facet_svc mds1).lfsck_namespace |
3532 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
3533 error "(4) mds1 is not the expected 'partial'"
3536 wait_update_facet mds2 "$LCTL get_param -n \
3537 mdd.$(facet_svc mds2).lfsck_namespace |
3538 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3539 error "(5) mds2 is not the expected 'completed'"
3542 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# After the first run: 0 lost dirents repaired on mds1, 1 on mds2.
3544 local repaired=$(do_facet mds1 $LCTL get_param -n \
3545 mdd.$(facet_svc mds1).lfsck_namespace |
3546 awk '/^lost_dirent_repaired/ { print $2 }')
3547 [ $repaired -eq 0 ] ||
3548 error "(6) Expect 0 fixed on mds1, but got: $repaired"
3550 repaired=$(do_facet mds2 $LCTL get_param -n \
3551 mdd.$(facet_svc mds2).lfsck_namespace |
3552 awk '/^lost_dirent_repaired/ { print $2 }')
3553 [ $repaired -eq 1 ] ||
3554 error "(7) Expect 1 fixed on mds2, but got: $repaired"
# Second run without any injected failure.
3556 echo "Trigger namespace LFSCK on all devices again to cleanup"
3557 $START_NAMESPACE -r -A ||
3558 error "(8) Fail to start LFSCK for namespace"
3560 for k in $(seq $MDSCOUNT); do
3561 # The LFSCK status query interval is 30 seconds. For the case
3562 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3563 # time to guarantee the status sync up.
3564 wait_update_facet mds${k} "$LCTL get_param -n \
3565 mdd.$(facet_svc mds${k}).lfsck_namespace |
3566 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3567 error "(9) MDS${k} is not the expected 'completed'"
# After the second run: 1 lost dirent repaired on mds1, 0 on mds2.
3570 local repaired=$(do_facet mds1 $LCTL get_param -n \
3571 mdd.$(facet_svc mds1).lfsck_namespace |
3572 awk '/^lost_dirent_repaired/ { print $2 }')
3573 [ $repaired -eq 1 ] ||
3574 error "(10) Expect 1 fixed on mds1, but got: $repaired"
3576 repaired=$(do_facet mds2 $LCTL get_param -n \
3577 mdd.$(facet_svc mds2).lfsck_namespace |
3578 awk '/^lost_dirent_repaired/ { print $2 }')
3579 [ $repaired -eq 0 ] ||
3580 error "(11) Expect 0 fixed on mds2, but got: $repaired"
3582 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a body: the link count reported for foo is larger than its number
# of name entries (3 instead of 2); LFSCK is expected to bring it back to 2.
3586 echo "The object's nlink attribute is larger than the object's known"
3587 echo "name entries count. The LFSCK will repair the object's nlink"
3588 echo "attribute to match the known name entries count"
3591 check_mount_and_prep
3593 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3594 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3596 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3597 echo "nlink attribute is larger than its name entries count."
# Add hard link h1 while fail_loc 0x1625 is set, then clear the fail_loc.
3599 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
3600 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
3601 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3602 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3603 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Confirm the injection: stat must report a link count of 3.
3605 cancel_lru_locks mdc
3606 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3607 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
# Run LFSCK (-r -A) and wait for status "completed" (trailing 32 is
# presumably the wait limit in seconds - confirm against wait_update_facet).
3609 echo "Trigger namespace LFSCK to repair the nlink count"
3610 $START_NAMESPACE -r -A ||
3611 error "(5) Fail to start LFSCK for namespace"
3613 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3614 mdd.${MDT_DEV}.lfsck_namespace |
3615 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3617 error "(6) unexpected status"
# Exactly one nlink repair must be counted and stat must then report 2.
3620 local repaired=$($SHOW_NAMESPACE |
3621 awk '/^nlinks_repaired/ { print $2 }')
3622 [ $repaired -eq 1 ] ||
3623 error "(7) Fail to repair nlink count: $repaired"
3625 cancel_lru_locks mdc
3626 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3627 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3629 run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b body: the link count reported for foo is smaller than its number
# of name entries (1 instead of 2); LFSCK is expected to bring it back to 2.
3633 echo "The object's nlink attribute is smaller than the object's known"
3634 echo "name entries count. The LFSCK will repair the object's nlink"
3635 echo "attribute to match the known name entries count"
3638 check_mount_and_prep
3640 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3641 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3643 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3644 echo "nlink attribute is smaller than its name entries count."
# Add hard link h1 while fail_loc 0x1626 is set, then clear the fail_loc.
3646 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
3647 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
3648 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3649 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3650 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Confirm the injection: stat must report a link count of 1.
3652 cancel_lru_locks mdc
3653 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3654 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
# Run LFSCK (-r -A) and wait for status "completed" (trailing 32 is
# presumably the wait limit in seconds - confirm against wait_update_facet).
3656 echo "Trigger namespace LFSCK to repair the nlink count"
3657 $START_NAMESPACE -r -A ||
3658 error "(5) Fail to start LFSCK for namespace"
3660 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3661 mdd.${MDT_DEV}.lfsck_namespace |
3662 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3664 error "(6) unexpected status"
# Exactly one nlink repair must be counted and stat must then report 2.
3667 local repaired=$($SHOW_NAMESPACE |
3668 awk '/^nlinks_repaired/ { print $2 }')
3669 [ $repaired -eq 1 ] ||
3670 error "(7) Fail to repair nlink count: $repaired"
3672 cancel_lru_locks mdc
3673 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3674 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3676 run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c body: foo has three names but fid2path only reports two of them
# (simulated linkEA overflow).  LFSCK is expected to leave the link count
# alone: nlinks_repaired stays 0, stat still reports 3, fid2path still 2.
3680 echo "There are too many hard links to the object, and exceeds the"
3681 echo "object's linkEA limitation, as to NOT all the known name entries"
3682 echo "will be recorded in the linkEA. Under such case, LFSCK should"
3683 echo "skip the nlink verification for this object."
3686 check_mount_and_prep
# foo and its first hard link h1 are created without any fault injection.
3688 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3689 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3690 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3691 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3693 echo "Inject failure stub on MDT0 to simulate the case that"
3694 echo "foo's hard links exceed the object's linkEA limitation."
# The second hard link h2 is added with fail_loc 0x1627 set; the fail_loc
# stays set until after the LFSCK run below.
3696 #define OBD_FAIL_LFSCK_LINKEA_OVERFLOW 0x1627
3697 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1627
3698 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h2 ||
3699 error "(4) Fail to hard link to $DIR/$tdir/d0/foo"
3701 cancel_lru_locks mdc
# Confirm the injection: link count 3, but only 2 paths from fid2path.
3703 local count1=$(stat --format=%h $DIR/$tdir/d0/foo)
3704 [ $count1 -eq 3 ] || error "(5) Stat failure: $count1"
3706 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3707 $LFS fid2path $DIR $foofid
3708 local count2=$($LFS fid2path $DIR $foofid | wc -l)
# error has to be called explicitly: a bare string after || would be run as
# a command name and never fail the test.
3709 [ $count2 -eq 2 ] || error "(6) Fail to inject error: $count2"
# Run LFSCK (-r -A) and wait for status "completed" (trailing 32 is
# presumably the wait limit in seconds - confirm against wait_update_facet).
3711 echo "Trigger namespace LFSCK to repair the nlink count"
3712 $START_NAMESPACE -r -A ||
3713 error "(7) Fail to start LFSCK for namespace"
3715 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3716 mdd.${MDT_DEV}.lfsck_namespace |
3717 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3719 error "(8) unexpected status"
# Clear the fail_loc, then verify that nothing was repaired or changed.
3722 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3723 local repaired=$($SHOW_NAMESPACE |
3724 awk '/^nlinks_repaired/ { print $2 }')
3725 [ $repaired -eq 0 ] ||
3726 error "(9) Repair nlink count unexpcetedly: $repaired"
3728 cancel_lru_locks mdc
3730 count1=$(stat --format=%h $DIR/$tdir/d0/foo)
3731 [ $count1 -eq 3 ] || error "(10) Stat failure: $count1"
3733 count2=$($LFS fid2path $DIR $foofid | wc -l)
3734 [ $count2 -eq 2 ] ||
3735 error "(11) Repaired something unexpectedly: $count2"
3737 run_test 29c "Not verify nlink attr if hark links exceed linkEA limitation"
# Test 30 body: objects that e2fsck moved into the backend /lost+found are
# expected to be brought back by LFSCK, either into the normal namespace
# (foo/f0) or under .lustre/lost+found/MDT0000/ (d0 with d1 and f1).
# Skipped unless the $SINGLEMDS backend type is ldiskfs.
3740 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3741 skip "Only support backend /lost+found for ldiskfs" && return
# Log the scenario being exercised.
3744 echo "The namespace LFSCK will move the orphans from backend"
3745 echo "/lost+found directory to normal client visible namespace"
3746 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
3749 check_mount_and_prep
3751 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
3752 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f1"
3754 echo "Inject failure stub on MDT0 to simulate the case that"
3755 echo "directory d0 has no linkEA entry, then the LFSCK will"
3756 echo "move it into .lustre/lost+found/MDTxxxx/ later."
# Create foo/d0 while fail_loc 0x161d is set, then clear the fail_loc and
# populate d0 with f1 and d1.
3758 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
3759 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
3760 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
3761 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3763 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
3764 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
3766 echo "Inject failure stub on MDT0 to simulate the case that the"
3767 echo "object's name entry will be removed, but not destroy the"
3768 echo "object. Then backend e2fsck will handle it as orphan and"
3769 echo "add them into the backend /lost+found directory."
# Remove all four names while fail_loc 0x1624 is set, then clear it.
3771 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3772 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3773 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
3774 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
3775 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
3776 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
3777 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Unmount the client, stop the MDT, run e2fsck -y on its device, restart it.
3779 umount_client $MOUNT || error "(10) Fail to stop client!"
3781 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
3784 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
3785 error "(12) Fail to run e2fsck"
3787 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
3788 error "(13) Fail to start MDT0"
# Run LFSCK (-r -A) and wait for status "completed" (trailing 32 is
# presumably the wait limit in seconds - confirm against wait_update_facet).
3790 echo "Trigger namespace LFSCK to recover backend orphans"
3791 $START_NAMESPACE -r -A ||
3792 error "(14) Fail to start LFSCK for namespace"
3794 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3795 mdd.${MDT_DEV}.lfsck_namespace |
3796 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3798 error "(15) unexpected status"
# At least four objects must be reported as moved out of backend lost+found.
3801 local repaired=$($SHOW_NAMESPACE |
3802 awk '/^local_lost_found_moved/ { print $2 }')
3803 [ $repaired -ge 4 ] ||
3804 error "(16) Fail to recover backend orphans: $repaired"
3806 mount_client $MOUNT || error "(17) Fail to start client!"
# error has to be called explicitly: a bare string after || would be run as
# a command name and never fail the test.
3808 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
3810 ls -ail $MOUNT/.lustre/lost+found/
3812 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
3813 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3814 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3816 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find receives it verbatim instead of the
# shell expanding it against the current working directory.
3818 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-*-D-*")
3819 [ ! -z "$cname" ] || error "(20) d0 is not recovered"
3821 stat ${cname}/d1 || error "(21) d0 is not recovered"
3822 stat ${cname}/f1 || error "(22) f1 is not recovered"
3824 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test_31a: create sub-directories under a striped directory while a client
# side fail_loc (OBD_FAIL_LFSCK_BAD_NAME_HASH, fail_val=0) is set, run the
# namespace LFSCK and check that it reports at least one repaired name hash
# and that every sub-directory can still be stat'ed and removed afterwards.
#
# Skipped (returns 0) when fewer than 2 MDTs are configured.
# Globals read: MDSCOUNT DIR tdir MOUNT LFS LCTL SINGLEMDS MDT_DEV
#               START_NAMESPACE SHOW_NAMESPACE
#
# NOTE(review): the function header, the loop's "done" and the closing braces
# were absent from the reviewed text; they are restored here and the name is
# taken from the "run_test 31a" call that follows this block.
test_31a() {
	# Explicit if/return: the skip must end the test even when skip()
	# itself returns non-zero.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-first"
	echo "shard, but inserted into the first shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=0
	createmany -d "$DIR/$tdir/striped_dir/d" "$MDSCOUNT" ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	# START_NAMESPACE holds a whole command line; it must stay unquoted.
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_update_facet "$SINGLEMDS" "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state first so the log shows why we failed.
		# NOTE(review): only the error call of this group was visible
		# in the reviewed text.
		$SHOW_NAMESPACE
		error "(4) unexpected status"
	}

	# Declared separately so "local" does not mask the pipeline status.
	local repaired
	repaired=$($SHOW_NAMESPACE |
		awk '/^name_hash_repaired/ { print $2 }')
	[ "$repaired" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	# Remount the client before verifying the entries.
	umount_client "$MOUNT" || error "(6) umount failed"
	mount_client "$MOUNT" || error "(7) mount failed"

	# Keep the loop counter local so the caller's "i" is not clobbered.
	local i
	for ((i = 0; i < MDSCOUNT; i++)); do
		stat "$DIR/$tdir/striped_dir/d$i" ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir "$DIR/$tdir/striped_dir/d$i" ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
3880 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test_31b: same scenario as the "bad name hash (1)" test, but the client
# side fail_loc (OBD_FAIL_LFSCK_BAD_NAME_HASH) is used with fail_val=1 and
# the LFSCK status / name_hash_repaired counter are read from facet mds2.
#
# Skipped (returns 0) when fewer than 2 MDTs are configured.
# Globals read: MDSCOUNT DIR tdir MOUNT LFS LCTL START_NAMESPACE
#
# NOTE(review): the function header, the loop's "done" and the closing brace
# were absent from the reviewed text; they are restored here and the name is
# taken from the "run_test 31b" call that follows this block.
test_31b() {
	# Explicit if/return: the skip must end the test even when skip()
	# itself returns non-zero.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-second"
	echo "shard, but inserted into the secod shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=1
	createmany -d "$DIR/$tdir/striped_dir/d" "$MDSCOUNT" ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	# START_NAMESPACE holds a whole command line; it must stay unquoted.
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	# Resolve the service name of mds2 once; it is used twice below.
	local mds2_svc
	mds2_svc=$(facet_svc mds2)

	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.${mds2_svc}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(4) unexpected status"

	# Declared separately so "local" does not mask the pipeline status.
	local repaired
	repaired=$(do_facet mds2 $LCTL get_param -n \
		"mdd.${mds2_svc}.lfsck_namespace" |
		awk '/^name_hash_repaired/ { print $2 }')
	[ "$repaired" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	# Remount the client before verifying the entries.
	umount_client "$MOUNT" || error "(6) umount failed"
	mount_client "$MOUNT" || error "(7) mount failed"

	# Keep the loop counter local so the caller's "i" is not clobbered.
	local i
	for ((i = 0; i < MDSCOUNT; i++)); do
		stat "$DIR/$tdir/striped_dir/d$i" ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir "$DIR/$tdir/striped_dir/d$i" ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
3935 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test_31c: create a striped directory while fail_loc 0x1629
# (OBD_FAIL_LFSCK_LOST_MASTER_LMV) is set on $SINGLEMDS, run the namespace
# LFSCK and expect exactly one striped_dirs_repaired; afterwards the
# directory must list as empty and be removable.
#
# Skipped (returns 0) when fewer than 2 MDTs are configured.
# Globals read: MDSCOUNT DIR tdir MOUNT LFS LCTL SINGLEMDS MDT_DEV
#               START_NAMESPACE SHOW_NAMESPACE
#
# NOTE(review): the function header and the closing braces were absent from
# the reviewed text; they are restored here and the name is taken from the
# "run_test 31c" call that follows this block.
test_31c() {
	# Explicit if/return: the skip must end the test even when skip()
	# itself returns non-zero.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If nobody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should re-generate the master LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to re-generate master LMV EA"
	# START_NAMESPACE holds a whole command line; it must stay unquoted.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_update_facet "$SINGLEMDS" "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state first so the log shows why we failed.
		# NOTE(review): only the error call of this group was visible
		# in the reviewed text.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	# Declared separately so "local" does not mask the pipeline status.
	local repaired
	repaired=$($SHOW_NAMESPACE |
		awk '/^striped_dirs_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate master LMV EA: $repaired"

	# Remount the client before looking at the directory again.
	umount_client "$MOUNT" || error "(5) umount failed"
	mount_client "$MOUNT" || error "(6) mount failed"

	# Nothing was created in the directory, so the listing must be empty.
	local empty
	empty=$(ls "$DIR/$tdir/striped_dir/")
	[ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(8) Fail to remove the striped directory after LFSCK"
}
3984 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# test_31d: create a striped directory while fail_loc 0x1629
# (OBD_FAIL_LFSCK_LOST_MASTER_LMV) is set on $SINGLEMDS, then create a file
# in it before running the namespace LFSCK.  The LFSCK is expected to report
# zero striped_dirs_repaired, the existing file must stay reachable, creating
# a new file must fail, and after "chattr -i" the directory must be removable.
#
# Skipped (returns 0) when fewer than 2 MDTs are configured.
# Globals read: MDSCOUNT DIR tdir MOUNT LFS LCTL SINGLEMDS MDT_DEV
#               START_NAMESPACE SHOW_NAMESPACE
#
# NOTE(review): the function header and the closing braces were absent from
# the reviewed text; they are restored here and the name is taken from the
# "run_test 31d" call that follows this block.
test_31d() {
	# Explicit if/return: the skip must end the test even when skip()
	# itself returns non-zero.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If somebody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should NOT re-generate the master LMV EA, instead, it should"
	echo "change the broken striped dirctory as read-only to prevent"
	echo "further damage"

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x0

	# Remount the client before modifying the broken directory.
	umount_client "$MOUNT" || error "(2) umount failed"
	mount_client "$MOUNT" || error "(3) mount failed"

	touch "$DIR/$tdir/striped_dir/dummy" ||
		error "(4) Fail to touch under broken striped directory"

	echo "Trigger namespace LFSCK to find out the inconsistency"
	# START_NAMESPACE holds a whole command line; it must stay unquoted.
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_update_facet "$SINGLEMDS" "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state first so the log shows why we failed.
		# NOTE(review): only the error call of this group was visible
		# in the reviewed text.
		$SHOW_NAMESPACE
		error "(6) unexpected status"
	}

	# Declared separately so "local" does not mask the pipeline status.
	local repaired
	repaired=$($SHOW_NAMESPACE |
		awk '/^striped_dirs_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(7) Re-generate master LMV EA unexpected: $repaired"

	stat "$DIR/$tdir/striped_dir/dummy" ||
		error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"

	# Creating a new entry is expected to fail here.
	if touch "$DIR/$tdir/striped_dir/foo"; then
		error "(9) The broken striped directory should be read-only"
	fi

	chattr -i "$DIR/$tdir/striped_dir" ||
		error "(10) Fail to chattr on the broken striped directory"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(11) Fail to remove the striped directory after LFSCK"
}
4044 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test_31e: create a striped directory while fail_loc 0x162a
# (OBD_FAIL_LFSCK_LOST_SLAVE_LMV) with fail_val=0 is set on $SINGLEMDS, run
# the namespace LFSCK and expect exactly one striped_shards_repaired; the
# directory must be removable afterwards.
#
# Skipped (returns 0) when fewer than 2 MDTs are configured.
# Globals read: MDSCOUNT DIR tdir LFS LCTL SINGLEMDS MDT_DEV
#               START_NAMESPACE SHOW_NAMESPACE
#
# NOTE(review): the function header and the closing braces were absent from
# the reviewed text; they are restored here and the name is taken from the
# "run_test 31e" call that follows this block.
test_31e() {
	# Explicit if/return: the skip must end the test even when skip()
	# itself returns non-zero.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on the same MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x162a fail_val=0
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	# START_NAMESPACE holds a whole command line; it must stay unquoted.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_update_facet "$SINGLEMDS" "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state first so the log shows why we failed.
		# NOTE(review): only the error call of this group was visible
		# in the reviewed text.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	# Declared separately so "local" does not mask the pipeline status.
	local repaired
	repaired=$($SHOW_NAMESPACE |
		awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
4087 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test_31f: create a striped directory while fail_loc 0x162a
# (OBD_FAIL_LFSCK_LOST_SLAVE_LMV) with fail_val=1 is set on $SINGLEMDS, run
# the namespace LFSCK and expect exactly one striped_shards_repaired in the
# lfsck_namespace status read from facet mds2; the directory must be
# removable afterwards.
#
# Skipped (returns 0) when fewer than 2 MDTs are configured.
# Globals read: MDSCOUNT DIR tdir LFS LCTL SINGLEMDS START_NAMESPACE
#
# NOTE(review): the function header and the closing brace were absent from
# the reviewed text; they are restored here and the name is taken from the
# "run_test 31f" call that follows this block.
test_31f() {
	# Explicit if/return: the skip must end the test even when skip()
	# itself returns non-zero.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on differnt MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x162a fail_val=1
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	# START_NAMESPACE holds a whole command line; it must stay unquoted.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	# Resolve the service name of mds2 once; it is used twice below.
	local mds2_svc
	mds2_svc=$(facet_svc mds2)

	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.${mds2_svc}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(3) unexpected status"

	# Declared separately so "local" does not mask the pipeline status.
	local repaired
	repaired=$(do_facet mds2 $LCTL get_param -n \
		"mdd.${mds2_svc}.lfsck_namespace" |
		awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
4129 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test_31g: create a striped directory while fail_loc 0x162b
# (OBD_FAIL_LFSCK_BAD_SLAVE_LMV) with fail_val=0 is set on $SINGLEMDS, run
# the namespace LFSCK and expect exactly one striped_shards_repaired; after
# a client remount a file can be created and removed in the directory and
# the directory itself can be removed.
#
# Skipped (returns 0) when fewer than 2 MDTs are configured.
# Globals read: MDSCOUNT DIR tdir MOUNT LFS LCTL SINGLEMDS MDT_DEV
#               START_NAMESPACE SHOW_NAMESPACE
#
# NOTE(review): the function header and the closing braces were absent from
# the reviewed text; they are restored here and the name is taken from the
# "run_test 31g" call that follows this block.
test_31g() {
	# Explicit if/return: the skip must end the test even when skip()
	# itself returns non-zero.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the stripe index in the slave LMV EA is"
	echo "corrupted. The LFSCK should repair the slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave LMV EA on the first shard of the striped directory"
	echo "claims the same index as the second shard claims"

	#define OBD_FAIL_LFSCK_BAD_SLAVE_LMV	0x162b
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x162b fail_val=0
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the slave LMV EA"
	# START_NAMESPACE holds a whole command line; it must stay unquoted.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_update_facet "$SINGLEMDS" "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state first so the log shows why we failed.
		# NOTE(review): only the error call of this group was visible
		# in the reviewed text.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	# Declared separately so "local" does not mask the pipeline status.
	local repaired
	repaired=$($SHOW_NAMESPACE |
		awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair slave LMV EA: $repaired"

	# Remount the client before using the directory again.
	umount_client "$MOUNT" || error "(5) umount failed"
	mount_client "$MOUNT" || error "(6) mount failed"

	touch "$DIR/$tdir/striped_dir/foo" ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f "$DIR/$tdir/striped_dir/foo" ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
4180 run_test 31g "Repair the corrupted slave LMV EA"
# test_31h: create a striped directory while fail_loc 0x162c
# (OBD_FAIL_LFSCK_BAD_SLAVE_NAME) with fail_val=0 is set on $SINGLEMDS, run
# the namespace LFSCK and expect exactly one dirent_repaired; after a client
# remount a file can be created and removed in the directory and the
# directory itself can be removed.
#
# Skipped (returns 0) when fewer than 2 MDTs are configured.
# Globals read: MDSCOUNT DIR tdir MOUNT LFS LCTL SINGLEMDS MDT_DEV
#               START_NAMESPACE SHOW_NAMESPACE
#
# NOTE(review): the function header and the closing braces were absent from
# the reviewed text; they are restored here and the name is taken from the
# "run_test 31h" call that follows this block.
test_31h() {
	# Explicit if/return: the skip must end the test even when skip()
	# itself returns non-zero.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the shard's name entry in the striped"
	echo "directory may be corrupted. The LFSCK should repair the"
	echo "bad shard's name entry."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "first shard's name entry in the striped directory claims"
	echo "the same index as the second shard's name entry claims."

	#define OBD_FAIL_LFSCK_BAD_SLAVE_NAME	0x162c
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x162c fail_val=0
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the shard's name entry"
	# START_NAMESPACE holds a whole command line; it must stay unquoted.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_update_facet "$SINGLEMDS" "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state first so the log shows why we failed.
		# NOTE(review): only the error call of this group was visible
		# in the reviewed text.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	# Declared separately so "local" does not mask the pipeline status.
	local repaired
	repaired=$($SHOW_NAMESPACE |
		awk '/^dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair shard's name entry: $repaired"

	# Remount the client before using the directory again.
	umount_client "$MOUNT" || error "(5) umount failed"
	mount_client "$MOUNT" || error "(6) mount failed"

	touch "$DIR/$tdir/striped_dir/foo" ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f "$DIR/$tdir/striped_dir/foo" ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
4232 run_test 31h "Repair the corrupted shard's name entry"
# Put back the MDS/OST sizing and the OST count that were stashed in the
# SAVED_* variables, now that the tests are finished.
OSTCOUNT="${SAVED_OSTCOUNT}"
OSTSIZE="${SAVED_OSTSIZE}"
MDSSIZE="${SAVED_MDSSIZE}"
4239 # cleanup the system at last