# Top-level setup for this LFSCK sanity script: builds the exception list,
# sources the Lustre test framework and configuration, applies server-version
# gates, and defines the device names and command strings the tests reuse.
# NOTE(review): this listing carries embedded line numbers and omits some
# short lines, so a few statements below appear truncated.
3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 #Bug number for excepting test
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# Default LUSTRE to the parent of this script's directory, then source the
# test framework and the configuration file selected by CONFIG (default:
# $LUSTRE/tests/cfg/$NAME.sh).
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Leave with status 0 (not a failure) when require_dsh_mds does not succeed.
23 require_dsh_mds || exit 0
# Remember the incoming size/count settings.
# NOTE(review): presumably restored near the end of the script - not visible here.
27 SAVED_MDSSIZE=${MDSSIZE}
28 SAVED_OSTSIZE=${OSTSIZE}
29 SAVED_OSTCOUNT=${OSTCOUNT}
30 # use small MDS + OST size to speed formatting time
31 # do not use too small MDSSIZE/OSTSIZE, which affect the default journal size
34 # no need too many OSTs, to reduce the format/start/stop overhead
35 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
37 # build up a clean test environment.
# Version gates: an MDS older than 2.3.60 skips the run; the later checks
# append test ids to ALWAYS_EXCEPT when the MDS (<= 2.4.90, < 2.6.50) or
# ost1 (< 2.5.55) is older than the listed version.
41 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
42 skip "Need MDS version at least 2.3.60" && check_and_cleanup_lustre &&
45 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
46 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
48 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
49 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
51 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
52 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
54 # DNE does not support striped directory on zfs-based backend yet.
55 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
56 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Device names and command strings shared by the tests. The START_*, STOP_*
# and SHOW_* values are expanded unquoted by the tests (for example
# "$START_NAMESPACE -r"), so extra lctl options can be appended at the call.
60 MDT_DEV="${FSNAME}-MDT0000"
61 OST_DEV="${FSNAME}-OST0000"
62 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
63 START_NAMESPACE="do_facet $SINGLEMDS \
64 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
65 START_LAYOUT="do_facet $SINGLEMDS \
66 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
67 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
68 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
69 SHOW_NAMESPACE="do_facet $SINGLEMDS \
70 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
71 SHOW_LAYOUT="do_facet $SINGLEMDS \
72 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
73 SHOW_LAYOUT_ON_OST="do_facet ost1 \
74 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# MDT mount options; the NOSCRUB variant adds "noscrub" to user_xattr.
75 MOUNT_OPTS_SCRUB="-o user_xattr"
76 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
# Body of a helper that populates $DIR/$tdir with files and directories.
# It reads nfiles, ndirs and igif, which are assigned on lines not visible in
# this listing; the function header and short lines (else/fi/done) are also
# not visible, so the exact branch structure cannot be confirmed from here.
85 echo "preparing... $nfiles * $ndirs files will be created $(date)."
# A non-empty igif turns on fail_loc 0x1504 (OBD_FAIL_FID_IGIF) for the
# creations below.
86 if [ ! -z $igif ]; then
87 #define OBD_FAIL_FID_IGIF 0x1504
88 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
# Copy the test scripts in as sample content, then create $ndirs "d" and "f"
# entries; when nfiles > 0 every d<i> additionally receives $nfiles "f"
# entries, and a failure there aborts via error.
91 cp $LUSTRE/tests/*.sh $DIR/$tdir/
92 if [ $ndirs -gt 0 ]; then
93 createmany -d $DIR/$tdir/d $ndirs
94 createmany -m $DIR/$tdir/f $ndirs
95 if [ $nfiles -gt 0 ]; then
96 for ((i = 0; i < $ndirs; i++)); do
97 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
98 /dev/null || error "createmany $nfiles"
101 createmany -d $DIR/$tdir/e $ndirs
# With igif set, create one more file and then clear the fault injection.
104 if [ ! -z $igif ]; then
105 touch $DIR/$tdir/dummy
106 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
109 echo "prepared $(date)."
# Body of the test registered below via "run_test 0": drives the namespace
# LFSCK through start, stop, restart and reset, checking the status reported
# by $SHOW_NAMESPACE after each step. The function header, closing brace and
# other short lines are not visible in this listing.
115 #define OBD_FAIL_LFSCK_DELAY1 0x1600
116 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
117 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
119 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
# With fail_loc 0x1600 still set, the scan is expected to be in phase 1.
121 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
122 [ "$STATUS" == "scanning-phase1" ] ||
123 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
# Stop the scan and confirm the status becomes 'stopped'.
125 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
127 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
128 [ "$STATUS" == "stopped" ] ||
129 error "(6) Expect 'stopped', but got '$STATUS'"
# Start again, this time without -r, and expect phase 1 once more.
131 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
133 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
134 [ "$STATUS" == "scanning-phase1" ] ||
135 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fault injection and wait for the status to reach 'completed'.
137 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
138 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
139 mdd.${MDT_DEV}.lfsck_namespace |
140 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
142 error "(9) unexpected status"
# No inconsistency was injected, so updated_phase1 must be 0.
145 local repaired=$($SHOW_NAMESPACE |
146 awk '/^updated_phase1/ { print $2 }')
147 [ $repaired -eq 0 ] ||
148 error "(10) Expect nothing to be repaired, but got: $repaired"
# Reset with -r and run to completion again; success_count must grow by
# exactly one compared with the value sampled before the reset.
150 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
151 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
152 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
153 mdd.${MDT_DEV}.lfsck_namespace |
154 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
156 error "(12) unexpected status"
159 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
160 [ $((scanned1 + 1)) -eq $scanned2 ] ||
161 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
163 echo "stopall, should NOT crash LU-3649"
164 stopall || error "(14) Fail to stopall"
166 run_test 0 "Control LFSCK manually"
# Body of the test registered below via "run_test 1a": creates a file under
# fail_loc 0x1501 (OBD_FAIL_FID_INDIR), runs namespace LFSCK and expects
# exactly one repair, then lists the directory under fail_loc 0x1505.
# The function header and other short lines are not visible in this listing.
# Only runs when the MDS backend is ldiskfs.
169 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
170 skip "OI Scrub not implemented for ZFS" && return
174 #define OBD_FAIL_FID_INDIR 0x1501
175 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
176 touch $DIR/$tdir/dummy
178 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Reset and start the namespace scan, then wait for it to report 'completed'.
180 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
181 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
182 mdd.${MDT_DEV}.lfsck_namespace |
183 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
185 error "(4) unexpected status"
# Prefer the dirent_repaired counter; when it is absent from the output,
# fall back to updated_phase1.
188 local repaired=$($SHOW_NAMESPACE |
189 awk '/^dirent_repaired/ { print $2 }')
190 # for interop with old server
191 [ -z "$repaired" ] &&
192 repaired=$($SHOW_NAMESPACE |
193 awk '/^updated_phase1/ { print $2 }')
195 [ $repaired -eq 1 ] ||
196 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
# Remount the client and list the directory while fail_loc 0x1505
# (OBD_FAIL_FID_LOOKUP) is set; a failing ls is reported as missing
# FID-in-dirent.
198 mount_client $MOUNT || error "(6) Fail to start client!"
200 #define OBD_FAIL_FID_LOOKUP 0x1505
201 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
202 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
204 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
206 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Body of the test registered below via "run_test 1b": creates a file under
# fail_loc 0x1502 (OBD_FAIL_FID_INLMA), runs namespace LFSCK with fail_loc
# 0x1506 (OBD_FAIL_FID_NOLMA) set, expects exactly one repair, then stats the
# file under fail_loc 0x1505. The function header and other short lines are
# not visible in this listing.
# Only runs when the MDS backend is ldiskfs.
210 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
211 skip "OI Scrub not implemented for ZFS" && return
215 #define OBD_FAIL_FID_INLMA 0x1502
216 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
217 touch $DIR/$tdir/dummy
219 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# fail_loc 0x1506 stays set for the whole scan and is cleared only after the
# repair counter has been checked.
221 #define OBD_FAIL_FID_NOLMA 0x1506
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
223 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
224 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
225 mdd.${MDT_DEV}.lfsck_namespace |
226 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
228 error "(4) unexpected status"
# Prefer the dirent_repaired counter; when it is absent from the output,
# fall back to updated_phase1.
231 local repaired=$($SHOW_NAMESPACE |
232 awk '/^dirent_repaired/ { print $2 }')
233 # for interop with old server
234 [ -z "$repaired" ] &&
235 repaired=$($SHOW_NAMESPACE |
236 awk '/^updated_phase1/ { print $2 }')
238 [ $repaired -eq 1 ] ||
239 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
241 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
242 mount_client $MOUNT || error "(6) Fail to start client!"
# stat the file while fail_loc 0x1505 (OBD_FAIL_FID_LOOKUP) is set; a failing
# stat is reported as missing FID-in-LMA.
244 #define OBD_FAIL_FID_LOOKUP 0x1505
245 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
246 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
248 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
250 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Body of the test registered below via "run_test 2a": creates a file under
# fail_loc 0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH), runs namespace LFSCK,
# expects exactly one linkEA repair, and verifies the path <-> FID round trip.
# The function header and other short lines are not visible in this listing.
255 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
256 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
257 touch $DIR/$tdir/dummy
259 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Reset and start the namespace scan, then wait for it to report 'completed'.
261 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
262 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
263 mdd.${MDT_DEV}.lfsck_namespace |
264 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
266 error "(4) unexpected status"
# Prefer the linkea_repaired counter; when it is absent from the output,
# fall back to updated_phase2.
269 local repaired=$($SHOW_NAMESPACE |
270 awk '/^linkea_repaired/ { print $2 }')
271 # for interop with old server
272 [ -z "$repaired" ] &&
273 repaired=$($SHOW_NAMESPACE |
274 awk '/^updated_phase2/ { print $2 }')
276 [ $repaired -eq 1 ] ||
277 error "(5) Fail to repair crashed linkEA: $repaired"
# From the client: the file must show a link count of 1, and converting its
# path to a FID and back must yield the original path.
279 mount_client $MOUNT || error "(6) Fail to start client!"
281 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
282 error "(7) Fail to stat $DIR/$tdir/dummy"
284 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
285 local dummyname=$($LFS fid2path $DIR $dummyfid)
286 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
287 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
289 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Body of the test registered below via "run_test 2b": creates a file under
# fail_loc 0x1604 (OBD_FAIL_LFSCK_LINKEA_MORE), runs namespace LFSCK, expects
# updated_phase2 to be exactly 1, and verifies the path <-> FID round trip.
# The function header and other short lines are not visible in this listing.
295 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
297 touch $DIR/$tdir/dummy
299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Reset and start the namespace scan, then wait for it to report 'completed'.
301 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
302 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
303 mdd.${MDT_DEV}.lfsck_namespace |
304 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
306 error "(4) unexpected status"
# Unlike test 2a, only the updated_phase2 counter is consulted here.
309 local repaired=$($SHOW_NAMESPACE |
310 awk '/^updated_phase2/ { print $2 }')
311 [ $repaired -eq 1 ] ||
312 error "(5) Fail to repair crashed linkEA: $repaired"
# From the client: the file must show a link count of 1, and converting its
# path to a FID and back must yield the original path.
314 mount_client $MOUNT || error "(6) Fail to start client!"
316 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
317 error "(7) Fail to stat $DIR/$tdir/dummy"
319 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
320 local dummyname=$($LFS fid2path $DIR $dummyfid)
321 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
322 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
324 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Body of the test registered below via "run_test 2c": creates a file under
# fail_loc 0x1605 (OBD_FAIL_LFSCK_LINKEA_MORE2), runs namespace LFSCK, expects
# updated_phase2 to be exactly 1, and verifies the path <-> FID round trip.
# The function header and other short lines are not visible in this listing.
330 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
331 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
332 touch $DIR/$tdir/dummy
334 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Reset and start the namespace scan, then wait for it to report 'completed'.
336 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
337 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
338 mdd.${MDT_DEV}.lfsck_namespace |
339 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
341 error "(4) unexpected status"
# Only the updated_phase2 counter is consulted here.
344 local repaired=$($SHOW_NAMESPACE |
345 awk '/^updated_phase2/ { print $2 }')
346 [ $repaired -eq 1 ] ||
347 error "(5) Fail to repair crashed linkEA: $repaired"
# From the client: the file must show a link count of 1, and converting its
# path to a FID and back must yield the original path.
349 mount_client $MOUNT || error "(6) Fail to start client!"
351 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
352 error "(7) Fail to stat $DIR/$tdir/dummy"
354 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
355 local dummyname=$($LFS fid2path $DIR $dummyfid)
356 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
357 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
359 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Body of the test registered below via "run_test 2d": creates a file under
# fail_loc 0x161d (OBD_FAIL_LFSCK_NO_LINKEA), runs namespace LFSCK, expects
# linkea_repaired to be exactly 1, and verifies the path <-> FID round trip.
# The function header and other short lines are not visible in this listing.
365 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
366 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
367 touch $DIR/$tdir/dummy
369 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Reset and start the namespace scan, then wait for it to report 'completed'.
371 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
372 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
373 mdd.${MDT_DEV}.lfsck_namespace |
374 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
376 error "(4) unexpected status"
# Only the linkea_repaired counter is consulted here (no fallback).
379 local repaired=$($SHOW_NAMESPACE |
380 awk '/^linkea_repaired/ { print $2 }')
381 [ $repaired -eq 1 ] ||
382 error "(5) Fail to repair crashed linkEA: $repaired"
# From the client: the file must show a link count of 1, and converting its
# path to a FID and back must yield the original path.
384 mount_client $MOUNT || error "(6) Fail to start client!"
386 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
387 error "(7) Fail to stat $DIR/$tdir/dummy"
389 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
390 local dummyname=$($LFS fid2path $DIR $dummyfid)
391 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
392 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
394 run_test 2d "LFSCK can recover the missing linkEA entry"
# Body of the test registered below via "run_test 2e": with at least two
# MDSes, creates d0 with "-i 1" and d0/d1 with "-i 0" while fail_loc 0x1603
# (OBD_FAIL_LFSCK_LINKEA_CRASH) is set, runs namespace LFSCK and expects
# exactly one linkEA repair. The function header and other short lines are
# not visible in this listing.
398 [ $MDSCOUNT -lt 2 ] &&
399 skip "We need at least 2 MDSes for this test" && return
403 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
405 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
406 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
407 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
408 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): -A presumably extends the scan to all MDTs - confirm against
# the lctl lfsck_start documentation.
410 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
411 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
412 mdd.${MDT_DEV}.lfsck_namespace |
413 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
415 error "(4) unexpected status"
# Exactly one linkEA repair is expected on $MDT_DEV.
418 local repaired=$($SHOW_NAMESPACE |
419 awk '/^linkea_repaired/ { print $2 }')
420 [ $repaired -eq 1 ] ||
421 error "(5) Fail to repair crashed linkEA: $repaired"
# Converting d0/d1 to a FID and back must yield the original path.
423 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
424 local name=$($LFS fid2path $DIR $fid)
425 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
426 error "(6) Fail to repair linkEA: $fid $name"
428 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Body of the test registered below via "run_test 3": builds hard links, two
# of them while linkEA fault injections are active, then runs namespace LFSCK
# and checks the phase-2 counters. It relies on $DIR/$tdir/d0/f0 and f1
# already existing (created on lines not visible here). The function header
# and other short lines are not visible in this listing.
434 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
435 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
436 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
438 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
439 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
440 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
# Link w0 under fail_loc 0x1603 and w1 under fail_loc 0x1604, then clear the
# fault injection.
442 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
443 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
444 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
446 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
447 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
448 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
450 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Reset and start the namespace scan, then wait for it to report 'completed'.
452 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
453 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
454 mdd.${MDT_DEV}.lfsck_namespace |
455 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
457 error "(10) unexpected status"
# Phase 2 must have checked at least 4 objects and repaired at least 2
# multiple-linked objects.
460 local checked=$($SHOW_NAMESPACE |
461 awk '/^checked_phase2/ { print $2 }')
462 [ $checked -ge 4 ] ||
463 error "(11) Fail to check multiple-linked object: $checked"
465 local repaired=$($SHOW_NAMESPACE |
466 awk '/^multiple_linked_repaired/ { print $2 }')
467 [ $repaired -ge 2 ] ||
468 error "(12) Fail to repair multiple-linked object: $repaired"
470 run_test 3 "LFSCK can verify multiple-linked objects"
# Body of the test registered below via "run_test 4": after an MDT
# backup/restore the MDS is started with noscrub, namespace LFSCK is run, and
# the test checks the flags, status and repair counters it reports.
# The function header and other short lines are not visible in this listing.
# Only runs when the MDS backend is ldiskfs.
474 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
475 skip "OI Scrub not implemented for ZFS" && return
# Take the client and the MDS down, run the backup/restore helper, then bring
# the MDS back with the noscrub mount options.
478 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
479 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
481 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
482 echo "start $SINGLEMDS with disabling OI scrub"
483 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
484 error "(2) Fail to start MDS!"
# With fail_loc 0x1601 set, wait until the flags field reads 'inconsistent'
# and confirm the scan is still in phase 1.
486 #define OBD_FAIL_LFSCK_DELAY2 0x1601
487 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
488 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
489 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
490 mdd.${MDT_DEV}.lfsck_namespace |
491 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
493 error "(5) unexpected status"
496 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
497 [ "$STATUS" == "scanning-phase1" ] ||
498 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fault injection and wait for the status to reach 'completed'.
500 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
501 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
502 mdd.${MDT_DEV}.lfsck_namespace |
503 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
505 error "(7) unexpected status"
# After completion the flags field must be empty.
508 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
509 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer dirent_repaired, falling back to updated_phase1 when it is absent;
# at least 9 repairs are expected.
511 local repaired=$($SHOW_NAMESPACE |
512 awk '/^dirent_repaired/ { print $2 }')
513 # for interop with old server
514 [ -z "$repaired" ] &&
515 repaired=$($SHOW_NAMESPACE |
516 awk '/^updated_phase1/ { print $2 }')
518 [ $repaired -ge 9 ] ||
519 error "(9) Fail to re-generate FID-in-dirent: $repaired"
# Remount the client and list the directory under fail_loc 0x1505.
521 mount_client $MOUNT || error "(10) Fail to start client!"
523 #define OBD_FAIL_FID_LOOKUP 0x1505
524 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
525 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
526 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
528 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Body of the test registered below via "run_test 5": same flow as the
# backup/restore test above it, but mds_backup_restore receives an extra
# argument "1" and the flags are expected to read 'inconsistent,upgrade'.
# The function header and other short lines are not visible in this listing.
# Only runs when the MDS backend is ldiskfs.
532 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
533 skip "OI Scrub not implemented for ZFS" && return
# Take the client and the MDS down, run the backup/restore helper, then bring
# the MDS back with the noscrub mount options.
536 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
537 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
539 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
540 echo "start $SINGLEMDS with disabling OI scrub"
541 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
542 error "(2) Fail to start MDS!"
# With fail_loc 0x1601 set, wait until the flags field reads
# 'inconsistent,upgrade' and confirm the scan is still in phase 1.
544 #define OBD_FAIL_LFSCK_DELAY2 0x1601
545 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
546 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
547 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
548 mdd.${MDT_DEV}.lfsck_namespace |
549 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
551 error "(5) unexpected status"
554 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
555 [ "$STATUS" == "scanning-phase1" ] ||
556 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fault injection and wait for the status to reach 'completed'.
558 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
559 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
560 mdd.${MDT_DEV}.lfsck_namespace |
561 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
563 error "(7) unexpected status"
# After completion the flags field must be empty.
566 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
567 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer dirent_repaired, falling back to updated_phase1 when it is absent;
# at least 2 repairs are expected.
569 local repaired=$($SHOW_NAMESPACE |
570 awk '/^dirent_repaired/ { print $2 }')
571 # for interop with old server
572 [ -z "$repaired" ] &&
573 repaired=$($SHOW_NAMESPACE |
574 awk '/^updated_phase1/ { print $2 }')
576 [ $repaired -ge 2 ] ||
577 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
# Remount the client; both stat of the file and ls of the directory must
# succeed while fail_loc 0x1505 is set.
579 mount_client $MOUNT || error "(10) Fail to start client!"
581 #define OBD_FAIL_FID_LOOKUP 0x1505
582 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
583 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
585 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
# With the fault injection cleared, converting the file's path to a FID and
# back must yield the original path.
587 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
588 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
589 local dummyname=$($LFS fid2path $DIR $dummyfid)
590 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
591 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
593 run_test 5 "LFSCK can handle IGIF object upgrading"
# Body of the test registered below via "run_test 6a": forces a running
# namespace scan to fail, restarts it without -r, and checks that the reported
# latest_start_position is greater than the earlier last_checkpoint_position.
# The function header and other short lines (including the tails of the two
# POS pipelines) are not visible in this listing.
598 #define OBD_FAIL_LFSCK_DELAY1 0x1600
599 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
600 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
# With fail_loc 0x1600 set, the scan is expected to be in phase 1.
602 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
603 [ "$STATUS" == "scanning-phase1" ] ||
604 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
606 # Sleep 3 sec to guarantee at least one object processed by LFSCK
608 # Fail the LFSCK to guarantee there is at least one checkpoint
609 #define OBD_FAIL_LFSCK_FATAL1 0x1608
610 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
611 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
612 mdd.${MDT_DEV}.lfsck_namespace |
613 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
615 error "(4) unexpected status"
# POS0: second field of the last_checkpoint_position line after the failure.
618 local POS0=$($SHOW_NAMESPACE |
619 awk '/^last_checkpoint_position/ { print $2 }' |
622 #define OBD_FAIL_LFSCK_DELAY1 0x1600
623 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
624 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
# The restart (no -r) must put the scan back into phase 1.
626 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
627 [ "$STATUS" == "scanning-phase1" ] ||
628 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# POS1: second field of the latest_start_position line; it must exceed POS0.
630 local POS1=$($SHOW_NAMESPACE |
631 awk '/^latest_start_position/ { print $2 }' |
633 [[ $POS0 -lt $POS1 ]] ||
634 error "(7) Expect larger than: $POS0, but got $POS1"
# Clear the fault injection and wait for the status to reach 'completed'.
636 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
637 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
638 mdd.${MDT_DEV}.lfsck_namespace |
639 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
641 error "(8) unexpected status"
644 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Body of the test registered below via "run_test 6b": like the preceding
# checkpoint test, but uses fail_loc 0x1601/0x80001609 and compares both the
# second field (O_POS*) and the fourth field (D_POS*) of the position lines.
# The function header and other short lines (including the tails of the
# O_POS pipelines and the else/fi of the comparison) are not visible in this
# listing.
649 #define OBD_FAIL_LFSCK_DELAY2 0x1601
650 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
651 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
# With fail_loc 0x1601 set, the scan is expected to be in phase 1.
653 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
654 [ "$STATUS" == "scanning-phase1" ] ||
655 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
657 # Sleep 5 sec to guarantee that we are in the directory scanning
659 # Fail the LFSCK to guarantee there is at least one checkpoint
660 #define OBD_FAIL_LFSCK_FATAL2 0x1609
661 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
662 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
663 mdd.${MDT_DEV}.lfsck_namespace |
664 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
666 error "(4) unexpected status"
# Positions recorded at the checkpoint taken when the scan failed.
669 local O_POS0=$($SHOW_NAMESPACE |
670 awk '/^last_checkpoint_position/ { print $2 }' |
673 local D_POS0=$($SHOW_NAMESPACE |
674 awk '/^last_checkpoint_position/ { print $4 }')
676 #define OBD_FAIL_LFSCK_DELAY2 0x1601
677 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
678 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
# The restart (no -r) must put the scan back into phase 1.
680 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
681 [ "$STATUS" == "scanning-phase1" ] ||
682 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
684 local O_POS1=$($SHOW_NAMESPACE |
685 awk '/^latest_start_position/ { print $2 }' |
687 local D_POS1=$($SHOW_NAMESPACE |
688 awk '/^latest_start_position/ { print $4 }')
# If either fourth-field value is "N/A", compare the second-field values.
# NOTE(review): the D_POS comparison further down is presumably the else
# branch; the else/fi lines are not visible here.
690 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
691 [[ $O_POS0 -lt $O_POS1 ]] ||
692 error "(7.1) $O_POS1 is not larger than $O_POS0"
694 [[ $D_POS0 -lt $D_POS1 ]] ||
695 error "(7.2) $D_POS1 is not larger than $D_POS0"
# Clear the fault injection and wait for the status to reach 'completed'.
698 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
699 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
700 mdd.${MDT_DEV}.lfsck_namespace |
701 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
703 error "(8) unexpected status"
706 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Body of the test registered below via "run_test 7a": stops the MDS while a
# namespace scan is in phase 1, starts it again, and expects the scan to
# reach 'completed' without an explicit lfsck_start.
# The function header and other short lines are not visible in this listing.
713 #define OBD_FAIL_LFSCK_DELAY2 0x1601
714 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
715 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
# With fail_loc 0x1601 set, the scan is expected to be in phase 1.
717 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
718 [ "$STATUS" == "scanning-phase1" ] ||
719 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
721 # Sleep 3 sec to guarantee at least one object processed by LFSCK
723 echo "stop $SINGLEMDS"
724 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
# Clear the fault injection before the MDS is started again.
726 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
727 echo "start $SINGLEMDS"
728 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
729 error "(5) Fail to start MDS!"
# No lfsck_start is issued after the remount; the status must still reach
# 'completed'.
731 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
732 mdd.${MDT_DEV}.lfsck_namespace |
733 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
735 error "(6) unexpected status"
738 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Body of the test registered below via "run_test 7b": creates 20 files under
# fail_loc 0x1604, lets the namespace scan reach phase 2 under fail_loc
# 0x1602, stops and restarts the MDS, and expects the scan to reach
# 'completed' without an explicit lfsck_start.
# The function header and other short lines are not visible in this listing.
744 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
745 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
746 for ((i = 0; i < 20; i++)); do
747 touch $DIR/$tdir/dummy${i}
750 #define OBD_FAIL_LFSCK_DELAY3 0x1602
751 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
752 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
753 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
754 mdd.${MDT_DEV}.lfsck_namespace |
755 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
757 error "(4) unexpected status"
# Stop the MDS while the scan is in phase 2.
761 echo "stop $SINGLEMDS"
762 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
# Clear the fault injection before the MDS is started again.
764 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
765 echo "start $SINGLEMDS"
766 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
767 error "(6) Fail to start MDS!"
# No lfsck_start is issued after the remount; the status must still reach
# 'completed'.
769 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
770 mdd.${MDT_DEV}.lfsck_namespace |
771 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
773 error "(7) unexpected status"
776 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Body of the test registered below via "run_test 8": walks the namespace
# LFSCK through the statuses init, scanning-phase1, stopped, failed, crashed,
# paused, scanning-phase2 and completed, asserting each one in turn.
# The function header and other short lines (done, sleep, timer updates) are
# not visible in this listing; "timer" is assigned on lines not shown here.
781 formatall > /dev/null
# Right after a reformat the namespace LFSCK status is expected to be 'init'.
787 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
788 [ "$STATUS" == "init" ] ||
789 error "(2) Expect 'init', but got '$STATUS'"
791 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
792 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
793 mkdir $DIR/$tdir/crashed
795 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
796 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
797 for ((i = 0; i < 5; i++)); do
798 touch $DIR/$tdir/dummy${i}
801 umount_client $MOUNT || error "(3) Fail to stop client!"
# First start (no -r) under fail_loc 0x1601: expect phase 1.
803 #define OBD_FAIL_LFSCK_DELAY2 0x1601
804 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
805 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
807 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
808 [ "$STATUS" == "scanning-phase1" ] ||
809 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
# Explicit stop: expect 'stopped'.
811 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
813 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
814 [ "$STATUS" == "stopped" ] ||
815 error "(7) Expect 'stopped', but got '$STATUS'"
# Start again: expect phase 1, then force a failure via 0x80001609.
817 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
819 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
820 [ "$STATUS" == "scanning-phase1" ] ||
821 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
823 #define OBD_FAIL_LFSCK_FATAL2 0x1609
824 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
825 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
826 mdd.${MDT_DEV}.lfsck_namespace |
827 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
829 error "(10) unexpected status"
832 #define OBD_FAIL_LFSCK_DELAY1 0x1600
833 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
834 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
836 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
837 [ "$STATUS" == "scanning-phase1" ] ||
838 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
# Stop the MDS with fail_loc 0x160a (OBD_FAIL_LFSCK_CRASH) set, then start it
# with fail_loc 0x160b (OBD_FAIL_LFSCK_NO_AUTO) set.
840 #define OBD_FAIL_LFSCK_CRASH 0x160a
841 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
844 echo "stop $SINGLEMDS"
845 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
847 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
848 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
850 echo "start $SINGLEMDS"
851 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
852 error "(14) Fail to start MDS!"
# Poll the MDT recovery_status until it is no longer RECOVERING, bounded by
# max_recovery_time; hitting the bound is reported as a recovery timeout.
854 local timeout=$(max_recovery_time)
857 while [ $timer -lt $timeout ]; do
858 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
859 mdt.${MDT_DEV}.recovery_status |
860 awk '/^status/ { print \\\$2 }'")
861 [ "$STATUS" != "RECOVERING" ] && break;
866 [ $timer != $timeout ] ||
867 error "(14.1) recovery timeout"
# After that restart the scan must be reported as 'crashed'.
869 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
870 [ "$STATUS" == "crashed" ] ||
871 error "(15) Expect 'crashed', but got '$STATUS'"
873 #define OBD_FAIL_LFSCK_DELAY2 0x1601
874 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
875 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
877 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
878 [ "$STATUS" == "scanning-phase1" ] ||
879 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
# Second MDS stop/start, this time without the crash fail_loc; the scan is
# expected to come back as 'paused'.
881 echo "stop $SINGLEMDS"
882 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
884 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
885 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
887 echo "start $SINGLEMDS"
888 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
889 error "(19) Fail to start MDS!"
# Same bounded recovery poll as after the first restart.
892 while [ $timer -lt $timeout ]; do
893 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
894 mdt.${MDT_DEV}.recovery_status |
895 awk '/^status/ { print \\\$2 }'")
896 [ "$STATUS" != "RECOVERING" ] && break;
901 [ $timer != $timeout ] ||
902 error "(19.1) recovery timeout"
904 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
905 [ "$STATUS" == "paused" ] ||
906 error "(20) Expect 'paused', but got '$STATUS'"
908 #define OBD_FAIL_LFSCK_DELAY3 0x1602
909 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
911 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
912 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
913 mdd.${MDT_DEV}.lfsck_namespace |
914 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
916 error "(22) unexpected status"
# While in phase 2 the flags must read exactly 'scanned-once,inconsistent'.
919 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
920 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
921 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
# Clear the fault injection, wait for 'completed', and expect empty flags.
923 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
924 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
925 mdd.${MDT_DEV}.lfsck_namespace |
926 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
928 error "(24) unexpected status"
931 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
932 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
934 run_test 8 "LFSCK state machine"
# Body of the test registered below via "run_test 9a": runs layout LFSCK with
# a speed limit, raises the limit mid-scan, and checks that the reported
# average_speed_phase1 stays inside computed bounds.
# The function header and short lines are not visible in this listing;
# server_version, RUN_TIME1, RUN_TIME2 and TIME_DIFF are assigned on lines
# not shown here, as are the sleeps between the measurements (presumably).
# Skip when /proc/cpuinfo has no processor entry numbered 1.
937 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
938 skip "Testing on UP system, the speed may be inaccurate."
942 [[ $server_version -ge $(version_code 2.7.50) ]] ||
943 { skip "Need MDS version >= 2.7.50"; return; }
# Create 5000 files in a directory whose default layout is one stripe with
# stripe index -1.
946 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
947 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
948 createmany -o $DIR/$tdir/lfsck/f 5000
# First measurement: start with "-s 100" and expect phase 1.
950 local BASE_SPEED1=100
952 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
955 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
956 [ "$STATUS" == "scanning-phase1" ] ||
957 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
959 local SPEED=$($SHOW_LAYOUT |
960 awk '/^average_speed_phase1/ { print $2 }')
962 # There may be time error, normally it should be less than 2 seconds.
963 # We allow another 20% schedule error.
965 # MAX_MARGIN = 1.2 = 12 / 10
966 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
967 RUN_TIME1 * 12 / 10))
968 [ $SPEED -lt $MAX_SPEED ] ||
969 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
# Second measurement: raise lfsck_speed_limit to 300 on the running scan and
# check the average against a lower and an upper bound.
972 local BASE_SPEED2=300
974 do_facet $SINGLEMDS \
975 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
978 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
979 # MIN_MARGIN = 0.8 = 8 / 10
980 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
981 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
982 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
983 [ $SPEED -gt $MIN_SPEED ] || {
984 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
985 error_ignore LU-5624 \
986 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
989 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
993 # MAX_MARGIN = 1.2 = 12 / 10
994 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
995 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
996 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
997 [ $SPEED -lt $MAX_SPEED ] ||
998 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
# Set the speed limit to 0 and wait for the layout scan to report 'completed'.
1000 do_facet $SINGLEMDS \
1001 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1003 wait_update_facet $SINGLEMDS \
1004 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1005 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1006 error "(7) Failed to get expected 'completed'"
1008 run_test 9a "LFSCK speed control (1)"
# Test 9b: speed control of the namespace LFSCK while it is in
# scanning-phase2. The numbers in the error messages, (4)..(11), identify
# the step that failed.
# NOTE(review): server_version, RUN_TIME1, RUN_TIME2 and TIME_DIFF are read
# below but are not assigned in the lines shown here - confirm where they
# are set.
#
# Skip when /proc/cpuinfo has no line matching "processor.*: 1", because
# the measured speed may be inaccurate on such a system.
1011 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1012 skip "Testing on UP system, the speed may be inaccurate."
# Skip unless the MDS version is at least 2.7.50.
1016 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1017 { skip "Need MDS version >= 2.7.50"; return; }
# Populate $DIR/$tdir with 50 directories and 50 files, plus 50 files in
# each directory, while fail_loc 0x1604 is set on the MDS.
1021 echo "Preparing another 50 * 50 files (with error) at $(date)."
1022 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1023 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1024 createmany -d $DIR/$tdir/d 50
1025 createmany -m $DIR/$tdir/f 50
1026 for ((i = 0; i < 50; i++)); do
1027 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# First pass: with fail_loc 0x160c set, restart the namespace LFSCK from
# scratch (-r) and wait (limit argument 10) for the status "stopped".
1030 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1031 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1032 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1033 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1034 mdd.${MDT_DEV}.lfsck_namespace |
1035 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1037 error "(5) unexpected status"
1040 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1041 echo "Prepared at $(date)."
# Second pass: start again (no -r) with a speed limit of BASE_SPEED1 and
# expect the status to be scanning-phase2.
1043 local BASE_SPEED1=50
1045 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1048 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1049 [ "$STATUS" == "scanning-phase2" ] ||
1050 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
# Read the average phase-2 speed reported by the LFSCK.
1052 local SPEED=$($SHOW_NAMESPACE |
1053 awk '/^average_speed_phase2/ { print $2 }')
1054 # There may be a timing error; normally it is less than 2 seconds.
1055 # We allow another 20% schedule error.
1057 # MAX_MARGIN = 1.2 = 12 / 10
# Upper bound: BASE_SPEED1 scaled by (RUN_TIME1 + TIME_DIFF) / RUN_TIME1,
# then by 12/10, all in integer arithmetic.
1058 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1059 RUN_TIME1 * 12 / 10))
1060 [ $SPEED -lt $MAX_SPEED ] ||
1061 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1063 # adjust speed limit
1064 local BASE_SPEED2=150
1066 do_facet $SINGLEMDS \
1067 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1070 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1071 # MIN_MARGIN = 0.8 = 8 / 10
# Lower bound: time-weighted mean of the two limits, with TIME_DIFF
# subtracted from each run time, scaled by 8/10.
1072 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1073 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1074 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# On a non-ldiskfs MDS a too-low speed is reported through error_ignore
# (tagged LU-5624), message (9.1); message (9.2) is the other case.
1075 [ $SPEED -gt $MIN_SPEED ] || {
1076 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1077 error_ignore LU-5624 \
1078 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1081 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1085 # MAX_MARGIN = 1.2 = 12 / 10
# Upper bound: same weighted mean with TIME_DIFF added, scaled by 12/10.
1086 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1087 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1088 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1089 [ $SPEED -lt $MAX_SPEED ] ||
1090 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
# Set the speed limit to 0 and wait (limit argument 32) for "completed".
1092 do_facet $SINGLEMDS \
1093 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1094 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1095 mdd.${MDT_DEV}.lfsck_namespace |
1096 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1098 error "(11) unexpected status"
1101 run_test 9b "LFSCK speed control (2)"
# Test 10: ordinary client operations must keep working while a
# rate-limited namespace LFSCK is still in scanning-phase1.
#
# Skip unless the MDS backend is ldiskfs.
1105 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1106 skip "lookup(..)/linkea on ZFS issue" && return
# Build d0..d999 / f0..f999 with 5 files in each directory: even indexes
# are created with fail_loc 0x1603 set, odd indexes with fail_loc 0x1604.
1110 echo "Preparing more files with error at $(date)."
1111 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1112 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1114 for ((i = 0; i < 1000; i = $((i+2)))); do
1115 mkdir -p $DIR/$tdir/d${i}
1116 touch $DIR/$tdir/f${i}
1117 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1120 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1121 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1123 for ((i = 1; i < 1000; i = $((i+2)))); do
1124 mkdir -p $DIR/$tdir/d${i}
1125 touch $DIR/$tdir/f${i}
1126 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1129 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1130 echo "Prepared at $(date)."
# Add a second hard link to f200 after the fail_loc has been cleared.
1132 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1134 umount_client $MOUNT
1135 mount_client $MOUNT || error "(3) Fail to start client!"
# Restart the namespace LFSCK from scratch with a speed limit of 100.
1137 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1140 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1141 [ "$STATUS" == "scanning-phase1" ] ||
1142 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Exercise ls, create, mkdir, unlink, recursive remove, rename, hard link
# and symlink on the mounted file system; each must succeed.
1144 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1146 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1148 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1150 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1152 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1154 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1156 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1158 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1159 error "(14) Fail to softlink!"
# The scan is expected to still be in phase 1 after those operations.
1161 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1162 [ "$STATUS" == "scanning-phase1" ] ||
1163 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set the speed limit to 0 and wait (limit argument 32) for "completed".
1165 do_facet $SINGLEMDS \
1166 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1167 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1168 mdd.${MDT_DEV}.lfsck_namespace |
1169 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1171 error "(16) unexpected status"
1174 run_test 10 "System is available during LFSCK scanning"
# Remove O/${idx}/LAST_ID and O/${idx}/d0/0 from the backing file system of
# ost${ost}, mounting it locally for the duration of the removal.
# Returns 1 if the local mount fails and 2 if the unmount fails.
# NOTE(review): ost and idx are not assigned in the lines shown here;
# presumably they come from $1 and $2 (a caller in this file passes "1 0")
# - confirm.
1177 ost_remove_lastid() {
# Prefix used to run the rm on the node that serves ost${ost}.
1180 local rcmd="do_facet ost${ost}"
1182 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1184 # step 1: local mount
1185 mount_fstype ost${ost} || return 1
1186 # step 2: remove the specified LAST_ID
# The brace expansion names two paths: .../LAST_ID and .../d0/0.
# The exit status of the rm itself is not checked.
1187 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1189 unmount_fstype ost${ost} || return 2
# Test 11a: after LAST_ID has been removed from ost1, a layout LFSCK on the
# OST must flag the loss (crashed_lastid) and then clear the flag again.
#
# Create 64 single-stripe files on OST index 0.
1193 check_mount_and_prep
1194 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1195 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Remove LAST_ID for ost1, sequence directory 0.
1200 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1202 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1203 error "(2) Fail to start ost1"
# Set fail_loc 0x160e with fail_val=3 on ost1 before starting the scan.
1205 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1206 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1208 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1209 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# While the fail_loc is set, the flags line must read "crashed_lastid".
1211 wait_update_facet ost1 "$LCTL get_param -n \
1212 obdfilter.${OST_DEV}.lfsck_layout |
1213 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1215 error "(5) unexpected status"
# Clear the fail_loc and wait for the scan to report "completed".
1218 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1220 wait_update_facet ost1 "$LCTL get_param -n \
1221 obdfilter.${OST_DEV}.lfsck_layout |
1222 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1224 error "(6) unexpected status"
# Once completed, the flags line must be empty.
1227 echo "the LAST_ID(s) should have been rebuilt"
1228 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1229 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1231 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b: when the on-disk LAST_ID on ost1 is smaller than the value
# seen before the restart, a layout LFSCK on the OST must bring it back to
# that earlier value.
1234 check_mount_and_prep
1235 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1237 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1238 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1239 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than the current precreated object count.
1241 local count=$(precreated_ost_obj_count 0 0)
1243 createmany -o $DIR/$tdir/f $((count + 32))
# lastid1: the last_id value for sequence $seq as reported by ost1 now;
# the last_id lines are split on ':' and the second field is taken.
1245 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1246 local seq=$(do_facet mds1 $LCTL get_param -n \
1247 osp.${proc_path}.prealloc_last_seq)
1248 local lastid1=$(do_facet ost1 "lctl get_param -n \
1249 obdfilter.${ost1_svc}.last_id" | grep $seq |
1250 awk -F: '{ print $2 }')
1252 umount_client $MOUNT
1253 stop ost1 || error "(1) Fail to stop ost1"
# Set fail_loc 0x215 on ost1 before starting it again.
1255 #define OBD_FAIL_OST_ENOSPC 0x215
1256 do_facet ost1 $LCTL set_param fail_loc=0x215
1258 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1259 error "(2) Fail to start ost1"
# Poll up to 60 times until ost1 reports a last_id for $seq again.
# NOTE(review): lastid2 is not declared local, and $lastid2 is unquoted in
# the -z test below; quoting it would be more robust.
1261 for ((i = 0; i < 60; i++)); do
1262 lastid2=$(do_facet ost1 "lctl get_param -n \
1263 obdfilter.${ost1_svc}.last_id" | grep $seq |
1264 awk -F: '{ print $2 }')
1265 [ ! -z $lastid2 ] && break;
1269 echo "the on-disk LAST_ID should be smaller than the expected one"
1270 [ $lastid1 -gt $lastid2 ] ||
1271 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1273 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1274 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1276 wait_update_facet ost1 "$LCTL get_param -n \
1277 obdfilter.${OST_DEV}.lfsck_layout |
1278 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1280 error "(6) unexpected status"
# Restart ost1 before checking the value again.
1283 stop ost1 || error "(7) Fail to stop ost1"
1285 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1286 error "(8) Fail to start ost1"
# After the restart, last_id for $seq must equal lastid1 again; on failure
# the current last_id values are printed before the error is raised.
1288 echo "the on-disk LAST_ID should have been rebuilt"
1289 wait_update_facet ost1 "$LCTL get_param -n \
1290 obdfilter.${ost1_svc}.last_id | grep $seq |
1291 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1292 do_facet ost1 $LCTL get_param -n \
1293 obdfilter.${ost1_svc}.last_id
1294 error "(9) expect lastid1 $seq:$lastid1"
# Clear the fail_loc and stop all servers.
1297 do_facet ost1 $LCTL set_param fail_loc=0
1298 stopall || error "(10) Fail to stopall"
1300 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12: a single lctl command issued on mds1 with -A must start and stop
# LFSCK on all targets, first for the namespace component and then for the
# layout component.
1303 [ $MDSCOUNT -lt 2 ] &&
1304 skip "We need at least 2 MDSes for test_12" && return
# Create one directory per MDT (index k - 1) holding 100 files.
1306 check_mount_and_prep
1307 for k in $(seq $MDSCOUNT); do
1308 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1309 createmany -o $DIR/$tdir/${k}/f 100 ||
1310 error "(0) Fail to create 100 files."
# Namespace LFSCK: start with speed limit 1, then check every MDT.
1313 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1314 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1315 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1317 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1318 for k in $(seq $MDSCOUNT); do
1319 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1320 mdd.$(facet_svc mds${k}).lfsck_namespace |
1321 awk '/^status/ { print $2 }')
1322 [ "$STATUS" == "scanning-phase1" ] ||
1323 error "(3) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1326 echo "Stop namespace LFSCK on all targets by single lctl command."
1327 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1328 error "(4) Fail to stop LFSCK on all devices!"
1330 echo "All the LFSCK targets should be in 'stopped' status."
1331 for k in $(seq $MDSCOUNT); do
1332 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1333 mdd.$(facet_svc mds${k}).lfsck_namespace |
1334 awk '/^status/ { print $2 }')
1335 [ "$STATUS" == "stopped" ] ||
1336 error "(5) MDS${k} Expect 'stopped', but got '$STATUS'"
# Restart with speed limit 0 and wait for every MDT to complete.
1339 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1340 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1341 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1343 echo "All the LFSCK targets should be in 'completed' status."
1344 for k in $(seq $MDSCOUNT); do
1345 wait_update_facet mds${k} "$LCTL get_param -n \
1346 mdd.$(facet_svc mds${k}).lfsck_namespace |
1347 awk '/^status/ { print \\\$2 }'" "completed" 8 ||
1348 error "(7) MDS${k} is not the expected 'completed'"
# Layout LFSCK: the same start / stop / restart sequence, bracketed by
# start_full_debug_logging and stop_full_debug_logging.
1351 start_full_debug_logging
1353 echo "Start layout LFSCK on all targets by single command (-s 1)."
1354 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1355 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1357 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1358 for k in $(seq $MDSCOUNT); do
1359 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1360 mdd.$(facet_svc mds${k}).lfsck_layout |
1361 awk '/^status/ { print $2 }')
1362 [ "$STATUS" == "scanning-phase1" ] ||
1363 error "(9) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1366 echo "Stop layout LFSCK on all targets by single lctl command."
1367 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1368 error "(10) Fail to stop LFSCK on all devices!"
1370 echo "All the LFSCK targets should be in 'stopped' status."
1371 for k in $(seq $MDSCOUNT); do
1372 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1373 mdd.$(facet_svc mds${k}).lfsck_layout |
1374 awk '/^status/ { print $2 }')
1375 [ "$STATUS" == "stopped" ] ||
1376 error "(11) MDS${k} Expect 'stopped', but got '$STATUS'"
# For the layout component the OSTs are checked as well as the MDTs.
1379 for k in $(seq $OSTCOUNT); do
1380 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1381 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1382 awk '/^status/ { print $2 }')
1383 [ "$STATUS" == "stopped" ] ||
1384 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1387 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1388 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1389 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1391 echo "All the LFSCK targets should be in 'completed' status."
1392 for k in $(seq $MDSCOUNT); do
1393 # The LFSCK status query interval is 30 seconds. For the case
1394 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1395 # time to guarantee the status sync up.
1396 wait_update_facet mds${k} "$LCTL get_param -n \
1397 mdd.$(facet_svc mds${k}).lfsck_layout |
1398 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1399 error "(14) MDS${k} is not the expected 'completed'"
1402 stop_full_debug_logging
1404 run_test 12 "single command to trigger LFSCK on all devices"
# Test 13: files created while fail_loc 0x160f is set on the MDS must be
# repaired by a layout LFSCK, which counts them under repaired_others.
1408 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1409 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1410 echo "MDT-object FID."
1413 check_mount_and_prep
# Create 32 files with the fail_loc set, then clear it.
1415 echo "Inject failure stub to simulate bad lmm_oi"
1416 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1417 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1418 createmany -o $DIR/$tdir/f 32
1419 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run the layout LFSCK from scratch and wait for "completed".
1421 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1422 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1424 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1425 mdd.${MDT_DEV}.lfsck_layout |
1426 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1428 error "(2) unexpected status"
# Exactly one repair per created file is expected.
1431 local repaired=$($SHOW_LAYOUT |
1432 awk '/^repaired_others/ { print $2 }')
1433 [ $repaired -eq 32 ] ||
1434 error "(3) Fail to repair crashed lmm_oi: $repaired"
1436 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14: dangling references. A first layout LFSCK run (without -c)
# counts them but leaves the guard file unusable; a second run with -c
# makes stat on the guard file succeed.
1440 echo "The OST-object referenced by the MDT-object should be there;"
1441 echo "otherwise, the LFSCK should re-create the missing OST-object."
1444 check_mount_and_prep
1445 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# With fail_loc 0x1610 set on ost1, create (precreated count + 31) files
# and then the guard file.
1447 echo "Inject failure stub to simulate dangling referenced MDT-object"
1448 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1449 do_facet ost1 $LCTL set_param fail_loc=0x1610
1450 local count=$(precreated_ost_obj_count 0 0)
1452 createmany -o $DIR/$tdir/f $((count + 31))
1453 touch $DIR/$tdir/guard
1454 do_facet ost1 $LCTL set_param fail_loc=0
1456 start_full_debug_logging
1458 # exhaust other pre-created dangling cases
1459 count=$(precreated_ost_obj_count 0 0)
1460 createmany -o $DIR/$tdir/a $count ||
1461 error "(0) Fail to create $count files."
# Listing the directory is expected to fail at this point.
1463 echo "'ls' should fail because of dangling referenced MDT-object"
1464 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First run: no -c option.
1466 echo "Trigger layout LFSCK to find out dangling reference"
1467 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1469 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1470 mdd.${MDT_DEV}.lfsck_layout |
1471 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1473 error "(3) unexpected status"
# At least 32 dangling references must be counted.
1476 local repaired=$($SHOW_LAYOUT |
1477 awk '/^repaired_dangling/ { print $2 }')
1478 [ $repaired -ge 32 ] ||
1479 error "(4) Fail to repair dangling reference: $repaired"
1481 echo "'stat' should fail because of not repair dangling by default"
1482 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second run: same command with -c added.
1484 echo "Trigger layout LFSCK to repair dangling reference"
1485 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1487 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1488 mdd.${MDT_DEV}.lfsck_layout |
1489 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1491 error "(7) unexpected status"
1494 # There may be some async LFSCK updates still in flight; wait for
1495 # a while until the target repair has been done. LU-4970.
# Poll stat on the client until the Size field of the guard file reads 0;
# on failure, print the stat output before raising the error.
1497 echo "'stat' should success after layout LFSCK repairing"
1498 wait_update_facet client "stat $DIR/$tdir/guard |
1499 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1500 stat $DIR/$tdir/guard
1502 error "(8) unexpected size"
1505 repaired=$($SHOW_LAYOUT |
1506 awk '/^repaired_dangling/ { print $2 }')
1507 [ $repaired -ge 32 ] ||
1508 error "(9) Fail to repair dangling reference: $repaired"
1510 stop_full_debug_logging
1512 run_test 14 "LFSCK can repair MDT-object with dangling reference"
# Test 15a: one file is written while fail_loc 0x1611 is set on ost1; a
# layout LFSCK must then report exactly one repaired_unmatched_pair.
1516 echo "If the OST-object referenced by the MDT-object back points"
1517 echo "to some non-exist MDT-object, then the LFSCK should repair"
1518 echo "the OST-object to back point to the right MDT-object."
1521 check_mount_and_prep
1522 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1524 echo "Inject failure stub to make the OST-object to back point to"
1525 echo "non-exist MDT-object."
1526 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
# Write 1 MiB to f0 with the fail_loc set, drop the client's osc locks,
# then clear the fail_loc.
1528 do_facet ost1 $LCTL set_param fail_loc=0x1611
1529 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1530 cancel_lru_locks osc
1531 do_facet ost1 $LCTL set_param fail_loc=0
1533 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1534 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1536 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1537 mdd.${MDT_DEV}.lfsck_layout |
1538 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1540 error "(2) unexpected status"
# Only f0 was written under the fail_loc, so one repair is expected.
1543 local repaired=$($SHOW_LAYOUT |
1544 awk '/^repaired_unmatched_pair/ { print $2 }')
1545 [ $repaired -eq 1 ] ||
1546 error "(3) Fail to repair unmatched pair: $repaired"
1548 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b: like 15a, but a guard file is written first without any
# fail_loc and f0 is written with fail_loc 0x1612 set on ost1; the layout
# LFSCK must report exactly one repaired_unmatched_pair.
1552 echo "If the OST-object referenced by the MDT-object back points"
1553 echo "to other MDT-object that doesn't recognize the OST-object,"
1554 echo "then the LFSCK should repair it to back point to the right"
1555 echo "MDT-object (the first one)."
1558 check_mount_and_prep
1559 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# The guard file is written before the fail_loc is set.
1560 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1561 cancel_lru_locks osc
1563 echo "Inject failure stub to make the OST-object to back point to"
1564 echo "other MDT-object"
1566 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1567 do_facet ost1 $LCTL set_param fail_loc=0x1612
1568 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1569 cancel_lru_locks osc
1570 do_facet ost1 $LCTL set_param fail_loc=0
1572 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1573 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1575 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1576 mdd.${MDT_DEV}.lfsck_layout |
1577 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1579 error "(2) unexpected status"
# Only f0 was written under the fail_loc, so one repair is expected.
1582 local repaired=$($SHOW_LAYOUT |
1583 awk '/^repaired_unmatched_pair/ { print $2 }')
1584 [ $repaired -eq 1 ] ||
1585 error "(3) Fail to repair unmatched pair: $repaired"
1587 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c: run a layout LFSCK on all targets while a delayed directory
# migration is in progress; it must report one repaired_unmatched_pair and
# zero repaired_multiple_referenced.
# NOTE(review): LTIME is read below but is not assigned in the lines shown
# here - confirm where it is set.
1590 [ $MDSCOUNT -lt 2 ] &&
1591 skip "We need at least 2 MDSes for this test" && return
1594 echo "According to current metadata migration implementation,"
1595 echo "before the old MDT-object is removed, both the new MDT-object"
1596 echo "and old MDT-object will reference the same LOV layout. Then if"
1597 echo "the layout LFSCK finds the new MDT-object by race, it will"
1598 echo "regard related OST-object(s) as multiple referenced case, and"
1599 echo "will try to create new OST-object(s) for the new MDT-object."
1600 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1601 echo "MDT-object before confirm the multiple referenced case."
# Create directory a1 on MDT index 1 with one 1 MiB single-stripe file.
1604 check_mount_and_prep
1605 $LFS mkdir -i 1 $DIR/$tdir/a1
1606 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1607 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1608 cancel_lru_locks osc
1610 echo "Inject failure stub on MDT1 to delay the migration"
1612 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1613 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# The migration runs in the background so the LFSCK can start meanwhile.
1614 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1615 $LFS migrate -m 0 $DIR/$tdir/a1 &
1618 echo "Trigger layout LFSCK to race with the migration"
1619 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1621 for k in $(seq $MDSCOUNT); do
1622 # The LFSCK status query interval is 30 seconds. For the case
1623 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1624 # time to guarantee the status sync up.
1625 wait_update_facet mds${k} "$LCTL get_param -n \
1626 mdd.$(facet_svc mds${k}).lfsck_layout |
1627 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1628 error "(2) MDS${k} is not the expected 'completed'"
# Clear the fail_loc, then check both repair counters.
1631 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1632 local repaired=$($SHOW_LAYOUT |
1633 awk '/^repaired_unmatched_pair/ { print $2 }')
1634 [ $repaired -eq 1 ] ||
1635 error "(3) Fail to repair unmatched pair: $repaired"
1637 repaired=$($SHOW_LAYOUT |
1638 awk '/^repaired_multiple_referenced/ { print $2 }')
1639 [ $repaired -eq 0 ] ||
1640 error "(4) Unexpectedly repaird multiple references: $repaired"
1642 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16: a chown performed while fail_loc 0x1613 is set on the MDS must
# lead the layout LFSCK to report exactly one repaired_inconsistent_owner.
1646 echo "If the OST-object's owner information does not match the owner"
1647 echo "information stored in the MDT-object, then the LFSCK trust the"
1648 echo "MDT-object and update the OST-object's owner information."
# Create a 1 MiB single-stripe file and drop the client's osc locks.
1651 check_mount_and_prep
1652 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1653 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1654 cancel_lru_locks osc
# Change the owner to 1.1 with the fail_loc set, then clear it.
1656 echo "Inject failure stub to skip OST-object owner changing"
1657 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1658 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1659 chown 1.1 $DIR/$tdir/f0
1660 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1662 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1665 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1667 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1668 mdd.${MDT_DEV}.lfsck_layout |
1669 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1671 error "(2) unexpected status"
# Only f0 was chowned under the fail_loc, so one repair is expected.
1674 local repaired=$($SHOW_LAYOUT |
1675 awk '/^repaired_inconsistent_owner/ { print $2 }')
1676 [ $repaired -eq 1 ] ||
1677 error "(3) Fail to repair inconsistent owner: $repaired"
1679 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17: with fail_loc 0x1614 set on the MDS, f0 ends up reporting the
# size of the guard file. The layout LFSCK must report exactly one
# repaired_multiple_referenced, after which writing f0 must leave the size
# of guard unchanged.
1683 echo "If more than one MDT-objects reference the same OST-object,"
1684 echo "and the OST-object only recognizes one MDT-object, then the"
1685 echo "LFSCK should create new OST-objects for such non-recognized"
1689 check_mount_and_prep
1690 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1692 echo "Inject failure stub to make two MDT-objects to refernce"
1693 echo "the OST-object"
1695 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1696 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# Write 1 MiB to guard, then create f0, both with the fail_loc set.
1698 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1699 cancel_lru_locks osc
1701 createmany -o $DIR/$tdir/f 1
1703 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1705 cancel_lru_locks mdc
1706 cancel_lru_locks osc
# Field 5 of "ls -l" is the size; f0 is expected to show 1048576 even
# though nothing was written to it.
1708 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1709 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1710 [ $size -eq 1048576 ] ||
1711 error "(1) f0 (wrong) size should be 1048576, but got $size"
1713 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1716 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1718 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1719 mdd.${MDT_DEV}.lfsck_layout |
1720 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1722 error "(3) unexpected status"
1725 local repaired=$($SHOW_LAYOUT |
1726 awk '/^repaired_multiple_referenced/ { print $2 }')
1727 [ $repaired -eq 1 ] ||
1728 error "(4) Fail to repair multiple references: $repaired"
# Write 2 MiB to f0; guard must still report its original 1 MiB size.
1730 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1731 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1732 error "(5) Fail to write f0."
1733 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1734 [ $size -eq 1048576 ] ||
1735 error "(6) guard size should be 1048576, but got $size"
1737 run_test 17 "LFSCK can repair multiple references"
# Add the "cache" flag to the local debug mask; the command's output is
# discarded.
1739 $LCTL set_param debug=+cache > /dev/null
# Test 18a: a chown performed while fail_loc 0x1615 is set leaves a file
# reporting the wrong size. A layout LFSCK started with -o must count the
# expected repaired_orphan entries and the original size must be reported
# again afterwards. The a2/f2 part only runs with two or more MDSes.
# NOTE(review): LTIME is read below but is not assigned in the lines shown
# here - confirm where it is set.
1743 echo "The target MDT-object is there, but related stripe information"
1744 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1745 echo "layout EA entries."
# a1 on MDT index 0: one 2 MiB file with a single stripe.
1748 check_mount_and_prep
1749 $LFS mkdir -i 0 $DIR/$tdir/a1
1750 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1751 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With "ls -il" the inode number is the first column, so the size is
# field 6.
1753 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1755 $LFS path2fid $DIR/$tdir/a1/f1
1756 $LFS getstripe $DIR/$tdir/a1/f1
# a2 on MDT index 1: one 2 MiB file with two stripes.
1758 if [ $MDSCOUNT -ge 2 ]; then
1759 $LFS mkdir -i 1 $DIR/$tdir/a2
1760 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1761 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1762 $LFS path2fid $DIR/$tdir/a2/f2
1763 $LFS getstripe $DIR/$tdir/a2/f2
1766 cancel_lru_locks osc
# The chown is issued while the fail_loc is set on the owning MDS.
1768 echo "Inject failure, to make the MDT-object lost its layout EA"
1769 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1770 do_facet mds1 $LCTL set_param fail_loc=0x1615
1771 chown 1.1 $DIR/$tdir/a1/f1
1773 if [ $MDSCOUNT -ge 2 ]; then
1774 do_facet mds2 $LCTL set_param fail_loc=0x1615
1775 chown 1.1 $DIR/$tdir/a2/f2
1781 do_facet mds1 $LCTL set_param fail_loc=0
1782 if [ $MDSCOUNT -ge 2 ]; then
1783 do_facet mds2 $LCTL set_param fail_loc=0
1786 cancel_lru_locks mdc
1787 cancel_lru_locks osc
# Both files were written with the same dd arguments, so f2 is compared
# against the size saved for f1.
1789 echo "The file size should be incorrect since layout EA is lost"
1790 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1791 [ "$cur_size" != "$saved_size" ] ||
1792 error "(1) Expect incorrect file1 size"
1794 if [ $MDSCOUNT -ge 2 ]; then
1795 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1796 [ "$cur_size" != "$saved_size" ] ||
1797 error "(2) Expect incorrect file2 size"
1800 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1801 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
1803 for k in $(seq $MDSCOUNT); do
1804 # The LFSCK status query interval is 30 seconds. For the case
1805 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1806 # time to guarantee the status sync up.
1807 wait_update_facet mds${k} "$LCTL get_param -n \
1808 mdd.$(facet_svc mds${k}).lfsck_layout |
1809 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1810 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without waiting.
1813 for k in $(seq $OSTCOUNT); do
1814 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1815 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1816 awk '/^status/ { print $2 }')
1817 [ "$cur_status" == "completed" ] ||
1818 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counts: 1 on mds1 and 2 on mds2 (presumably one
# per stripe of the affected file - confirm).
1821 local repaired=$(do_facet mds1 $LCTL get_param -n \
1822 mdd.$(facet_svc mds1).lfsck_layout |
1823 awk '/^repaired_orphan/ { print $2 }')
1824 [ $repaired -eq 1 ] ||
1825 error "(6.1) Expect 1 fixed on mds1, but got: $repaired"
1827 if [ $MDSCOUNT -ge 2 ]; then
1828 repaired=$(do_facet mds2 $LCTL get_param -n \
1829 mdd.$(facet_svc mds2).lfsck_layout |
1830 awk '/^repaired_orphan/ { print $2 }')
1831 [ $repaired -eq 2 ] ||
1832 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print the FID and layout of each file again after the scan.
1835 $LFS path2fid $DIR/$tdir/a1/f1
1836 $LFS getstripe $DIR/$tdir/a1/f1
1838 if [ $MDSCOUNT -ge 2 ]; then
1839 $LFS path2fid $DIR/$tdir/a2/f2
1840 $LFS getstripe $DIR/$tdir/a2/f2
1843 echo "The file size should be correct after layout LFSCK scanning"
1844 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1845 [ "$cur_size" == "$saved_size" ] ||
1846 error "(7) Expect file1 size $saved_size, but got $cur_size"
1848 if [ $MDSCOUNT -ge 2 ]; then
1849 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1850 [ "$cur_size" == "$saved_size" ] ||
1851 error "(8) Expect file2 size $saved_size, but got $cur_size"
1854 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test 18b: files are removed while fail_loc 0x1616 is set. A layout LFSCK
# started with -o must count the expected repaired_orphan entries, and the
# entries named <fid>-R-0 under .lustre/lost+found/MDTxxxx must be movable
# back to their original paths with their original size. The a2/f2 part
# only runs with two or more MDSes.
# NOTE(review): LTIME is read below but is not assigned in the lines shown
# here - confirm where it is set.
1858 echo "The target MDT-object is lost. The LFSCK should re-create the"
1859 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
1860 echo "can move it back to normal namespace manually."
# a1 on MDT index 0: one 2 MiB single-stripe file; its size (field 6 of
# "ls -il") and FID are saved for later.
1863 check_mount_and_prep
1864 $LFS mkdir -i 0 $DIR/$tdir/a1
1865 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1866 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1867 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1868 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
1870 $LFS getstripe $DIR/$tdir/a1/f1
# a2 on MDT index 1: one 2 MiB file with two stripes.
# NOTE(review): fid2 is assigned without "local" in the lines shown here.
1872 if [ $MDSCOUNT -ge 2 ]; then
1873 $LFS mkdir -i 1 $DIR/$tdir/a2
1874 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1875 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1876 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
1878 $LFS getstripe $DIR/$tdir/a2/f2
1881 cancel_lru_locks osc
# Each file is removed while the fail_loc is set on the owning MDS.
1883 echo "Inject failure, to simulate the case of missing the MDT-object"
1884 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1885 do_facet mds1 $LCTL set_param fail_loc=0x1616
1886 rm -f $DIR/$tdir/a1/f1
1888 if [ $MDSCOUNT -ge 2 ]; then
1889 do_facet mds2 $LCTL set_param fail_loc=0x1616
1890 rm -f $DIR/$tdir/a2/f2
1896 do_facet mds1 $LCTL set_param fail_loc=0
1897 if [ $MDSCOUNT -ge 2 ]; then
1898 do_facet mds2 $LCTL set_param fail_loc=0
1901 cancel_lru_locks mdc
1902 cancel_lru_locks osc
1904 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1905 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1907 for k in $(seq $MDSCOUNT); do
1908 # The LFSCK status query interval is 30 seconds. For the case
1909 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1910 # time to guarantee the status sync up.
1911 wait_update_facet mds${k} "$LCTL get_param -n \
1912 mdd.$(facet_svc mds${k}).lfsck_layout |
1913 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1914 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without waiting.
1917 for k in $(seq $OSTCOUNT); do
1918 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1919 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1920 awk '/^status/ { print $2 }')
1921 [ "$cur_status" == "completed" ] ||
1922 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counts: 1 on mds1 and 2 on mds2 (presumably one
# per stripe of the removed file - confirm).
1925 local repaired=$(do_facet mds1 $LCTL get_param -n \
1926 mdd.$(facet_svc mds1).lfsck_layout |
1927 awk '/^repaired_orphan/ { print $2 }')
1928 [ $repaired -eq 1 ] ||
1929 error "(4.1) Expect 1 fixed on mds1, but got: $repaired"
1931 if [ $MDSCOUNT -ge 2 ]; then
1932 repaired=$(do_facet mds2 $LCTL get_param -n \
1933 mdd.$(facet_svc mds2).lfsck_layout |
1934 awk '/^repaired_orphan/ { print $2 }')
1935 [ $repaired -eq 2 ] ||
1936 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# The entries are looked up by the FIDs saved before the removal.
1939 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
1940 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
1941 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
1943 if [ $MDSCOUNT -ge 2 ]; then
1944 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
1945 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Print the FID and layout of each restored file.
1948 $LFS path2fid $DIR/$tdir/a1/f1
1949 $LFS getstripe $DIR/$tdir/a1/f1
1951 if [ $MDSCOUNT -ge 2 ]; then
1952 $LFS path2fid $DIR/$tdir/a2/f2
1953 $LFS getstripe $DIR/$tdir/a2/f2
# Both files were written with the same dd arguments, so f2 is compared
# against the size saved for f1.
1956 echo "The file size should be correct after layout LFSCK scanning"
1957 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1958 [ "$cur_size" == "$saved_size" ] ||
1959 error "(7) Expect file1 size $saved_size, but got $cur_size"
1961 if [ $MDSCOUNT -ge 2 ]; then
1962 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1963 [ "$cur_size" == "$saved_size" ] ||
1964 error "(8) Expect file2 size $saved_size, but got $cur_size"
1967 run_test 18b "Find out orphan OST-object and repair it (2)"
# test_18c body: the MDT-object is removed while the OST-object has no
# parent FID recorded. Layout LFSCK is then expected to re-create the
# MDT-object under .lustre/lost+found/MDT0000/ as a "-N-" stub, and to
# leave .lustre/lost+found/MDT0001/ without such stubs.
1971 echo "The target MDT-object is lost, and the OST-object FID is missing."
1972 echo "The LFSCK should re-create the MDT-object with new FID under the "
1973 echo "directory .lustre/lost+found/MDTxxxx."
1976 check_mount_and_prep
1977 $LFS mkdir -i 0 $DIR/$tdir/a1
1978 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1980 echo "Inject failure, to simulate the case of missing parent FID"
1981 #define OBD_FAIL_LFSCK_NOPFID 0x1617
1982 do_facet ost1 $LCTL set_param fail_loc=0x1617
1984 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1985 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDTs, create a second file whose parent lives on MDT1.
1987 if [ $MDSCOUNT -ge 2 ]; then
1988 $LFS mkdir -i 1 $DIR/$tdir/a2
1989 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a2
1990 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1991 $LFS getstripe $DIR/$tdir/a2/f2
1994 cancel_lru_locks osc
1996 echo "Inject failure, to simulate the case of missing the MDT-object"
1997 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1998 do_facet mds1 $LCTL set_param fail_loc=0x1616
1999 rm -f $DIR/$tdir/a1/f1
2001 if [ $MDSCOUNT -ge 2 ]; then
2002 do_facet mds2 $LCTL set_param fail_loc=0x1616
2003 rm -f $DIR/$tdir/a2/f2
# Clear the injected failure on every MDS before running LFSCK.
2009 do_facet mds1 $LCTL set_param fail_loc=0
2010 if [ $MDSCOUNT -ge 2 ]; then
2011 do_facet mds2 $LCTL set_param fail_loc=0
2014 cancel_lru_locks mdc
2015 cancel_lru_locks osc
2017 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2018 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2020 for k in $(seq $MDSCOUNT); do
2021 # The LFSCK status query interval is 30 seconds. For the case
2022 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2023 # time to guarantee the status sync up.
2024 wait_update_facet mds${k} "$LCTL get_param -n \
2025 mdd.$(facet_svc mds${k}).lfsck_layout |
2026 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2027 error "(2) MDS${k} is not the expected 'completed'"
2030 for k in $(seq $OSTCOUNT); do
2031 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2032 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2033 awk '/^status/ { print $2 }')
2034 [ "$cur_status" == "completed" ] ||
2035 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): $expected is not assigned in the visible lines; presumably
# it is set inside the MDSCOUNT branch below -- confirm.
2038 if [ $MDSCOUNT -ge 2 ]; then
2044 local repaired=$(do_facet mds1 $LCTL get_param -n \
2045 mdd.$(facet_svc mds1).lfsck_layout |
2046 awk '/^repaired_orphan/ { print $2 }')
2047 [ $repaired -eq $expected ] ||
2048 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2050 if [ $MDSCOUNT -ge 2 ]; then
2051 repaired=$(do_facet mds2 $LCTL get_param -n \
2052 mdd.$(facet_svc mds2).lfsck_layout |
2053 awk '/^repaired_orphan/ { print $2 }')
2054 [ $repaired -eq 0 ] ||
2055 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2058 ls -ail $MOUNT/.lustre/lost+found/
2060 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2061 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
# The -name pattern is quoted so the shell cannot expand it against the
# current directory before find sees it; cname is kept function-local.
2062 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-N-*")
2064 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2067 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2068 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2069 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2071 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-N-*")
2072 [ ! -z "$cname" ] ||
2073 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2075 run_test 18c "Find out orphan OST-object and repair it (3)"
# test_18d body: f2 is made to reference f1's OST-object, then f1 is
# removed so f2 becomes a dangling reference while f2's original
# OST-object still exists. Layout LFSCK (started with -c) is expected to
# report one repaired orphan and f2 must get its original size back.
2079 echo "The target MDT-object layout EA slot is occpuied by some new"
2080 echo "created OST-object when repair dangling reference case. Such"
2081 echo "conflict OST-object has never been modified. Then when found"
2082 echo "the orphan OST-object, LFSCK will replace it with the orphan"
2086 check_mount_and_prep
2088 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2089 echo "guard" > $DIR/$tdir/a1/f1
2090 echo "foo" > $DIR/$tdir/a1/f2
# Remember f2's size (6th column of 'ls -il') for the later comparisons.
2091 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2092 $LFS path2fid $DIR/$tdir/a1/f1
2093 $LFS getstripe $DIR/$tdir/a1/f1
2094 $LFS path2fid $DIR/$tdir/a1/f2
2095 $LFS getstripe $DIR/$tdir/a1/f2
2096 cancel_lru_locks osc
2098 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2099 echo "to reference the same OST-object (which is f1's OST-obejct)."
2100 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2101 echo "dangling reference case, but f2's old OST-object is there."
2104 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2105 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
# Use ':' as the owner/group separator; the '.' form is a deprecated
# compatibility syntax and the rest of this file already uses ':'.
2106 chown 1:1 $DIR/$tdir/a1/f2
2107 rm -f $DIR/$tdir/a1/f1
2110 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2112 echo "stopall to cleanup object cache"
2115 setupall > /dev/null
2117 echo "The file size should be incorrect since dangling referenced"
2118 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2119 [ "$cur_size" != "$saved_size" ] ||
2120 error "(1) Expect incorrect file2 size"
2122 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2123 do_facet $SINGLEMDS $LCTL set_param fail_val=5 fail_loc=0x1602
2125 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2126 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait until MDS1 reports 'scanning-phase2' before clearing the delay.
2128 wait_update_facet mds1 "$LCTL get_param -n \
2129 mdd.$(facet_svc mds1).lfsck_layout |
2130 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2131 error "(3.0) MDS1 is not the expected 'scanning-phase2'"
2133 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2135 for k in $(seq $MDSCOUNT); do
2136 # The LFSCK status query interval is 30 seconds. For the case
2137 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2138 # time to guarantee the status sync up.
2139 wait_update_facet mds${k} "$LCTL get_param -n \
2140 mdd.$(facet_svc mds${k}).lfsck_layout |
2141 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2142 error "(3) MDS${k} is not the expected 'completed'"
2145 for k in $(seq $OSTCOUNT); do
2146 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2147 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2148 awk '/^status/ { print $2 }')
2149 [ "$cur_status" == "completed" ] ||
2150 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
2153 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2154 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2155 awk '/^repaired_orphan/ { print $2 }')
2156 [ $repaired -eq 1 ] ||
2157 error "(5) Expect 1 orphan has been fixed, but got: $repaired"
2159 echo "The file size should be correct after layout LFSCK scanning"
2160 cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2161 [ "$cur_size" == "$saved_size" ] ||
2162 error "(6) Expect file2 size $saved_size, but got $cur_size"
2164 echo "The LFSCK should find back the original data."
2165 cat $DIR/$tdir/a1/f2
2166 $LFS path2fid $DIR/$tdir/a1/f2
2167 $LFS getstripe $DIR/$tdir/a1/f2
2169 run_test 18d "Find out orphan OST-object and repair it (4)"
# test_18e body: same setup as the "(4)" variant, but f2 is appended to
# while LFSCK sits in scanning-phase2, so the newly created OST-object
# has been modified. LFSCK is then expected to keep f2's new data and
# expose the old orphan OST-object as a "-C-" stub file under
# .lustre/lost+found/MDT0000/ with f2's original size.
2173 echo "The target MDT-object layout EA slot is occpuied by some new"
2174 echo "created OST-object when repair dangling reference case. Such"
2175 echo "conflict OST-object has been modified by others. To keep the"
2176 echo "new data, the LFSCK will create a new file to refernece this"
2177 echo "old orphan OST-object."
2180 check_mount_and_prep
2182 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2183 echo "guard" > $DIR/$tdir/a1/f1
2184 echo "foo" > $DIR/$tdir/a1/f2
# Remember f2's size (6th column of 'ls -il') for the later comparisons.
2185 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2186 $LFS path2fid $DIR/$tdir/a1/f1
2187 $LFS getstripe $DIR/$tdir/a1/f1
2188 $LFS path2fid $DIR/$tdir/a1/f2
2189 $LFS getstripe $DIR/$tdir/a1/f2
2190 cancel_lru_locks osc
2192 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2193 echo "to reference the same OST-object (which is f1's OST-obejct)."
2194 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2195 echo "dangling reference case, but f2's old OST-object is there."
2198 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2199 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
# Use ':' as the owner/group separator; the '.' form is a deprecated
# compatibility syntax and the rest of this file already uses ':'.
2200 chown 1:1 $DIR/$tdir/a1/f2
2201 rm -f $DIR/$tdir/a1/f1
2204 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2206 echo "stopall to cleanup object cache"
2209 setupall > /dev/null
2211 echo "The file size should be incorrect since dangling referenced"
2212 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2213 [ "$cur_size" != "$saved_size" ] ||
2214 error "(1) Expect incorrect file2 size"
2216 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2217 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2219 start_full_debug_logging
2221 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2222 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait until MDS1 reports 'scanning-phase2' before modifying f2.
2224 wait_update_facet mds1 "$LCTL get_param -n \
2225 mdd.$(facet_svc mds1).lfsck_layout |
2226 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2227 error "(3) MDS1 is not the expected 'scanning-phase2'"
2229 # to guarantee all updates are synced.
2233 echo "Write new data to f2 to modify the new created OST-object."
2234 echo "dummy" >> $DIR/$tdir/a1/f2
2236 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2238 for k in $(seq $MDSCOUNT); do
2239 # The LFSCK status query interval is 30 seconds. For the case
2240 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2241 # time to guarantee the status sync up.
2242 wait_update_facet mds${k} "$LCTL get_param -n \
2243 mdd.$(facet_svc mds${k}).lfsck_layout |
2244 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2245 error "(4) MDS${k} is not the expected 'completed'"
2248 for k in $(seq $OSTCOUNT); do
2249 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2250 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2251 awk '/^status/ { print $2 }')
2252 [ "$cur_status" == "completed" ] ||
2253 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2256 stop_full_debug_logging
2258 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2259 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2260 awk '/^repaired_orphan/ { print $2 }')
2261 [ $repaired -eq 1 ] ||
2262 error "(6) Expect 1 orphan has been fixed, but got: $repaired"
2264 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2265 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2266 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The -name pattern is quoted so the shell cannot expand it against the
# current directory before find sees it; cname is kept function-local.
2268 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*")
2269 [ ! -z "$cname" ] ||
2270 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2272 echo "The stub file should keep the original f2 data"
2273 cur_size=$(ls -il $cname | awk '{ print $6 }')
2274 [ "$cur_size" == "$saved_size" ] ||
2275 error "(9) Expect file2 size $saved_size, but got $cur_size"
2278 $LFS path2fid $cname
2279 $LFS getstripe $cname
2281 echo "The f2 should contains new data."
2282 cat $DIR/$tdir/a1/f2
2283 $LFS path2fid $DIR/$tdir/a1/f2
2284 $LFS getstripe $DIR/$tdir/a1/f2
2286 run_test 18e "Find out orphan OST-object and repair it (5)"
# test_18f body: needs at least 2 OSTs. MDT-objects are removed with a
# failure injected, then a bad-network failure is injected on mds1 so
# the first layout LFSCK run ends 'partial' on the MDSes and on ost1
# while ost2 completes. A second run without the failure must end
# 'completed' everywhere; repaired_orphan is checked after each run.
2289 [ $OSTCOUNT -lt 2 ] &&
2290 skip "The test needs at least 2 OSTs" && return
2293 echo "The target MDT-object is lost. The LFSCK should re-create the"
2294 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2295 echo "to verify some OST-object(s) during the first stage-scanning,"
2296 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2297 echo "should not be affected."
2300 check_mount_and_prep
2301 $LFS mkdir -i 0 $DIR/$tdir/a1
2302 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2303 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2304 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2305 $LFS mkdir -i 0 $DIR/$tdir/a2
2306 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2307 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2308 $LFS getstripe $DIR/$tdir/a1/f1
2309 $LFS getstripe $DIR/$tdir/a2/f2
# With two or more MDTs, repeat the same layout under directories on MDT1.
2311 if [ $MDSCOUNT -ge 2 ]; then
2312 $LFS mkdir -i 1 $DIR/$tdir/a3
2313 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a3
2314 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2315 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2316 $LFS mkdir -i 1 $DIR/$tdir/a4
2317 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2318 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2319 $LFS getstripe $DIR/$tdir/a3/f3
2320 $LFS getstripe $DIR/$tdir/a4/f4
2323 cancel_lru_locks osc
2325 echo "Inject failure, to simulate the case of missing the MDT-object"
2326 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2327 do_facet mds1 $LCTL set_param fail_loc=0x1616
2328 rm -f $DIR/$tdir/a1/f1
2329 rm -f $DIR/$tdir/a2/f2
2331 if [ $MDSCOUNT -ge 2 ]; then
2332 do_facet mds2 $LCTL set_param fail_loc=0x1616
2333 rm -f $DIR/$tdir/a3/f3
2334 rm -f $DIR/$tdir/a4/f4
# Clear the injected failure on every MDS before running LFSCK.
2340 do_facet mds1 $LCTL set_param fail_loc=0
2341 if [ $MDSCOUNT -ge 2 ]; then
2342 do_facet mds2 $LCTL set_param fail_loc=0
2345 cancel_lru_locks mdc
2346 cancel_lru_locks osc
2348 echo "Inject failure, to simulate the OST0 fail to handle"
2349 echo "MDT0 LFSCK request during the first-stage scanning."
2350 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2351 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2353 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2354 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2356 for k in $(seq $MDSCOUNT); do
2357 # The LFSCK status query interval is 30 seconds. For the case
2358 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2359 # time to guarantee the status sync up.
2360 wait_update_facet mds${k} "$LCTL get_param -n \
2361 mdd.$(facet_svc mds${k}).lfsck_layout |
2362 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2363 error "(2) MDS${k} is not the expected 'partial'"
2366 wait_update_facet ost1 "$LCTL get_param -n \
2367 obdfilter.$(facet_svc ost1).lfsck_layout |
2368 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2369 error "(3) OST1 is not the expected 'partial'"
2372 wait_update_facet ost2 "$LCTL get_param -n \
2373 obdfilter.$(facet_svc ost2).lfsck_layout |
2374 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2375 error "(4) OST2 is not the expected 'completed'"
2378 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# First run: one orphan is expected to be repaired per MDS.
2380 local repaired=$(do_facet mds1 $LCTL get_param -n \
2381 mdd.$(facet_svc mds1).lfsck_layout |
2382 awk '/^repaired_orphan/ { print $2 }')
2383 [ $repaired -eq 1 ] ||
2384 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2386 if [ $MDSCOUNT -ge 2 ]; then
2387 repaired=$(do_facet mds2 $LCTL get_param -n \
2388 mdd.$(facet_svc mds2).lfsck_layout |
2389 awk '/^repaired_orphan/ { print $2 }')
2390 [ $repaired -eq 1 ] ||
2391 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
2394 echo "Trigger layout LFSCK on all devices again to cleanup"
2395 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2397 for k in $(seq $MDSCOUNT); do
2398 # The LFSCK status query interval is 30 seconds. For the case
2399 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2400 # time to guarantee the status sync up.
2401 wait_update_facet mds${k} "$LCTL get_param -n \
2402 mdd.$(facet_svc mds${k}).lfsck_layout |
2403 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2404 error "(8) MDS${k} is not the expected 'completed'"
2407 for k in $(seq $OSTCOUNT); do
# cur_status is declared local here so it does not leak into the
# script's global scope.
2408 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2409 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2410 awk '/^status/ { print $2 }')
2411 [ "$cur_status" == "completed" ] ||
2412 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# Second run: two orphans are expected to be repaired per MDS.
2416 local repaired=$(do_facet mds1 $LCTL get_param -n \
2417 mdd.$(facet_svc mds1).lfsck_layout |
2418 awk '/^repaired_orphan/ { print $2 }')
2419 [ $repaired -eq 2 ] ||
2420 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2422 if [ $MDSCOUNT -ge 2 ]; then
2423 repaired=$(do_facet mds2 $LCTL get_param -n \
2424 mdd.$(facet_svc mds2).lfsck_layout |
2425 awk '/^repaired_orphan/ { print $2 }')
2426 [ $repaired -eq 2 ] ||
2427 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2430 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Adjust the local debug setting before the following tests run.
# NOTE(review): presumably "-cache" removes the cache flag from the
# debug mask -- confirm against the lctl documentation.
2432 $LCTL set_param debug=-cache > /dev/null
# test_19a body: enable lfsck_verify_pfid on ost1, set fail_loc 0x1619
# on the local node, and check that reading a0 then fails.
2435 check_mount_and_prep
2436 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2438 echo "foo" > $DIR/$tdir/a0
2439 echo "guard" > $DIR/$tdir/a1
2440 cancel_lru_locks osc
2442 echo "Inject failure, then client will offer wrong parent FID when read"
2443 do_facet ost1 $LCTL set_param -n \
2444 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2445 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2446 $LCTL set_param fail_loc=0x1619
2448 echo "Read RPC with wrong parent FID should be denied"
# A successful read is the failure case here.
2449 cat $DIR/$tdir/a0 && error "(3) Read should be denied!"
2450 $LCTL set_param fail_loc=0
2452 run_test 19a "OST-object inconsistency self detect"
# test_19b body: write f0 while fail_loc 0x1611 is set on ost1, check
# that the 'repaired' counter of lfsck_verify_pfid is 0, then enable
# lfsck_verify_pfid, read f0, and expect the counter to become 1.
2455 check_mount_and_prep
2456 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2458 echo "Inject failure stub to make the OST-object to back point to"
2459 echo "non-exist MDT-object"
2461 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2462 do_facet ost1 $LCTL set_param fail_loc=0x1611
2463 echo "foo" > $DIR/$tdir/f0
2464 cancel_lru_locks osc
2465 do_facet ost1 $LCTL set_param fail_loc=0
2467 echo "Nothing should be fixed since self detect and repair is disabled"
# Read the 'repaired' field from the lfsck_verify_pfid parameter on ost1.
2468 local repaired=$(do_facet ost1 $LCTL get_param -n \
2469 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2470 awk '/^repaired/ { print $2 }')
2471 [ $repaired -eq 0 ] ||
2472 error "(1) Expected 0 repaired, but got $repaired"
2474 echo "Read RPC with right parent FID should be accepted,"
2475 echo "and cause parent FID on OST to be fixed"
2477 do_facet ost1 $LCTL set_param -n \
2478 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2479 cat $DIR/$tdir/f0 || error "(2) Read should not be denied!"
# The read above is expected to have triggered exactly one repair.
2481 repaired=$(do_facet ost1 $LCTL get_param -n \
2482 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2483 awk '/^repaired/ { print $2 }')
2484 [ $repaired -eq 1 ] ||
2485 error "(3) Expected 1 repaired, but got $repaired"
2487 run_test 19b "OST-object inconsistency self repair"
# test_20 body: needs at least 2 OSTs. Files f0..f2 (and f3 with more
# than 2 OSTs) are removed under injected failures so that the
# MDT-object and, for f1..f3, one of the OST-objects are lost. After
# layout LFSCK, the re-created ${fidN}-R-0 files under
# .lustre/lost+found/MDT0000/ are checked for LOV pattern hole flag,
# stripe count, size, and read/write/chown/touch/unlink behaviour.
# NOTE(review): $bcount is not assigned in the visible lines; the
# stripe comments below suggest it covers 256 blocks plus one -- confirm.
2490 [ $OSTCOUNT -lt 2 ] &&
2491 skip "The test needs at least 2 OSTs" && return
2494 echo "The target MDT-object and some of its OST-object are lost."
2495 echo "The LFSCK should find out the left OST-objects and re-create"
2496 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2497 echo "with the partial OST-objects (LOV EA hole)."
2499 echo "New client can access the file with LOV EA hole via normal"
2500 echo "system tools or commands without crash the system."
2502 echo "For old client, even though it cannot access the file with"
2503 echo "LOV EA hole, it should not cause the system crash."
2506 check_mount_and_prep
2507 $LFS mkdir -i 0 $DIR/$tdir/a1
2508 if [ $OSTCOUNT -gt 2 ]; then
2509 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
2512 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
2516 # 256 blocks on the stripe0.
2517 # 1 block on the stripe1 for 2 OSTs case.
2518 # 256 blocks on the stripe1 for other cases.
2519 # 1 block on the stripe2 if OSTs > 2
2520 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2521 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2522 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# Record the FIDs now; they name the lost+found entries checked later.
2524 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2525 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2526 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2529 $LFS getstripe $DIR/$tdir/a1/f0
2531 $LFS getstripe $DIR/$tdir/a1/f1
2533 $LFS getstripe $DIR/$tdir/a1/f2
2535 if [ $OSTCOUNT -gt 2 ]; then
2536 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
# fid3 is declared local like fid0..fid2 so it does not leak globally.
2537 local fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2539 $LFS getstripe $DIR/$tdir/a1/f3
2542 cancel_lru_locks osc
2544 echo "Inject failure..."
2545 echo "To simulate f0 lost MDT-object"
2546 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2547 do_facet mds1 $LCTL set_param fail_loc=0x1616
2548 rm -f $DIR/$tdir/a1/f0
2550 echo "To simulate f1 lost MDT-object and OST-object0"
2551 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
2552 do_facet mds1 $LCTL set_param fail_loc=0x161a
2553 rm -f $DIR/$tdir/a1/f1
2555 echo "To simulate f2 lost MDT-object and OST-object1"
2556 do_facet mds1 $LCTL set_param fail_val=1
2557 rm -f $DIR/$tdir/a1/f2
2559 if [ $OSTCOUNT -gt 2 ]; then
2560 echo "To simulate f3 lost MDT-object and OST-object2"
2561 do_facet mds1 $LCTL set_param fail_val=2
2562 rm -f $DIR/$tdir/a1/f3
2565 umount_client $MOUNT
2568 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2570 echo "Inject failure to slow down the LFSCK on OST0"
2571 #define OBD_FAIL_LFSCK_DELAY5 0x161b
2572 do_facet ost1 $LCTL set_param fail_loc=0x161b
2574 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2575 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2578 do_facet ost1 $LCTL set_param fail_loc=0
2580 for k in $(seq $MDSCOUNT); do
2581 # The LFSCK status query interval is 30 seconds. For the case
2582 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2583 # time to guarantee the status sync up.
2584 wait_update_facet mds${k} "$LCTL get_param -n \
2585 mdd.$(facet_svc mds${k}).lfsck_layout |
2586 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2587 error "(2) MDS${k} is not the expected 'completed'"
2590 for k in $(seq $OSTCOUNT); do
2591 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2592 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2593 awk '/^status/ { print $2 }')
2594 [ "$cur_status" == "completed" ] ||
2595 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan count is 9 with more than 2 OSTs, else 4.
2598 local repaired=$(do_facet mds1 $LCTL get_param -n \
2599 mdd.$(facet_svc mds1).lfsck_layout |
2600 awk '/^repaired_orphan/ { print $2 }')
2601 if [ $OSTCOUNT -gt 2 ]; then
2602 [ $repaired -eq 9 ] ||
2603 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
2605 [ $repaired -eq 4 ] ||
2606 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
2609 mount_client $MOUNT || error "(5.0) Fail to start client!"
# Bit mask tested against the hex pattern reported by 'lfs getstripe -L'.
2611 LOV_PATTERN_F_HOLE=0x40000000
2614 # ${fid0}-R-0 is the old f0
2616 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
2617 echo "Check $name, which is the old f0"
2619 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
2621 local pattern=0x$($LFS getstripe -L $name)
2622 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2623 error "(5.2) NOT expect pattern flag hole, but got $pattern"
2625 local stripes=$($LFS getstripe -c $name)
2626 if [ $OSTCOUNT -gt 2 ]; then
2627 [ $stripes -eq 3 ] ||
2628 error "(5.3.1) expect the stripe count is 3, but got $stripes"
2630 [ $stripes -eq 2 ] ||
2631 error "(5.3.2) expect the stripe count is 2, but got $stripes"
2634 local size=$(stat $name | awk '/Size:/ { print $2 }')
2635 [ $size -eq $((4096 * $bcount)) ] ||
2636 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
2638 cat $name > /dev/null || error "(5.5) cannot read $name"
2640 echo "dummy" >> $name || error "(5.6) cannot write $name"
2642 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
2644 touch $name || error "(5.8) cannot touch $name"
2646 rm -f $name || error "(5.9) cannot unlink $name"
2649 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
2651 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2652 if [ $OSTCOUNT -gt 2 ]; then
2653 echo "Check $name, it contains the old f1's stripe1 and stripe2"
2655 echo "Check $name, it contains the old f1's stripe1"
2658 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
2660 pattern=0x$($LFS getstripe -L $name)
2661 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2662 error "(6.2) expect pattern flag hole, but got $pattern"
2664 stripes=$($LFS getstripe -c $name)
2665 if [ $OSTCOUNT -gt 2 ]; then
2666 [ $stripes -eq 3 ] ||
2667 error "(6.3.1) expect the stripe count is 3, but got $stripes"
2669 [ $stripes -eq 2 ] ||
2670 error "(6.3.2) expect the stripe count is 2, but got $stripes"
2673 size=$(stat $name | awk '/Size:/ { print $2 }')
2674 [ $size -eq $((4096 * $bcount)) ] ||
2675 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
2677 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# Copy with conv=sync,noerror and count the I/O errors dd reports.
2679 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2680 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2683 [ $failures -eq 256 ] ||
2684 error "(6.6) expect 256 IO failures, but get $failures"
2686 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2687 [ $size -eq $((4096 * $bcount)) ] ||
2688 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
2690 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
2691 error "(6.8) write to the LOV EA hole should fail"
2693 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
2694 error "(6.9) write to normal stripe should NOT fail"
2696 echo "foo" >> $name && error "(6.10) append write $name should fail"
2698 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
2700 touch $name || error "(6.12) cannot touch $name"
2702 rm -f $name || error "(6.13) cannot unlink $name"
2705 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
2707 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
2708 if [ $OSTCOUNT -gt 2 ]; then
2709 echo "Check $name, it contains the old f2's stripe0 and stripe2"
2711 echo "Check $name, it contains the old f2's stripe0"
2714 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
2716 pattern=0x$($LFS getstripe -L $name)
2717 stripes=$($LFS getstripe -c $name)
2718 size=$(stat $name | awk '/Size:/ { print $2 }')
2719 if [ $OSTCOUNT -gt 2 ]; then
2720 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2721 error "(7.2.1) expect pattern flag hole, but got $pattern"
2723 [ $stripes -eq 3 ] ||
2724 error "(7.3.1) expect the stripe count is 3, but got $stripes"
2726 [ $size -eq $((4096 * $bcount)) ] ||
2727 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
2729 cat $name > /dev/null &&
2730 error "(7.5.1) normal read $name should fail"
2732 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2733 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2735 [ $failures -eq 256 ] ||
2736 error "(7.6) expect 256 IO failures, but get $failures"
2738 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2739 [ $size -eq $((4096 * $bcount)) ] ||
2740 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
2742 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
2743 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
2745 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
2746 error "(7.8.1) write to normal stripe should NOT fail"
2748 echo "foo" >> $name &&
2749 error "(7.8.3) append write $name should fail"
2751 chown $RUNAS_ID:$RUNAS_GID $name ||
2752 error "(7.9.1) cannot chown on $name"
2754 touch $name || error "(7.10.1) cannot touch $name"
2756 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2757 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
2759 [ $stripes -eq 1 ] ||
2760 error "(7.3.2) expect the stripe count is 1, but got $stripes"
2763 [ $size -eq $((4096 * (256 + 0))) ] ||
2764 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
2766 cat $name > /dev/null || error "(7.5.2) cannot read $name"
2768 echo "dummy" >> $name || error "(7.8.2) cannot write $name"
2770 chown $RUNAS_ID:$RUNAS_GID $name ||
2771 error "(7.9.2) cannot chown on $name"
2773 touch $name || error "(7.10.2) cannot touch $name"
2776 rm -f $name || error "(7.11) cannot unlink $name"
# The f3 checks below only apply when there are more than 2 OSTs.
2778 [ $OSTCOUNT -le 2 ] && return
2781 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
2783 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2784 echo "Check $name, which contains the old f3's stripe0 and stripe1"
2786 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
2788 pattern=0x$($LFS getstripe -L $name)
2789 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2790 error "(8.2) NOT expect pattern flag hole, but got $pattern"
2792 stripes=$($LFS getstripe -c $name)
2793 # LFSCK does not know the old f3 had 3 stripes.
2794 # It only tries to find as much as possible.
2795 # The stripe count depends on the last stripe's offset.
2796 [ $stripes -eq 2 ] ||
2797 error "(8.3) expect the stripe count is 2, but got $stripes"
2799 size=$(stat $name | awk '/Size:/ { print $2 }')
2801 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
2802 error "(8.4) expect the size $((4096 * 512)), but got $size"
2804 cat $name > /dev/null || error "(8.5) cannot read $name"
2806 echo "dummy" >> $name || error "(8.6) cannot write $name"
2808 chown $RUNAS_ID:$RUNAS_GID $name ||
2809 error "(8.7) cannot chown on $name"
2811 touch $name || error "(8.8) cannot touch $name"
2813 rm -f $name || error "(8.9) cannot unlink $name"
2815 run_test 20 "Handle the orphan with dummy LOV EA slot properly"
# test_21 body: skipped when the MDS is older than 2.5.59. Starts LFSCK
# on MDT0000 without a -t type argument and checks that both the
# namespace and the layout component report 'scanning-phase1', then
# stops LFSCK.
2818 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
2819 skip "ignore the test if MDS is older than 2.5.59" && return
2821 check_mount_and_prep
2822 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
2824 echo "Start all LFSCK components by default (-s 1)"
# NOTE(review): "-s 1" presumably throttles the scan so it is still in
# phase 1 when the status is read below -- confirm.
2825 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
2826 error "Fail to start LFSCK"
2828 echo "namespace LFSCK should be in 'scanning-phase1' status"
2829 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
2830 [ "$STATUS" == "scanning-phase1" ] ||
2831 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
2833 echo "layout LFSCK should be in 'scanning-phase1' status"
2834 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
2835 [ "$STATUS" == "scanning-phase1" ] ||
2836 error "Expect layout 'scanning-phase1', but got '$STATUS'"
2838 echo "Stop all LFSCK components by default"
2839 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
2840 error "Fail to stop LFSCK"
2842 run_test 21 "run all LFSCK components by default"
# test_22a body: needs at least 2 MDSes. Creates foo/dummy on MDT0
# while fail_loc 0x161e is set, removes the guard directory, then runs
# namespace LFSCK and expects exactly one unmatched pair repaired and
# 'ls' on foo/dummy to succeed.
2845 [ $MDSCOUNT -lt 2 ] &&
2846 skip "We need at least 2 MDSes for this test" && return
2849 echo "The parent_A references the child directory via some name entry,"
2850 echo "but the child directory back references another parent_B via its"
# The inner quotes are escaped so the message prints ".." with its
# quotes instead of closing the string early.
2851 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
2852 echo "LFSCK will repair the child directory's \"..\" name entry."
2855 check_mount_and_prep
2857 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2858 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2860 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2861 echo "The dummy's dotdot name entry references the guard."
2862 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2863 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2864 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2865 error "(3) Fail to mkdir on MDT0"
2866 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2868 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
2870 echo "Trigger namespace LFSCK to repair unmatched pairs"
2871 $START_NAMESPACE -A -r ||
2872 error "(5) Fail to start LFSCK for namespace"
# Poll the namespace LFSCK status on the MDS until it reads 'completed'.
2874 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2875 mdd.${MDT_DEV}.lfsck_namespace |
2876 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2878 error "(6) unexpected status"
2881 local repaired=$($SHOW_NAMESPACE |
2882 awk '/^unmatched_pairs_repaired/ { print $2 }')
2883 [ $repaired -eq 1 ] ||
2884 error "(7) Fail to repair unmatched pairs: $repaired"
2886 echo "'ls' should success after namespace LFSCK repairing"
2887 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
2888 error "(8) ls should success."
2890 run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b body: needs at least 2 MDSes. Creates foo/dummy on MDT0
# while fail_loc 0x161e is set, checks that fid2path does not resolve
# dummy's FID to its path, then runs namespace LFSCK and expects one
# unmatched pair repaired and fid2path to return the path.
2893 [ $MDSCOUNT -lt 2 ] &&
2894 skip "We need at least 2 MDSes for this test" && return
2897 echo "The parent_A references the child directory via the name entry_B,"
2898 echo "but the child directory back references another parent_C via its"
# The inner quotes are escaped so the message prints ".." with its
# quotes instead of closing the string early.
2899 echo "\"..\" name entry. The parent_C exists, but there is no the name"
2900 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
2901 echo "the child directory's \"..\" name entry and its linkEA."
2904 check_mount_and_prep
2906 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2907 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2909 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2910 echo "and bad linkEA. The dummy's dotdot name entry references the"
2911 echo "guard. The dummy's linkEA references n non-exist name entry."
2912 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2913 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2914 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2915 error "(3) Fail to mkdir on MDT0"
2916 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Before the repair, fid2path must not resolve to the real path.
2918 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
2919 echo "fid2path should NOT work on the dummy's FID $dummyfid"
2920 local dummyname=$($LFS fid2path $DIR $dummyfid)
2921 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
2922 error "(4) fid2path works unexpectedly."
2924 echo "Trigger namespace LFSCK to repair unmatched pairs"
2925 $START_NAMESPACE -A -r ||
2926 error "(5) Fail to start LFSCK for namespace"
# Poll the namespace LFSCK status on the MDS until it reads 'completed'.
2928 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2929 mdd.${MDT_DEV}.lfsck_namespace |
2930 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2932 error "(6) unexpected status"
2935 local repaired=$($SHOW_NAMESPACE |
2936 awk '/^unmatched_pairs_repaired/ { print $2 }')
2937 [ $repaired -eq 1 ] ||
2938 error "(7) Fail to repair unmatched pairs: $repaired"
2940 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
2941 local dummyname=$($LFS fid2path $DIR $dummyfid)
2942 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
2943 error "(8) fid2path does not work"
2945 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a body (registered via run_test at the end of this block).
# d0/d1 is removed with rmdir while fail_loc 0x1620 is set on mds2, which the
# messages below describe as simulating a dangling name entry. A first
# namespace LFSCK run (-A -r) must count one dangling_repaired while ls on the
# entry still fails; a second run with -C added must make ls succeed.
# Skipped when fewer than 2 MDSes are configured.
2948 [ $MDSCOUNT -lt 2 ] &&
2949 skip "We need at least 2 MDSes for this test" && return
2952 echo "The name entry is there, but the MDT-object for such name "
2953 echo "entry does not exist. The namespace LFSCK should find out "
2954 echo "and repair the inconsistency as required."
2957 check_mount_and_prep
# Parent is created with MDT index 0, the child with MDT index 1.
2959 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
2960 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
2962 echo "Inject failure stub on MDT1 to simulate dangling name entry"
2963 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
# The fail_loc is armed on mds2 only for the duration of the rmdir.
2964 do_facet mds2 $LCTL set_param fail_loc=0x1620
2965 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
2966 do_facet mds2 $LCTL set_param fail_loc=0
2968 echo "'ls' should fail because of dangling name entry"
2969 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
2971 echo "Trigger namespace LFSCK to find out dangling name entry"
2972 $START_NAMESPACE -A -r ||
2973 error "(5) Fail to start LFSCK for namespace"
# Wait for the status field of lfsck_namespace on $SINGLEMDS to read completed.
2975 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2976 mdd.${MDT_DEV}.lfsck_namespace |
2977 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2979 error "(6) unexpected status"
# First pass: the counter must be 1, yet the entry must still not be listable.
2982 local repaired=$($SHOW_NAMESPACE |
2983 awk '/^dangling_repaired/ { print $2 }')
2984 [ $repaired -eq 1 ] ||
2985 error "(7) Fail to repair dangling name entry: $repaired"
2987 echo "'ls' should fail because not re-create MDT-object by default"
2988 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
2990 echo "Trigger namespace LFSCK again to repair dangling name entry"
# Second pass: same start command with the additional -C option.
2991 $START_NAMESPACE -A -r -C ||
2992 error "(9) Fail to start LFSCK for namespace"
2994 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2995 mdd.${MDT_DEV}.lfsck_namespace |
2996 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2998 error "(10) unexpected status"
# The counter is re-read after the second run and must again be 1.
3001 repaired=$($SHOW_NAMESPACE |
3002 awk '/^dangling_repaired/ { print $2 }')
3003 [ $repaired -eq 1 ] ||
3004 error "(11) Fail to repair dangling name entry: $repaired"
3006 echo "'ls' should success after namespace LFSCK repairing"
3007 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3009 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b body (registered via run_test at the end of this block).
# A hard link d0/foo to d0/f0 is created while fail_loc 0x1621 is set on
# $SINGLEMDS. After a namespace LFSCK run with -r -C the test expects
# dangling_repaired and multiple_linked_repaired to both be 1 and d0/foo to
# contain the text written to f0.
3013 echo "The objectA has multiple hard links, one of them corresponding"
3014 echo "to the name entry_B. But there is something wrong for the name"
3015 echo "entry_B and cause entry_B to references non-exist object_C."
3016 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3017 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3018 echo "comes to the second-stage scanning, it will find that the"
3019 echo "former re-creating object_C is not proper, and will try to"
3020 echo "replace the object_C with the real object_A."
3023 check_mount_and_prep
# f0 holds the text dummy, f1 holds the text dead.
3025 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3026 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3027 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3029 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3030 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
# The fail_loc is armed only around the hard-link creation.
3031 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
3032 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3033 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3035 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3037 echo "'ls' should fail because of dangling name entry"
3038 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3039 error "(6) ls should fail."
3041 echo "Trigger namespace LFSCK to find out dangling name entry"
# Started with -r -C and without -A, unlike test 23a.
3042 $START_NAMESPACE -r -C ||
3043 error "(7) Fail to start LFSCK for namespace"
3045 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3046 mdd.${MDT_DEV}.lfsck_namespace |
3047 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3049 error "(8) unexpected status"
# Two counters are checked in turn, reusing the same variable.
3052 local repaired=$($SHOW_NAMESPACE |
3053 awk '/^dangling_repaired/ { print $2 }')
3054 [ $repaired -eq 1 ] ||
3055 error "(9) Fail to repair dangling name entry: $repaired"
3057 repaired=$($SHOW_NAMESPACE |
3058 awk '/^multiple_linked_repaired/ { print $2 }')
3059 [ $repaired -eq 1 ] ||
3060 error "(10) Fail to drop the former created object: $repaired"
# The link must read back the content that was written to f0.
3062 local data=$(cat $DIR/$tdir/d0/foo)
3063 [ "$data" == "dummy" ] ||
3064 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3066 run_test 23b "LFSCK can repair dangling name entry (2)"
# Test 23c body (registered via run_test at the end of this block).
# Same setup as test 23b, but fail_val=10 fail_loc=0x1602 is set before LFSCK
# starts. Once stat on the client reports Size 0 for d0/foo, the test appends
# to that file, clears the fail_loc and waits for completion. It then expects
# dangling_repaired to be 1 and the content of d0/foo to differ from dummy.
3070 echo "The objectA has multiple hard links, one of them corresponding"
3071 echo "to the name entry_B. But there is something wrong for the name"
3072 echo "entry_B and cause entry_B to references non-exist object_C."
3073 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3074 echo "as dangling, and re-create the lost object_C. And then others"
3075 echo "modified the re-created object_C. When the LFSCK comes to the"
3076 echo "second-stage scanning, it will find that the former re-creating"
3077 echo "object_C maybe wrong and try to replace the object_C with the"
3078 echo "real object_A. But because object_C has been modified, so the"
3079 echo "LFSCK cannot replace it."
3082 check_mount_and_prep
3084 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3085 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3086 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3088 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3089 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
# The fail_loc is armed only around the hard-link creation.
3090 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
3091 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3092 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3094 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3096 echo "'ls' should fail because of dangling name entry"
3097 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3098 error "(6) ls should fail."
3100 #define OBD_FAIL_LFSCK_DELAY3 0x1602
# This fail_loc stays set while LFSCK runs and is cleared further below, after
# the file has been modified.
3101 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3103 echo "Trigger namespace LFSCK to find out dangling name entry"
3104 $START_NAMESPACE -r -C ||
3105 error "(7) Fail to start LFSCK for namespace"
# Wait on the client until stat prints Size 0 for d0/foo.
# NOTE(review): the failure path stats $DIR/$tdir/guard, a path this block
# never creates - looks like a stale diagnostic; confirm the intended path.
3107 wait_update_facet client "stat $DIR/$tdir/d0/foo |
3108 awk '/Size/ { print \\\$2 }'" "0" 32 || {
3109 stat $DIR/$tdir/guard
3111 error "(8) unexpected size"
# Modify the file before the fail_loc is cleared.
3114 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
3115 cancel_lru_locks osc
3117 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3118 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3119 mdd.${MDT_DEV}.lfsck_namespace |
3120 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3122 error "(10) unexpected status"
3125 local repaired=$($SHOW_NAMESPACE |
3126 awk '/^dangling_repaired/ { print $2 }')
3127 [ $repaired -eq 1 ] ||
3128 error "(11) Fail to repair dangling name entry: $repaired"
# Unlike test 23b, the content must NOT equal what was written to f0.
3130 local data=$(cat $DIR/$tdir/d0/foo)
3131 [ "$data" != "dummy" ] ||
3132 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
3134 run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24 body (registered via run_test at the end of this block).
# d0/dummy/foo is created and removed while fail_loc 0x1622 is set on
# $SINGLEMDS. After a namespace LFSCK run (-A -r) the test expects
# multiple_referenced_repaired to be 1 and an entry named
# <cfid>-<pfid>-D-0 to exist under .lustre/lost+found/MDT0000.
# Skipped when fewer than 2 MDSes are configured.
3137 [ $MDSCOUNT -lt 2 ] &&
3138 skip "We need at least 2 MDSes for this test" && return
3141 echo "Two MDT-objects back reference the same name entry via their"
3142 echo "each own linkEA entry, but the name entry only references one"
3143 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
3144 echo "for the MDT-object that is not recognized. If such MDT-object"
3145 echo "has no other linkEA entry after the removing, then the LFSCK"
3146 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
3149 check_mount_and_prep
# NOTE(review): the next two failure messages both carry the label (1).
3151 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3153 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
# The bare path2fid calls only print the FIDs into the test output.
3154 $LFS path2fid $DIR/$tdir/d0/guard
3156 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
3157 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid later forms the middle part of the expected orphan name.
# NOTE(review): as visible here there is no else/fi between the two pfid
# assignments; confirm the intended branch structure (guard when the backend
# is not ldiskfs, dummy otherwise?).
3160 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
3161 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
3163 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
3166 touch $DIR/$tdir/d0/guard/foo ||
3167 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
3169 echo "Inject failure stub on MDT0 to simulate the case that"
3170 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
3171 echo "that references $DIR/$tdir/d0/guard/foo."
3172 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
3173 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
3174 echo "there with the same linkEA entry as another MDT-object"
3175 echo "$DIR/$tdir/d0/guard/foo has"
3177 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
# The fail_loc covers both the mkdir and the rmdir of dummy/foo; the FID is
# captured in between so it can be used for the orphan name check.
3178 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
3179 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
3180 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
3181 $LFS path2fid $DIR/$tdir/d0/dummy/foo
3182 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
3183 rmdir $DIR/$tdir/d0/dummy/foo ||
3184 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
3185 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3187 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
3188 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
3189 error "(6) stat successfully unexpectedly"
3191 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
3192 $START_NAMESPACE -A -r ||
3193 error "(7) Fail to start LFSCK for namespace"
3195 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3196 mdd.${MDT_DEV}.lfsck_namespace |
3197 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3199 error "(8) unexpected status"
3202 local repaired=$($SHOW_NAMESPACE |
3203 awk '/^multiple_referenced_repaired/ { print $2 }')
3204 [ $repaired -eq 1 ] ||
3205 error "(9) Fail to repair multiple referenced name entry: $repaired"
3207 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3208 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3209 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Expected orphan name is built from the child FID and the parent FID.
3211 local cname="$cfid-$pfid-D-0"
3212 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
3213 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
3215 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25 body (registered via run_test at the end of this block).
# d0/foo is created while fail_loc 0x1623 is set on $SINGLEMDS. After a
# namespace LFSCK run (-r) the test expects bad_file_type_repaired to be 1
# and ls on d0 to succeed.
# Skipped unless the $SINGLEMDS backend type is ldiskfs.
3218 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3219 skip "Only support to inject failure on ldiskfs" && return
3222 echo "The file type in the name entry does not match the file type"
3223 echo "claimed by the referenced object. Then the LFSCK will update"
3224 echo "the file type in the name entry."
3227 check_mount_and_prep
3229 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3231 echo "Inject failure stub on MDT0 to simulate the case that"
3232 echo "the file type stored in the name entry is wrong."
3234 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
# The fail_loc is armed only around the creation of foo.
3235 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
3236 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
3237 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3239 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
3240 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Wait for the status field of lfsck_namespace on $SINGLEMDS to read completed.
3242 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3243 mdd.${MDT_DEV}.lfsck_namespace |
3244 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3246 error "(4) unexpected status"
# Exactly one bad file type must have been repaired.
3249 local repaired=$($SHOW_NAMESPACE |
3250 awk '/^bad_file_type_repaired/ { print $2 }')
3251 [ $repaired -eq 1 ] ||
3252 error "(5) Fail to repair bad file type in name entry: $repaired"
3254 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
3256 run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a body (registered via run_test at the end of this block).
# d0/foo (which also has the hard link d0/dummy) is unlinked while fail_loc
# 0x1624 is set on $SINGLEMDS. After a namespace LFSCK run (-r -A) the test
# expects lost_dirent_repaired to be 1, d0/foo to be listable again and its
# FID to be unchanged.
3260 echo "The local name entry back referenced by the MDT-object is lost."
3261 echo "The namespace LFSCK will add the missing local name entry back"
3262 echo "to the normal namespace."
3265 check_mount_and_prep
3267 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3268 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# Remember the FID so it can be compared after the repair.
3269 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3271 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3272 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3274 echo "Inject failure stub on MDT0 to simulate the case that"
3275 echo "foo's name entry will be removed, but the foo's object"
3276 echo "and its linkEA are kept in the system."
3278 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is armed only around the unlink of foo.
3279 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3280 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3281 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The message is passed to error; without it the shell would try to execute
# the message text as a command and the test would carry on regardless.
3283 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 && error "(5) 'ls' should fail"
# This test covers the local name entry, so the message says local.
3285 echo "Trigger namespace LFSCK to repair the missing local name entry"
3286 $START_NAMESPACE -r -A ||
3287 error "(6) Fail to start LFSCK for namespace"
# Wait for the status field of lfsck_namespace on $SINGLEMDS to read completed.
3289 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3290 mdd.${MDT_DEV}.lfsck_namespace |
3291 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3293 error "(7) unexpected status"
# Exactly one lost directory entry must have been repaired.
3296 local repaired=$($SHOW_NAMESPACE |
3297 awk '/^lost_dirent_repaired/ { print $2 }')
3298 [ $repaired -eq 1 ] ||
3299 error "(8) Fail to repair lost dirent: $repaired"
3301 ls -ail $DIR/$tdir/d0/foo ||
3302 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored entry must point at the same object as before.
3304 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3305 [ "$foofid" == "$foofid2" ] ||
3306 error "(10) foo's FID changed: $foofid, $foofid2"
3308 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b body (registered via run_test at the end of this block).
# d0 is created with MDT index 1 and d0/foo with MDT index 0; foo is removed
# with rmdir while fail_loc 0x1624 is set on $SINGLEMDS. After a namespace
# LFSCK run (-r -A) the test expects lost_dirent_repaired to be 1, d0/foo to
# be listable again and its FID to be unchanged.
# Skipped when fewer than 2 MDSes are configured.
3311 [ $MDSCOUNT -lt 2 ] &&
3312 skip "We need at least 2 MDSes for this test" && return
3315 echo "The remote name entry back referenced by the MDT-object is lost."
3316 echo "The namespace LFSCK will add the missing remote name entry back"
3317 echo "to the normal namespace."
3320 check_mount_and_prep
3322 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3323 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Remember the FID so it can be compared after the repair.
3324 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3326 echo "Inject failure stub on MDT0 to simulate the case that"
3327 echo "foo's name entry will be removed, but the foo's object"
3328 echo "and its linkEA are kept in the system."
3330 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is armed only around the rmdir of foo.
3331 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3332 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3333 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The message is passed to error; without it the shell would try to execute
# the message text as a command and the test would carry on regardless.
3335 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 && error "(4) 'ls' should fail"
3337 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3338 $START_NAMESPACE -r -A ||
3339 error "(5) Fail to start LFSCK for namespace"
# Wait for the status field of lfsck_namespace on $SINGLEMDS to read completed.
3341 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3342 mdd.${MDT_DEV}.lfsck_namespace |
3343 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3345 error "(6) unexpected status"
# Exactly one lost directory entry must have been repaired.
3348 local repaired=$($SHOW_NAMESPACE |
3349 awk '/^lost_dirent_repaired/ { print $2 }')
3350 [ $repaired -eq 1 ] ||
3351 error "(7) Fail to repair lost dirent: $repaired"
3353 ls -ail $DIR/$tdir/d0/foo ||
3354 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored entry must point at the same object as before.
3356 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3357 [ "$foofid" == "$foofid2" ] ||
3358 error "(9) foo's FID changed: $foofid, $foofid2"
3360 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a body (registered via run_test at the end of this block).
# d0/foo and its hard link d0/dummy are unlinked while fail_loc 0x1624 is set
# on $SINGLEMDS, then d0 itself is removed. After a namespace LFSCK run
# (-r -A) the test expects lost_dirent_repaired to be 1 and an entry matching
# *-P-* to exist under .lustre/lost+found/MDT0000.
# NOTE(review): the labels (5) and (6) are each used by two different checks.
3364 echo "The local parent referenced by the MDT-object linkEA is lost."
3365 echo "The namespace LFSCK will re-create the lost parent as orphan."
3368 check_mount_and_prep
3370 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3371 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3372 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3373 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3375 echo "Inject failure stub on MDT0 to simulate the case that"
3376 echo "foo's name entry will be removed, but the foo's object"
3377 echo "and its linkEA are kept in the system. And then remove"
3378 echo "another hard link and the parent directory."
3380 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc covers both unlink operations and is cleared before d0 is
# removed.
3381 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3382 rm -f $DIR/$tdir/d0/foo ||
3383 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3384 rm -f $DIR/$tdir/d0/dummy ||
3385 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
3386 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3388 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
# The message is passed to error; without it the shell would try to execute
# the message text as a command and the test would carry on regardless.
3389 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
3391 echo "Trigger namespace LFSCK to repair the lost parent"
3392 $START_NAMESPACE -r -A ||
3393 error "(6) Fail to start LFSCK for namespace"
# Wait for the status field of lfsck_namespace on $SINGLEMDS to read completed.
3395 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3396 mdd.${MDT_DEV}.lfsck_namespace |
3397 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3399 error "(7) unexpected status"
# Exactly one lost directory entry must have been repaired.
3402 local repaired=$($SHOW_NAMESPACE |
3403 awk '/^lost_dirent_repaired/ { print $2 }')
3404 [ $repaired -eq 1 ] ||
3405 error "(8) Fail to repair lost dirent: $repaired"
3407 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3408 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3409 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3411 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find receives it verbatim; unquoted, the shell
# would expand it first if a matching name existed in the current directory.
3413 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-P-*")
3414 [ ! -z "$cname" ] ||
3415 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
3417 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b body (registered via run_test at the end of this block).
# d0 is created with MDT index 1 and d0/foo with MDT index 0; foo is removed
# with rmdir while fail_loc 0x1624 is set on $SINGLEMDS, then d0 is removed.
# After a namespace LFSCK run (-r -A) the test expects lost_dirent_repaired
# to be 1 and an entry matching *-P-* under .lustre/lost+found/MDT0001.
# Skipped when fewer than 2 MDSes are configured.
3420 [ $MDSCOUNT -lt 2 ] &&
3421 skip "We need at least 2 MDSes for this test" && return
3424 echo "The remote parent referenced by the MDT-object linkEA is lost."
3425 echo "The namespace LFSCK will re-create the lost parent as orphan."
3428 check_mount_and_prep
3430 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3431 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Prints the FID of d0 into the test output; the value is not stored.
3433 $LFS path2fid $DIR/$tdir/d0
3435 echo "Inject failure stub on MDT0 to simulate the case that"
3436 echo "foo's name entry will be removed, but the foo's object"
3437 echo "and its linkEA are kept in the system. And then remove"
3438 echo "the parent directory."
3440 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is armed only around the rmdir of foo.
3441 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3442 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3443 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3445 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
# The message is passed to error; without it the shell would try to execute
# the message text as a command and the test would carry on regardless.
3446 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
3448 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3449 $START_NAMESPACE -r -A ||
3450 error "(6) Fail to start LFSCK for namespace"
# Wait for the status field of lfsck_namespace on $SINGLEMDS to read completed.
3452 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3453 mdd.${MDT_DEV}.lfsck_namespace |
3454 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3456 error "(7) unexpected status"
# Exactly one lost directory entry must have been repaired.
3459 local repaired=$($SHOW_NAMESPACE |
3460 awk '/^lost_dirent_repaired/ { print $2 }')
3461 [ $repaired -eq 1 ] ||
3462 error "(8) Fail to repair lost dirent: $repaired"
3464 ls -ail $MOUNT/.lustre/lost+found/
3466 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
3467 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
3468 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
3470 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The pattern is quoted so that find receives it verbatim; unquoted, the shell
# would expand it first if a matching name existed in the current directory.
3472 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-P-*")
3473 [ ! -z "$cname" ] ||
3474 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
3476 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28 body (registered via run_test at the end of this block).
# d1/a1 and d2/a2 are removed with rmdir while fail_loc 0x1624 is set on both
# mds1 and mds2. With fail_loc 0x161c set on mds2, a first namespace LFSCK run
# (-r -A) is expected to end as partial on mds1 and completed on mds2, with
# lost_dirent_repaired 0 on mds1 and 1 on mds2. After clearing the fail_loc a
# second run must complete on every MDS, with the counters 1 on mds1 and 0 on
# mds2.
# Skipped when fewer than 2 MDSes are configured.
3479 [ $MDSCOUNT -lt 2 ] &&
3480 skip "The test needs at least 2 MDTs" && return
3483 echo "The target name entry is lost. The LFSCK should insert the"
3484 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
3485 echo "the MDT (on which the orphan MDT-object resides) has ever"
3486 echo "failed to respond some name entry verification during the"
3487 echo "first stage-scanning, then the LFSCK should skip to handle"
3488 echo "orphan MDT-object on this MDT. But other MDTs should not"
3492 check_mount_and_prep
# d1 is created with MDT index 0 and its children with index 1; d2 is the
# mirror image. None of these mkdir calls is checked for failure.
3493 $LFS mkdir -i 0 $DIR/$tdir/d1
3494 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
3495 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
3497 $LFS mkdir -i 1 $DIR/$tdir/d2
3498 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
3499 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
3501 echo "Inject failure stub on MDT0 to simulate the case that"
3502 echo "d1/a1's name entry will be removed, but the d1/a1's object"
3503 echo "and its linkEA are kept in the system. And the case that"
3504 echo "d2/a2's name entry will be removed, but the d2/a2's object"
3505 echo "and its linkEA are kept in the system."
3507 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is armed on both servers for the two rmdir calls, then cleared.
3508 do_facet mds1 $LCTL set_param fail_loc=0x1624
3509 do_facet mds2 $LCTL set_param fail_loc=0x1624
3510 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
3511 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
3512 do_facet mds1 $LCTL set_param fail_loc=0
3513 do_facet mds2 $LCTL set_param fail_loc=0
3515 cancel_lru_locks mdc
3516 cancel_lru_locks osc
3518 echo "Inject failure, to simulate the MDT0 fail to handle"
3519 echo "MDT1 LFSCK request during the first-stage scanning."
3520 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
# This fail_loc is set on mds2 and stays set for the whole first LFSCK run.
3521 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
3523 echo "Trigger namespace LFSCK on all devices to find out orphan object"
3524 $START_NAMESPACE -r -A ||
3525 error "(3) Fail to start LFSCK for namespace"
# First run: mds1 is expected to report partial, mds2 completed.
3527 wait_update_facet mds1 "$LCTL get_param -n \
3528 mdd.$(facet_svc mds1).lfsck_namespace |
3529 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
3530 error "(4) mds1 is not the expected 'partial'"
3533 wait_update_facet mds2 "$LCTL get_param -n \
3534 mdd.$(facet_svc mds2).lfsck_namespace |
3535 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3536 error "(5) mds2 is not the expected 'completed'"
3539 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# After the first run the counter must be 0 on mds1 and 1 on mds2.
3541 local repaired=$(do_facet mds1 $LCTL get_param -n \
3542 mdd.$(facet_svc mds1).lfsck_namespace |
3543 awk '/^lost_dirent_repaired/ { print $2 }')
3544 [ $repaired -eq 0 ] ||
3545 error "(6) Expect 0 fixed on mds1, but got: $repaired"
3547 repaired=$(do_facet mds2 $LCTL get_param -n \
3548 mdd.$(facet_svc mds2).lfsck_namespace |
3549 awk '/^lost_dirent_repaired/ { print $2 }')
3550 [ $repaired -eq 1 ] ||
3551 error "(7) Expect 1 fixed on mds2, but got: $repaired"
3553 echo "Trigger namespace LFSCK on all devices again to cleanup"
3554 $START_NAMESPACE -r -A ||
3555 error "(8) Fail to start LFSCK for namespace"
# Second run: every MDS from 1 to $MDSCOUNT must report completed.
3557 for k in $(seq $MDSCOUNT); do
3558 # The LFSCK status query internal is 30 seconds. For the case
3559 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3560 # time to guarantee the status sync up.
3561 wait_update_facet mds${k} "$LCTL get_param -n \
3562 mdd.$(facet_svc mds${k}).lfsck_namespace |
3563 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3564 error "(9) MDS${k} is not the expected 'completed'"
# After the second run the counters are reversed: 1 on mds1, 0 on mds2.
3567 local repaired=$(do_facet mds1 $LCTL get_param -n \
3568 mdd.$(facet_svc mds1).lfsck_namespace |
3569 awk '/^lost_dirent_repaired/ { print $2 }')
3570 [ $repaired -eq 1 ] ||
3571 error "(10) Expect 1 fixed on mds1, but got: $repaired"
3573 repaired=$(do_facet mds2 $LCTL get_param -n \
3574 mdd.$(facet_svc mds2).lfsck_namespace |
3575 awk '/^lost_dirent_repaired/ { print $2 }')
3576 [ $repaired -eq 0 ] ||
3577 error "(11) Expect 0 fixed on mds2, but got: $repaired"
3579 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a body (registered via run_test at the end of this block).
# A hard link to d0/foo is created while fail_loc 0x1625 is set on
# $SINGLEMDS; the test requires stat to report a link count of 3 afterwards.
# After a namespace LFSCK run (-r -A) it expects nlinks_repaired to be 1 and
# the link count to be 2.
3583 echo "The object's nlink attribute is larger than the object's known"
3584 echo "name entries count. The LFSCK will repair the object's nlink"
3585 echo "attribute to match the known name entries count"
3588 check_mount_and_prep
3590 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3591 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3593 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3594 echo "nlink attribute is larger than its name entries count."
3596 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
# The fail_loc is armed only around the hard-link creation.
3597 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
3598 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3599 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3600 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Verify that the injection took effect: two names exist, 3 links reported.
3602 cancel_lru_locks mdc
3603 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3604 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
3606 echo "Trigger namespace LFSCK to repair the nlink count"
3607 $START_NAMESPACE -r -A ||
3608 error "(5) Fail to start LFSCK for namespace"
# Wait for the status field of lfsck_namespace on $SINGLEMDS to read completed.
3610 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3611 mdd.${MDT_DEV}.lfsck_namespace |
3612 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3614 error "(6) unexpected status"
# Exactly one nlink repair must have been counted.
3617 local repaired=$($SHOW_NAMESPACE |
3618 awk '/^nlinks_repaired/ { print $2 }')
3619 [ $repaired -eq 1 ] ||
3620 error "(7) Fail to repair nlink count: $repaired"
# Re-read the link count after dropping the mdc locks; it must now be 2.
3622 cancel_lru_locks mdc
3623 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3624 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3626 run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b body (registered via run_test at the end of this block).
# A hard link to d0/foo is created while fail_loc 0x1626 is set on
# $SINGLEMDS; the test requires stat to report a link count of 1 afterwards.
# After a namespace LFSCK run (-r -A) it expects nlinks_repaired to be 1 and
# the link count to be 2.
3630 echo "The object's nlink attribute is smaller than the object's known"
3631 echo "name entries count. The LFSCK will repair the object's nlink"
3632 echo "attribute to match the known name entries count"
3635 check_mount_and_prep
3637 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3638 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3640 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3641 echo "nlink attribute is smaller than its name entries count."
3643 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
# The fail_loc is armed only around the hard-link creation.
3644 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
3645 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3646 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3647 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Verify that the injection took effect: two names exist, 1 link reported.
3649 cancel_lru_locks mdc
3650 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3651 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
3653 echo "Trigger namespace LFSCK to repair the nlink count"
3654 $START_NAMESPACE -r -A ||
3655 error "(5) Fail to start LFSCK for namespace"
# Wait for the status field of lfsck_namespace on $SINGLEMDS to read completed.
3657 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3658 mdd.${MDT_DEV}.lfsck_namespace |
3659 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3661 error "(6) unexpected status"
# Exactly one nlink repair must have been counted.
3664 local repaired=$($SHOW_NAMESPACE |
3665 awk '/^nlinks_repaired/ { print $2 }')
3666 [ $repaired -eq 1 ] ||
3667 error "(7) Fail to repair nlink count: $repaired"
# Re-read the link count after dropping the mdc locks; it must now be 2.
3669 cancel_lru_locks mdc
3670 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3671 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3673 run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c body (registered via run_test at the end of this block).
# d0/foo gets a first hard link h1 normally and a second one h2 while
# fail_loc 0x1627 is set on $SINGLEMDS. The test requires a link count of 3
# but only 2 paths from fid2path, runs namespace LFSCK (-r -A) and expects
# nlinks_repaired to be 0 with both numbers unchanged.
3677 echo "There are too many hard links to the object, and exceeds the"
3678 echo "object's linkEA limitation, as to NOT all the known name entries"
3679 echo "will be recorded in the linkEA. Under such case, LFSCK should"
3680 echo "skip the nlink verification for this object."
3683 check_mount_and_prep
3685 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3686 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3687 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3688 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3690 echo "Inject failure stub on MDT0 to simulate the case that"
3691 echo "foo's hard links exceed the object's linkEA limitation."
3693 #define OBD_FAIL_LFSCK_LINKEA_OVERFLOW 0x1627
# This fail_loc stays set during the LFSCK run and is cleared only after the
# run has completed.
3694 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1627
3695 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h2 ||
3696 error "(4) Fail to hard link to $DIR/$tdir/d0/foo"
3698 cancel_lru_locks mdc
# count1 is the link count from stat, count2 the number of paths printed by
# fid2path for the same FID.
3700 local count1=$(stat --format=%h $DIR/$tdir/d0/foo)
3701 [ $count1 -eq 3 ] || error "(5) Stat failure: $count1"
3703 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3704 $LFS fid2path $DIR $foofid
3705 local count2=$($LFS fid2path $DIR $foofid | wc -l)
# The message is passed to error; without it the shell would try to execute
# the message text as a command and a failed injection would go unnoticed.
3706 [ $count2 -eq 2 ] || error "(6) Fail to inject error: $count2"
3708 echo "Trigger namespace LFSCK to repair the nlink count"
3709 $START_NAMESPACE -r -A ||
3710 error "(7) Fail to start LFSCK for namespace"
# Wait for the status field of lfsck_namespace on $SINGLEMDS to read completed.
3712 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3713 mdd.${MDT_DEV}.lfsck_namespace |
3714 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3716 error "(8) unexpected status"
3719 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# No nlink repair is expected for this object.
3720 local repaired=$($SHOW_NAMESPACE |
3721 awk '/^nlinks_repaired/ { print $2 }')
3722 [ $repaired -eq 0 ] ||
3723 error "(9) Repair nlink count unexpcetedly: $repaired"
3725 cancel_lru_locks mdc
# Both numbers must be the same as before the LFSCK run.
3727 count1=$(stat --format=%h $DIR/$tdir/d0/foo)
3728 [ $count1 -eq 3 ] || error "(10) Stat failure: $count1"
3730 count2=$($LFS fid2path $DIR $foofid | wc -l)
3731 [ $count2 -eq 2 ] ||
3732 error "(11) Repaired something unexpectedly: $count2"
3734 run_test 29c "Not verify nlink attr if hark links exceed linkEA limitation"
# Test 30 body (registered via run_test at the end of this block).
# foo/d0 is created while fail_loc 0x161d is set; d0/d1, d0/f1, d0 and foo/f0
# are then removed while fail_loc 0x1624 is set. The client is unmounted,
# $SINGLEMDS is stopped, run_e2fsck is run with -y on the MDT device, and the
# MDT is started again. After a namespace LFSCK run (-r -A) the test expects
# local_lost_found_moved to be at least 4, foo/f0 to be visible again, and an
# entry matching *-*-D-* under .lustre/lost+found/MDT0000 that contains d1
# and f1.
# Skipped unless the $SINGLEMDS backend type is ldiskfs.
3737 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3738 skip "Only support backend /lost+found for ldiskfs" && return
3741 echo "The namespace LFSCK will move the orphans from backend"
3742 echo "/lost+found directory to normal client visible namespace"
3743 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
3746 check_mount_and_prep
3748 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
# The failure message names f0, the file that is actually created here.
3749 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
3751 echo "Inject failure stub on MDT0 to simulate the case that"
3752 echo "directory d0 has no linkEA entry, then the LFSCK will"
3753 echo "move it into .lustre/lost+found/MDTxxxx/ later."
3755 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
# First injection: armed only around the mkdir of foo/d0.
3756 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
3757 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
3758 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3760 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
3761 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
3763 echo "Inject failure stub on MDT0 to simulate the case that the"
3764 echo "object's name entry will be removed, but not destroy the"
3765 echo "object. Then backend e2fsck will handle it as orphan and"
3766 echo "add them into the backend /lost+found directory."
3768 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# Second injection: armed around the removal of all four objects.
3769 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3770 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
3771 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
3772 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
3773 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
3774 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The client and the MDT are taken down for the e2fsck run and the MDT is
# restarted afterwards; the client is remounted only after LFSCK finished.
3776 umount_client $MOUNT || error "(10) Fail to stop client!"
3778 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
3781 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
3782 error "(12) Fail to run e2fsck"
3784 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
3785 error "(13) Fail to start MDT0"
3787 echo "Trigger namespace LFSCK to recover backend orphans"
3788 $START_NAMESPACE -r -A ||
3789 error "(14) Fail to start LFSCK for namespace"
# Wait for the status field of lfsck_namespace on $SINGLEMDS to read completed.
3791 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3792 mdd.${MDT_DEV}.lfsck_namespace |
3793 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3795 error "(15) unexpected status"
# At least the four removed objects must have been moved.
3798 local repaired=$($SHOW_NAMESPACE |
3799 awk '/^local_lost_found_moved/ { print $2 }')
3800 [ $repaired -ge 4 ] ||
3801 error "(16) Fail to recover backend orphans: $repaired"
3803 mount_client $MOUNT || error "(17) Fail to start client!"
# The message is passed to error; without it the shell would try to execute
# the message text as a command and a missing f0 would go unnoticed.
3805 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
3807 ls -ail $MOUNT/.lustre/lost+found/
3809 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
3810 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3811 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3813 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find receives it verbatim; unquoted, the shell
# would expand it first if a matching name existed in the current directory.
3815 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-*-D-*")
3816 [ ! -z "$cname" ] || error "(20) d0 is not recovered"
# The recovered directory must still contain its former children.
3818 stat ${cname}/d1 || error "(21) d0 is not recovered"
3819 stat ${cname}/f1 || error "(22) f1 is not recovered"
3821 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test_31a: a name entry that was inserted into the wrong shard of a striped
# directory (bad name hash, injected with fail_val=0 for the first shard)
# must be found and repaired by the namespace LFSCK.
#
# Skipped unless at least 2 MDTs are configured.
# Reads: MDSCOUNT, DIR, tdir, MOUNT, SINGLEMDS, MDT_DEV, LFS, LCTL,
#        START_NAMESPACE, SHOW_NAMESPACE.
# NOTE(review): the "test_31a() { ... }" wrapper is reconstructed from the
# "run_test 31a" registration and from the use of "local"/"return", which
# bash only accepts inside a function -- confirm against the upstream file.
test_31a() {
	[ "$MDSCOUNT" -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	local striped_dir="$DIR/$tdir/striped_dir"
	local repaired
	local i

	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-first"
	echo "shard, but inserted into the first shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=0
	createmany -d "$striped_dir/d" "$MDSCOUNT" ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	# START_NAMESPACE holds a whole command line; it must stay unquoted
	# so that it is split back into words.
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	# The extra escaping leaves \$2 in the string, so one further level of
	# shell expansion (presumably inside wait_update_facet) still hands
	# awk a literal $2.
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(4) unexpected status"

	repaired=$($SHOW_NAMESPACE |
		   awk '/^name_hash_repaired/ { print $2 }')
	# Quoted so that a missing counter fails the test cleanly.
	[ "$repaired" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	# Remount so the checks below go through a freshly mounted client.
	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	for ((i = 0; i < $MDSCOUNT; i++)); do
		stat "$striped_dir/d$i" ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir "$striped_dir/d$i" ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir "$striped_dir" ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test_31b: same scenario as the bad-name-hash test for the first shard, but
# the failure is injected with fail_val=1 (second shard) and the LFSCK
# status/counter are read from mds2.
#
# Skipped unless at least 2 MDTs are configured.
# Reads: MDSCOUNT, DIR, tdir, MOUNT, LFS, LCTL, START_NAMESPACE.
# NOTE(review): the "test_31b() { ... }" wrapper is reconstructed from the
# "run_test 31b" registration and from the use of "local"/"return", which
# bash only accepts inside a function -- confirm against the upstream file.
test_31b() {
	[ "$MDSCOUNT" -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	local striped_dir="$DIR/$tdir/striped_dir"
	local mds2_lfsck
	local repaired
	local i

	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-second"
	echo "shard, but inserted into the secod shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=1
	createmany -d "$striped_dir/d" "$MDSCOUNT" ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	# START_NAMESPACE holds a whole command line; it must stay unquoted
	# so that it is split back into words.
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	# Resolve the mds2 parameter name once; it is used for both the
	# status poll and the repair counter.
	mds2_lfsck="mdd.$(facet_svc mds2).lfsck_namespace"

	# The extra escaping leaves \$2 in the string, so one further level of
	# shell expansion (presumably inside wait_update_facet) still hands
	# awk a literal $2.
	wait_update_facet mds2 "$LCTL get_param -n $mds2_lfsck |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(4) unexpected status"

	repaired=$(do_facet mds2 $LCTL get_param -n "$mds2_lfsck" |
		   awk '/^name_hash_repaired/ { print $2 }')
	# Quoted so that a missing counter fails the test cleanly.
	[ "$repaired" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	# Remount so the checks below go through a freshly mounted client.
	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	for ((i = 0; i < $MDSCOUNT; i++)); do
		stat "$striped_dir/d$i" ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir "$striped_dir/d$i" ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir "$striped_dir" ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test_31c: the master MDT-object of a striped directory loses its master
# LMV EA (OBD_FAIL_LFSCK_LOST_MASTER_LMV is set on $SINGLEMDS while the
# directory is created). Nothing is created under it afterwards, so the
# namespace LFSCK is expected to re-generate the EA: striped_dirs_repaired
# must be 1 and the directory must list as empty after a remount.
#
# Skipped unless at least 2 MDTs are configured.
# Reads: MDSCOUNT, DIR, tdir, MOUNT, SINGLEMDS, MDT_DEV, LFS, LCTL,
#        START_NAMESPACE, SHOW_NAMESPACE.
# NOTE(review): the "test_31c() { ... }" wrapper is reconstructed from the
# "run_test 31c" registration and from the use of "local"/"return", which
# bash only accepts inside a function -- confirm against the upstream file.
test_31c() {
	[ "$MDSCOUNT" -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If nobody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should re-generate the master LMV EA."

	check_mount_and_prep

	local striped_dir="$DIR/$tdir/striped_dir"
	local repaired
	local empty

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to re-generate master LMV EA"
	# START_NAMESPACE holds a whole command line; it must stay unquoted
	# so that it is split back into words.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	# The extra escaping leaves \$2 in the string, so one further level of
	# shell expansion (presumably inside wait_update_facet) still hands
	# awk a literal $2.
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(3) unexpected status"

	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_dirs_repaired/ { print $2 }')
	# Quoted so that a missing counter fails the test cleanly.
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate master LMV EA: $repaired"

	# Remount so the listing below goes through a freshly mounted client.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	empty=$(ls "$striped_dir/")
	[ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"

	rmdir "$striped_dir" ||
		error "(8) Fail to remove the striped directory after LFSCK"
}
run_test 31c "Re-generate the lost master LMV EA for striped directory"
# test_31d: the master MDT-object of a striped directory loses its master
# LMV EA (OBD_FAIL_LFSCK_LOST_MASTER_LMV is set on $SINGLEMDS while the
# directory is created) and a file is then created under the broken
# directory. The namespace LFSCK is expected NOT to re-generate the EA
# (striped_dirs_repaired must be 0); instead the directory must refuse new
# files until "chattr -i" is applied to it.
#
# Skipped unless at least 2 MDTs are configured.
# Reads: MDSCOUNT, DIR, tdir, MOUNT, SINGLEMDS, MDT_DEV, LFS, LCTL,
#        START_NAMESPACE, SHOW_NAMESPACE.
# NOTE(review): the "test_31d() { ... }" wrapper is reconstructed from the
# "run_test 31d" registration and from the use of "local"/"return", which
# bash only accepts inside a function -- confirm against the upstream file.
test_31d() {
	[ "$MDSCOUNT" -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If somebody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should NOT re-generate the master LMV EA, instead, it should"
	echo "change the broken striped dirctory as read-only to prevent"
	echo "further damage"

	check_mount_and_prep

	local striped_dir="$DIR/$tdir/striped_dir"
	local repaired

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0

	# Remount before modifying the broken directory.
	umount_client $MOUNT || error "(2) umount failed"
	mount_client $MOUNT || error "(3) mount failed"

	touch "$striped_dir/dummy" ||
		error "(4) Fail to touch under broken striped directory"

	echo "Trigger namespace LFSCK to find out the inconsistency"
	# START_NAMESPACE holds a whole command line; it must stay unquoted
	# so that it is split back into words.
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	# The extra escaping leaves \$2 in the string, so one further level of
	# shell expansion (presumably inside wait_update_facet) still hands
	# awk a literal $2.
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(6) unexpected status"

	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_dirs_repaired/ { print $2 }')
	# Deliberately no default here: a missing counter must NOT be taken
	# for the expected value 0, so an empty string fails the comparison.
	[ "$repaired" -eq 0 ] ||
		error "(7) Re-generate master LMV EA unexpected: $repaired"

	stat "$striped_dir/dummy" ||
		error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"

	# Creating a new file is expected to fail here.
	touch "$striped_dir/foo" &&
		error "(9) The broken striped directory should be read-only"

	chattr -i "$striped_dir" ||
		error "(10) Fail to chattr on the broken striped directory"

	rmdir "$striped_dir" ||
		error "(11) Fail to remove the striped directory after LFSCK"
}
run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test_31e: a slave MDT-object of a striped directory loses its slave LMV EA
# (OBD_FAIL_LFSCK_LOST_SLAVE_LMV with fail_val=0, i.e. per the messages
# below the slave that shares an MDT with the master). The namespace LFSCK
# is expected to re-generate it: striped_shards_repaired must be 1.
#
# Skipped unless at least 2 MDTs are configured.
# Reads: MDSCOUNT, DIR, tdir, SINGLEMDS, MDT_DEV, LFS, LCTL,
#        START_NAMESPACE, SHOW_NAMESPACE.
# NOTE(review): the "test_31e() { ... }" wrapper is reconstructed from the
# "run_test 31e" registration and from the use of "local"/"return", which
# bash only accepts inside a function -- confirm against the upstream file.
test_31e() {
	[ "$MDSCOUNT" -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	local striped_dir="$DIR/$tdir/striped_dir"
	local repaired

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on the same MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	# START_NAMESPACE holds a whole command line; it must stay unquoted
	# so that it is split back into words.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	# The extra escaping leaves \$2 in the string, so one further level of
	# shell expansion (presumably inside wait_update_facet) still hands
	# awk a literal $2.
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(3) unexpected status"

	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_shards_repaired/ { print $2 }')
	# Quoted so that a missing counter fails the test cleanly.
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir "$striped_dir" ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test_31f: a slave MDT-object of a striped directory loses its slave LMV EA
# (OBD_FAIL_LFSCK_LOST_SLAVE_LMV with fail_val=1, i.e. per the messages
# below a slave on a different MDT than the master). The namespace LFSCK is
# expected to re-generate it; status and the striped_shards_repaired
# counter (must be 1) are read from mds2.
#
# Skipped unless at least 2 MDTs are configured.
# Reads: MDSCOUNT, DIR, tdir, SINGLEMDS, LFS, LCTL, START_NAMESPACE.
# NOTE(review): the "test_31f() { ... }" wrapper is reconstructed from the
# "run_test 31f" registration and from the use of "local"/"return", which
# bash only accepts inside a function -- confirm against the upstream file.
test_31f() {
	[ "$MDSCOUNT" -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	local striped_dir="$DIR/$tdir/striped_dir"
	local mds2_lfsck
	local repaired

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on differnt MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	# START_NAMESPACE holds a whole command line; it must stay unquoted
	# so that it is split back into words.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	# Resolve the mds2 parameter name once; it is used for both the
	# status poll and the repair counter.
	mds2_lfsck="mdd.$(facet_svc mds2).lfsck_namespace"

	# The extra escaping leaves \$2 in the string, so one further level of
	# shell expansion (presumably inside wait_update_facet) still hands
	# awk a literal $2.
	wait_update_facet mds2 "$LCTL get_param -n $mds2_lfsck |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(3) unexpected status"

	repaired=$(do_facet mds2 $LCTL get_param -n "$mds2_lfsck" |
		   awk '/^striped_shards_repaired/ { print $2 }')
	# Quoted so that a missing counter fails the test cleanly.
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir "$striped_dir" ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test_31g: the stripe index stored in a slave LMV EA is corrupted
# (OBD_FAIL_LFSCK_BAD_SLAVE_LMV with fail_val=0; per the messages below the
# first shard claims the index of the second one). The namespace LFSCK is
# expected to repair it: striped_shards_repaired must be 1, and after a
# remount a file can be created in and removed from the directory.
#
# Skipped unless at least 2 MDTs are configured.
# Reads: MDSCOUNT, DIR, tdir, MOUNT, SINGLEMDS, MDT_DEV, LFS, LCTL,
#        START_NAMESPACE, SHOW_NAMESPACE.
# NOTE(review): the "test_31g() { ... }" wrapper is reconstructed from the
# "run_test 31g" registration and from the use of "local"/"return", which
# bash only accepts inside a function -- confirm against the upstream file.
test_31g() {
	[ "$MDSCOUNT" -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the stripe index in the slave LMV EA is"
	echo "corrupted. The LFSCK should repair the slave LMV EA."

	check_mount_and_prep

	local striped_dir="$DIR/$tdir/striped_dir"
	local repaired

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave LMV EA on the first shard of the striped directory"
	echo "claims the same index as the second shard claims"

	#define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the slave LMV EA"
	# START_NAMESPACE holds a whole command line; it must stay unquoted
	# so that it is split back into words.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	# The extra escaping leaves \$2 in the string, so one further level of
	# shell expansion (presumably inside wait_update_facet) still hands
	# awk a literal $2.
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(3) unexpected status"

	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_shards_repaired/ { print $2 }')
	# Quoted so that a missing counter fails the test cleanly.
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair slave LMV EA: $repaired"

	# Remount so the checks below go through a freshly mounted client.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch "$striped_dir/foo" ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f "$striped_dir/foo" ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir "$striped_dir" ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31g "Repair the corrupted slave LMV EA"
# test_31h: a shard's name entry in a striped directory is corrupted
# (OBD_FAIL_LFSCK_BAD_SLAVE_NAME with fail_val=0; per the messages below the
# first shard's name entry claims the index of the second one). The
# namespace LFSCK is expected to repair it: dirent_repaired must be 1, and
# after a remount a file can be created in and removed from the directory.
#
# Skipped unless at least 2 MDTs are configured.
# Reads: MDSCOUNT, DIR, tdir, MOUNT, SINGLEMDS, MDT_DEV, LFS, LCTL,
#        START_NAMESPACE, SHOW_NAMESPACE.
# NOTE(review): the "test_31h() { ... }" wrapper is reconstructed from the
# "run_test 31h" registration and from the use of "local"/"return", which
# bash only accepts inside a function -- confirm against the upstream file.
test_31h() {
	[ "$MDSCOUNT" -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the shard's name entry in the striped"
	echo "directory may be corrupted. The LFSCK should repair the"
	echo "bad shard's name entry."

	check_mount_and_prep

	local striped_dir="$DIR/$tdir/striped_dir"
	local repaired

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "first shard's name entry in the striped directory claims"
	echo "the same index as the second shard's name entry claims."

	#define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the shard's name entry"
	# START_NAMESPACE holds a whole command line; it must stay unquoted
	# so that it is split back into words.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	# The extra escaping leaves \$2 in the string, so one further level of
	# shell expansion (presumably inside wait_update_facet) still hands
	# awk a literal $2.
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(3) unexpected status"

	repaired=$($SHOW_NAMESPACE |
		   awk '/^dirent_repaired/ { print $2 }')
	# Quoted so that a missing counter fails the test cleanly.
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair shard's name entry: $repaired"

	# Remount so the checks below go through a freshly mounted client.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch "$striped_dir/foo" ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f "$striped_dir/foo" ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir "$striped_dir" ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31h "Repair the corrupted shard's name entry"
# Put back the MDS/OST sizing that was saved at the top of the script
# before the suite reduced it to speed up formatting.
MDSSIZE="$SAVED_MDSSIZE"
OSTSIZE="$SAVED_OSTSIZE"
OSTCOUNT="$SAVED_OSTCOUNT"
4236 # cleanup the system at last