3 # Run selected tests by setting ONLY, or by passing them as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 # Bug number for each excepted test
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
23 require_dsh_mds || exit 0
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
32 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
33 skip "Need MDS version at least 2.3.60" && exit 0
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # Use a small MDS + OST size to speed up formatting.
41 # Do not make MDSSIZE/OSTSIZE too small, since that affects the default journal size.
44 # There is no need for many OSTs; cap the count to reduce the format/start/stop overhead.
45 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
47 # Build up a clean test environment.
# Append tests to ALWAYS_EXCEPT when the server version is at or below 2.4.90
# (test 2c), or below 2.5.55 (ost1) / 2.6.50 ($SINGLEMDS) for the other lists.
51 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
52 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
54 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
55 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
57 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
58 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
60 # DNE does not support striped directories on a ZFS-based backend yet.
61 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
62 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Device names and reusable command strings used by the tests that follow.
# The START_*/STOP_*/SHOW_* strings are expanded unquoted as commands later on.
66 MDT_DEV="${FSNAME}-MDT0000"
67 OST_DEV="${FSNAME}-OST0000"
68 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
69 START_NAMESPACE="do_facet $SINGLEMDS \
70 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
71 START_LAYOUT="do_facet $SINGLEMDS \
72 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
73 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
74 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
75 SHOW_NAMESPACE="do_facet $SINGLEMDS \
76 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
77 SHOW_LAYOUT="do_facet $SINGLEMDS \
78 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
79 SHOW_LAYOUT_ON_OST="do_facet ost1 \
80 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
81 MOUNT_OPTS_SCRUB="-o user_xattr"
82 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
83 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
# Fragment of a file-population helper; its header and the closing fi/done
# lines are not visible in this excerpt. Driven by $ndirs, $nfiles and $igif:
# it copies the test scripts into $DIR/$tdir and uses createmany to make
# d*, f* and e* entries. When $igif is non-empty, fail_loc=0x1504
# (OBD_FAIL_FID_IGIF) is set on $SINGLEMDS up front, and later
# $DIR/$tdir/dummy is touched and fail_loc is reset to 0.
92 echo "preparing... $nfiles * $ndirs files will be created $(date)."
93 if [ ! -z $igif ]; then
94 #define OBD_FAIL_FID_IGIF 0x1504
95 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
98 cp $LUSTRE/tests/*.sh $DIR/$tdir/
99 if [ $ndirs -gt 0 ]; then
100 createmany -d $DIR/$tdir/d $ndirs
101 createmany -m $DIR/$tdir/f $ndirs
102 if [ $nfiles -gt 0 ]; then
103 for ((i = 0; i < $ndirs; i++)); do
104 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
105 /dev/null || error "createmany $nfiles"
108 createmany -d $DIR/$tdir/e $ndirs
111 if [ ! -z $igif ]; then
112 touch $DIR/$tdir/dummy
113 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
116 echo "prepared $(date)."
# run_e2fsck_on_mdt0: returns immediately unless the $SINGLEMDS backend is
# ldiskfs. Otherwise it stops $SINGLEMDS, runs run_e2fsck with "-n" against
# $(mdsdevname 1) on the facet's active host, reports error (2) on the path
# shown below, and restarts the MDT with $MOUNT_OPTS_NOSCRUB.
# NOTE(review): part of the pipeline and the closing lines of this function
# are not visible in this excerpt.
119 run_e2fsck_on_mdt0() {
120 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
122 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
123 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
125 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
126 error "(2) Detected inconsistency on MDT0"
128 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
129 error "(3) Fail to start MDT0"
# Body of the test registered below as test 0 (function header not visible in
# this excerpt). Sequence:
#   - set fail_val=3 / fail_loc=0x1600 (OBD_FAIL_LFSCK_DELAY1), start namespace
#     LFSCK with -r and expect status 'scanning-phase1';
#   - stop LFSCK and expect 'stopped'; start again and expect 'scanning-phase1';
#   - clear fail_loc/fail_val and wait for 'completed';
#   - expect updated_phase1 == 0 (nothing repaired);
#   - rerun with -r and expect success_count to have grown by exactly one;
#   - finally run stopall (see the LU-3649 note echoed below).
135 #define OBD_FAIL_LFSCK_DELAY1 0x1600
136 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
137 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
139 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
141 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
142 [ "$STATUS" == "scanning-phase1" ] ||
143 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
145 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
147 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
148 [ "$STATUS" == "stopped" ] ||
149 error "(6) Expect 'stopped', but got '$STATUS'"
151 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
153 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
154 [ "$STATUS" == "scanning-phase1" ] ||
155 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
157 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
158 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
159 mdd.${MDT_DEV}.lfsck_namespace |
160 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
162 error "(9) unexpected status"
165 local repaired=$($SHOW_NAMESPACE |
166 awk '/^updated_phase1/ { print $2 }')
167 [ $repaired -eq 0 ] ||
168 error "(10) Expect nothing to be repaired, but got: $repaired"
170 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
171 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
172 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
173 mdd.${MDT_DEV}.lfsck_namespace |
174 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
176 error "(12) unexpected status"
179 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
180 [ $((scanned1 + 1)) -eq $scanned2 ] ||
181 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
183 echo "stopall, should NOT crash LU-3649"
184 stopall || error "(14) Fail to stopall"
186 run_test 0 "Control LFSCK manually"
# Body of the test registered below as test 1a (function header not visible
# in this excerpt). Skipped when the $SINGLEMDS backend is not ldiskfs.
# Touches $DIR/$tdir/dummy while fail_loc=0x1501 (OBD_FAIL_FID_INDIR) is set,
# runs namespace LFSCK to 'completed', and expects exactly one repair
# (dirent_repaired, falling back to updated_phase1 when that value is empty).
# It then mounts the client and lists the directory with fail_loc=0x1505
# (OBD_FAIL_FID_LOOKUP) set.
189 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
190 skip "OI Scrub not implemented for ZFS" && return
194 #define OBD_FAIL_FID_INDIR 0x1501
195 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
196 touch $DIR/$tdir/dummy
198 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
200 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
201 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
202 mdd.${MDT_DEV}.lfsck_namespace |
203 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
205 error "(4) unexpected status"
208 local repaired=$($SHOW_NAMESPACE |
209 awk '/^dirent_repaired/ { print $2 }')
210 # for interop with old server
211 [ -z "$repaired" ] &&
212 repaired=$($SHOW_NAMESPACE |
213 awk '/^updated_phase1/ { print $2 }')
215 [ $repaired -eq 1 ] ||
216 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
220 mount_client $MOUNT || error "(6) Fail to start client!"
222 #define OBD_FAIL_FID_LOOKUP 0x1505
223 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
224 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
226 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
228 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Body of the test registered below as test 1b (function header not visible
# in this excerpt). Skipped when the $SINGLEMDS backend is not ldiskfs.
# Touches $DIR/$tdir/dummy under fail_loc=0x1502 (OBD_FAIL_FID_INLMA), then
# runs namespace LFSCK with fail_loc=0x1506 (OBD_FAIL_FID_NOLMA) set, waits
# for 'completed', and expects exactly one repair (dirent_repaired, falling
# back to updated_phase1 when empty). Finally it mounts the client and stats
# the file with fail_loc=0x1505 (OBD_FAIL_FID_LOOKUP) set.
232 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
233 skip "OI Scrub not implemented for ZFS" && return
237 #define OBD_FAIL_FID_INLMA 0x1502
238 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
239 touch $DIR/$tdir/dummy
241 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
243 #define OBD_FAIL_FID_NOLMA 0x1506
244 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
245 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
246 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
247 mdd.${MDT_DEV}.lfsck_namespace |
248 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
250 error "(4) unexpected status"
253 local repaired=$($SHOW_NAMESPACE |
254 awk '/^dirent_repaired/ { print $2 }')
255 # for interop with old server
256 [ -z "$repaired" ] &&
257 repaired=$($SHOW_NAMESPACE |
258 awk '/^updated_phase1/ { print $2 }')
260 [ $repaired -eq 1 ] ||
261 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
263 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
266 mount_client $MOUNT || error "(6) Fail to start client!"
268 #define OBD_FAIL_FID_LOOKUP 0x1505
269 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
270 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
272 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
274 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Body of the test registered below as test 2a (function header not visible
# in this excerpt). Touches $DIR/$tdir/dummy under fail_loc=0x1603
# (OBD_FAIL_LFSCK_LINKEA_CRASH), runs namespace LFSCK to 'completed', and
# expects exactly one repair (linkea_repaired, falling back to updated_phase2
# when empty). It then mounts the client, checks that stat reports "Links: 1",
# and checks that path2fid followed by fid2path yields the original path.
279 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
280 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
281 touch $DIR/$tdir/dummy
283 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
285 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
286 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
287 mdd.${MDT_DEV}.lfsck_namespace |
288 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
290 error "(4) unexpected status"
293 local repaired=$($SHOW_NAMESPACE |
294 awk '/^linkea_repaired/ { print $2 }')
295 # for interop with old server
296 [ -z "$repaired" ] &&
297 repaired=$($SHOW_NAMESPACE |
298 awk '/^updated_phase2/ { print $2 }')
300 [ $repaired -eq 1 ] ||
301 error "(5) Fail to repair crashed linkEA: $repaired"
305 mount_client $MOUNT || error "(6) Fail to start client!"
307 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
308 error "(7) Fail to stat $DIR/$tdir/dummy"
310 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
311 local dummyname=$($LFS fid2path $DIR $dummyfid)
312 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
313 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
315 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Body of the test registered below as test 2b (function header not visible
# in this excerpt). Touches $DIR/$tdir/dummy under fail_loc=0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE), runs namespace LFSCK to 'completed', and
# expects updated_phase2 == 1. It then mounts the client, checks that stat
# reports "Links: 1", and checks the path2fid/fid2path round trip.
321 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
322 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
323 touch $DIR/$tdir/dummy
325 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
327 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
328 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
329 mdd.${MDT_DEV}.lfsck_namespace |
330 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
332 error "(4) unexpected status"
335 local repaired=$($SHOW_NAMESPACE |
336 awk '/^updated_phase2/ { print $2 }')
337 [ $repaired -eq 1 ] ||
338 error "(5) Fail to repair crashed linkEA: $repaired"
342 mount_client $MOUNT || error "(6) Fail to start client!"
344 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
345 error "(7) Fail to stat $DIR/$tdir/dummy"
347 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
348 local dummyname=$($LFS fid2path $DIR $dummyfid)
349 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
350 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
352 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Body of the test registered below as test 2c (function header not visible
# in this excerpt). Touches $DIR/$tdir/dummy under fail_loc=0x1605
# (OBD_FAIL_LFSCK_LINKEA_MORE2), runs namespace LFSCK to 'completed', and
# expects updated_phase2 == 1. It then mounts the client, checks that stat
# reports "Links: 1", and checks the path2fid/fid2path round trip.
358 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
359 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
360 touch $DIR/$tdir/dummy
362 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
364 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
365 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
366 mdd.${MDT_DEV}.lfsck_namespace |
367 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
369 error "(4) unexpected status"
372 local repaired=$($SHOW_NAMESPACE |
373 awk '/^updated_phase2/ { print $2 }')
374 [ $repaired -eq 1 ] ||
375 error "(5) Fail to repair crashed linkEA: $repaired"
379 mount_client $MOUNT || error "(6) Fail to start client!"
381 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
382 error "(7) Fail to stat $DIR/$tdir/dummy"
384 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
385 local dummyname=$($LFS fid2path $DIR $dummyfid)
386 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
387 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
389 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Body of the test registered below as test 2d (function header not visible
# in this excerpt). Touches $DIR/$tdir/dummy under fail_loc=0x161d
# (OBD_FAIL_LFSCK_NO_LINKEA), runs namespace LFSCK to 'completed', and
# expects linkea_repaired == 1. It then mounts the client, checks that stat
# reports "Links: 1", and checks the path2fid/fid2path round trip.
395 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
396 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
397 touch $DIR/$tdir/dummy
399 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
401 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
402 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
403 mdd.${MDT_DEV}.lfsck_namespace |
404 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
406 error "(4) unexpected status"
409 local repaired=$($SHOW_NAMESPACE |
410 awk '/^linkea_repaired/ { print $2 }')
411 [ $repaired -eq 1 ] ||
412 error "(5) Fail to repair crashed linkEA: $repaired"
416 mount_client $MOUNT || error "(6) Fail to start client!"
418 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
419 error "(7) Fail to stat $DIR/$tdir/dummy"
421 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
422 local dummyname=$($LFS fid2path $DIR $dummyfid)
423 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
424 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
426 run_test 2d "LFSCK can recover the missing linkEA entry"
# Body of the test registered below as test 2e (function header not visible
# in this excerpt). Skipped when $MDSCOUNT is below 2. Creates d0 with
# "lfs mkdir -i 1" and, under fail_loc=0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH),
# d0/d1 with "lfs mkdir -i 0". Runs namespace LFSCK with "-r -A" to
# 'completed', expects linkea_repaired == 1, and checks that the
# path2fid/fid2path round trip for d0/d1 yields the original path.
430 [ $MDSCOUNT -lt 2 ] &&
431 skip "We need at least 2 MDSes for this test" && return
435 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
437 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
438 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
439 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
440 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
442 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
443 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
444 mdd.${MDT_DEV}.lfsck_namespace |
445 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
447 error "(4) unexpected status"
450 local repaired=$($SHOW_NAMESPACE |
451 awk '/^linkea_repaired/ { print $2 }')
452 [ $repaired -eq 1 ] ||
453 error "(5) Fail to repair crashed linkEA: $repaired"
455 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
456 local name=$($LFS fid2path $DIR $fid)
457 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
458 error "(6) Fail to repair linkEA: $fid $name"
460 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Body of the test registered below as test 3 (function header not visible
# in this excerpt). Hard-links d0/f0 and d0/f1 into a new dummy/ directory,
# creates edir/f0 and edir/f1, then hard-links them to edir/w0 under
# fail_loc=0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH) and to edir/w1 under
# fail_loc=0x1604 (OBD_FAIL_LFSCK_LINKEA_MORE). After namespace LFSCK reaches
# 'completed' it expects checked_phase2 >= 4 and
# multiple_linked_repaired >= 2.
466 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
467 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
468 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
470 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
471 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
472 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
474 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
475 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
476 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
478 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
479 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
480 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
482 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
484 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
485 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
486 mdd.${MDT_DEV}.lfsck_namespace |
487 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
489 error "(10) unexpected status"
492 local checked=$($SHOW_NAMESPACE |
493 awk '/^checked_phase2/ { print $2 }')
494 [ $checked -ge 4 ] ||
495 error "(11) Fail to check multiple-linked object: $checked"
497 local repaired=$($SHOW_NAMESPACE |
498 awk '/^multiple_linked_repaired/ { print $2 }')
499 [ $repaired -ge 2 ] ||
500 error "(12) Fail to repair multiple-linked object: $repaired"
502 run_test 3 "LFSCK can verify multiple-linked objects"
# Body of the test registered below as test 4 (function header not visible
# in this excerpt). Skipped when the $SINGLEMDS backend is not ldiskfs.
# Unmounts the client, stops the MDS, runs mds_backup_restore, and restarts
# the MDS with $MOUNT_OPTS_NOSCRUB. With fail_val=1 / fail_loc=0x1601
# (OBD_FAIL_LFSCK_DELAY2) it starts namespace LFSCK, waits for flags
# 'inconsistent' and expects status 'scanning-phase1'. After clearing the
# fail_loc it waits for 'completed', expects empty flags and at least 9
# repairs (dirent_repaired, or updated_phase1 when empty), then mounts the
# client and lists the directory under fail_loc=0x1505.
# NOTE(review): FLAGS is assigned without 'local' in this block.
506 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
507 skip "OI Scrub not implemented for ZFS" && return
510 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
511 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
513 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
514 echo "start $SINGLEMDS with disabling OI scrub"
515 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
516 error "(2) Fail to start MDS!"
518 #define OBD_FAIL_LFSCK_DELAY2 0x1601
519 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
520 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
521 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
522 mdd.${MDT_DEV}.lfsck_namespace |
523 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
525 error "(5) unexpected status"
528 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
529 [ "$STATUS" == "scanning-phase1" ] ||
530 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
532 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
533 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
534 mdd.${MDT_DEV}.lfsck_namespace |
535 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
537 error "(7) unexpected status"
540 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
541 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
543 local repaired=$($SHOW_NAMESPACE |
544 awk '/^dirent_repaired/ { print $2 }')
545 # for interop with old server
546 [ -z "$repaired" ] &&
547 repaired=$($SHOW_NAMESPACE |
548 awk '/^updated_phase1/ { print $2 }')
550 [ $repaired -ge 9 ] ||
551 error "(9) Fail to re-generate FID-in-dirent: $repaired"
555 mount_client $MOUNT || error "(10) Fail to start client!"
557 #define OBD_FAIL_FID_LOOKUP 0x1505
558 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
559 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
560 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
562 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Body of the test registered below as test 5 (function header not visible
# in this excerpt). Skipped when the $SINGLEMDS backend is not ldiskfs.
# Same flow as the backup/restore check in test 4, except that
# mds_backup_restore is called with an extra argument "1", the flags waited
# for are 'inconsistent,upgrade', and at least 2 repairs are expected.
# After mounting the client it runs stat and ls under fail_loc=0x1505
# (OBD_FAIL_FID_LOOKUP), then checks that the path2fid/fid2path round trip
# for $DIR/$tdir/dummy yields the original path.
# NOTE(review): FLAGS is assigned without 'local' in this block.
566 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
567 skip "OI Scrub not implemented for ZFS" && return
570 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
571 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
573 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
574 echo "start $SINGLEMDS with disabling OI scrub"
575 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
576 error "(2) Fail to start MDS!"
578 #define OBD_FAIL_LFSCK_DELAY2 0x1601
579 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
580 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
581 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
582 mdd.${MDT_DEV}.lfsck_namespace |
583 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
585 error "(5) unexpected status"
588 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
589 [ "$STATUS" == "scanning-phase1" ] ||
590 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
592 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
593 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
594 mdd.${MDT_DEV}.lfsck_namespace |
595 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
597 error "(7) unexpected status"
600 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
601 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
603 local repaired=$($SHOW_NAMESPACE |
604 awk '/^dirent_repaired/ { print $2 }')
605 # for interop with old server
606 [ -z "$repaired" ] &&
607 repaired=$($SHOW_NAMESPACE |
608 awk '/^updated_phase1/ { print $2 }')
610 [ $repaired -ge 2 ] ||
611 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
615 mount_client $MOUNT || error "(10) Fail to start client!"
617 #define OBD_FAIL_FID_LOOKUP 0x1505
618 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
619 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
621 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
623 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
624 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
625 local dummyname=$($LFS fid2path $DIR $dummyfid)
626 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
627 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
629 run_test 5 "LFSCK can handle IGIF object upgrading"
# Body of the test registered below as test 6a (function header not visible
# in this excerpt). Starts namespace LFSCK with -r under fail_val=1 /
# fail_loc=0x1600 (OBD_FAIL_LFSCK_DELAY1) and expects 'scanning-phase1'.
# It then sets fail_loc=0x80001608 (OBD_FAIL_LFSCK_FATAL1) and waits for
# status 'failed', recording last_checkpoint_position as POS0. LFSCK is
# started again (without -r) under 0x1600; latest_start_position is read as
# POS1 and POS0 < POS1 is required. Finally the fail_loc is cleared and the
# test waits for 'completed'.
# NOTE(review): the tail of each POS pipeline is not visible in this excerpt.
634 #define OBD_FAIL_LFSCK_DELAY1 0x1600
635 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
636 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
638 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
639 [ "$STATUS" == "scanning-phase1" ] ||
640 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
642 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
644 # Fail the LFSCK to guarantee there is at least one checkpoint
645 #define OBD_FAIL_LFSCK_FATAL1 0x1608
646 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
647 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
648 mdd.${MDT_DEV}.lfsck_namespace |
649 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
651 error "(4) unexpected status"
654 local POS0=$($SHOW_NAMESPACE |
655 awk '/^last_checkpoint_position/ { print $2 }' |
658 #define OBD_FAIL_LFSCK_DELAY1 0x1600
659 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
660 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
662 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
663 [ "$STATUS" == "scanning-phase1" ] ||
664 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
666 local POS1=$($SHOW_NAMESPACE |
667 awk '/^latest_start_position/ { print $2 }' |
669 [[ $POS0 -lt $POS1 ]] ||
670 error "(7) Expect larger than: $POS0, but got $POS1"
672 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
673 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
674 mdd.${MDT_DEV}.lfsck_namespace |
675 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
677 error "(8) unexpected status"
680 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Body of the test registered below as test 6b (function header not visible
# in this excerpt). Same shape as test 6a, but using fail_loc=0x1601
# (OBD_FAIL_LFSCK_DELAY2) and 0x80001609 (OBD_FAIL_LFSCK_FATAL2). Two values
# are taken from each position line: awk field $2 (O_POS*) and field $4
# (D_POS*). When either D_POS value is "N/A" the O_POS values are compared;
# the D_POS comparison (7.2) presumably sits in the else branch, whose
# else/fi lines are not visible in this excerpt.
685 #define OBD_FAIL_LFSCK_DELAY2 0x1601
686 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
687 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
689 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
690 [ "$STATUS" == "scanning-phase1" ] ||
691 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
693 # Sleep 5 sec to guarantee that we are in the directory scanning
695 # Fail the LFSCK to guarantee there is at least one checkpoint
696 #define OBD_FAIL_LFSCK_FATAL2 0x1609
697 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
698 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
699 mdd.${MDT_DEV}.lfsck_namespace |
700 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
702 error "(4) unexpected status"
705 local O_POS0=$($SHOW_NAMESPACE |
706 awk '/^last_checkpoint_position/ { print $2 }' |
709 local D_POS0=$($SHOW_NAMESPACE |
710 awk '/^last_checkpoint_position/ { print $4 }')
712 #define OBD_FAIL_LFSCK_DELAY2 0x1601
713 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
714 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
716 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
717 [ "$STATUS" == "scanning-phase1" ] ||
718 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
720 local O_POS1=$($SHOW_NAMESPACE |
721 awk '/^latest_start_position/ { print $2 }' |
723 local D_POS1=$($SHOW_NAMESPACE |
724 awk '/^latest_start_position/ { print $4 }')
726 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
727 [[ $O_POS0 -lt $O_POS1 ]] ||
728 error "(7.1) $O_POS1 is not larger than $O_POS0"
730 [[ $D_POS0 -lt $D_POS1 ]] ||
731 error "(7.2) $D_POS1 is not larger than $D_POS0"
734 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
735 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
736 mdd.${MDT_DEV}.lfsck_namespace |
737 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
739 error "(8) unexpected status"
742 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Body of the test registered below as test 7a (function header not visible
# in this excerpt). Starts namespace LFSCK with -r under fail_val=1 /
# fail_loc=0x1601 (OBD_FAIL_LFSCK_DELAY2) and expects 'scanning-phase1'.
# It then stops the MDS, resets fail_loc/fail_val to 0, and starts the MDS
# with $MOUNT_OPTS_SCRUB. No further lfsck_start is issued before the test
# waits for status 'completed'.
749 #define OBD_FAIL_LFSCK_DELAY2 0x1601
750 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
751 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
753 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
754 [ "$STATUS" == "scanning-phase1" ] ||
755 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
757 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
759 echo "stop $SINGLEMDS"
760 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
762 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
763 echo "start $SINGLEMDS"
764 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
765 error "(5) Fail to start MDS!"
767 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
768 mdd.${MDT_DEV}.lfsck_namespace |
769 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
771 error "(6) unexpected status"
774 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Body of the test registered below as test 7b (function header not visible
# in this excerpt). Touches dummy0..dummy19 under fail_loc=0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE), then starts namespace LFSCK with -r under
# fail_val=1 / fail_loc=0x1602 (OBD_FAIL_LFSCK_DELAY3) and waits for
# 'scanning-phase2'. It then stops the MDS, resets fail_loc/fail_val to 0,
# starts the MDS with $MOUNT_OPTS_SCRUB, and waits for 'completed' without
# issuing another lfsck_start.
780 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
781 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
782 for ((i = 0; i < 20; i++)); do
783 touch $DIR/$tdir/dummy${i}
786 #define OBD_FAIL_LFSCK_DELAY3 0x1602
787 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
788 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
789 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
790 mdd.${MDT_DEV}.lfsck_namespace |
791 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
793 error "(4) unexpected status"
797 echo "stop $SINGLEMDS"
798 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
800 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
801 echo "start $SINGLEMDS"
802 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
803 error "(6) Fail to start MDS!"
805 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
806 mdd.${MDT_DEV}.lfsck_namespace |
807 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
809 error "(7) unexpected status"
812 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Body of the test registered below as test 8 (function header not visible
# in this excerpt). After formatall, it walks the namespace LFSCK status
# through the following sequence, checking each value:
#   init -> scanning-phase1 -> stopped -> scanning-phase1 -> failed
#   -> scanning-phase1 -> crashed -> scanning-phase1 -> paused -> paused
#   -> scanning-phase2 -> completed
# Transitions are driven by fail_loc injections (0x1601, 0x80001609, 0x1600,
# 0x160a, 0x160b, 0x1602), by lfsck_start/lfsck_stop, and by stopping and
# restarting the MDS (the last restart uses $MOUNT_OPTS_SKIP_LFSCK).
# After each MDS restart a loop polls mdt.*.recovery_status until it is no
# longer "RECOVERING" or $timeout (from max_recovery_time) is reached.
# NOTE(review): the lines that initialise/advance $timer and close the loops
# are not visible in this excerpt.
817 formatall > /dev/null
823 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
824 [ "$STATUS" == "init" ] ||
825 error "(2) Expect 'init', but got '$STATUS'"
827 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
828 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
829 mkdir $DIR/$tdir/crashed
831 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
832 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
833 for ((i = 0; i < 5; i++)); do
834 touch $DIR/$tdir/dummy${i}
837 umount_client $MOUNT || error "(3) Fail to stop client!"
839 #define OBD_FAIL_LFSCK_DELAY2 0x1601
840 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
841 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
843 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
844 [ "$STATUS" == "scanning-phase1" ] ||
845 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
847 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
849 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
850 [ "$STATUS" == "stopped" ] ||
851 error "(7) Expect 'stopped', but got '$STATUS'"
853 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
855 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
856 [ "$STATUS" == "scanning-phase1" ] ||
857 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
859 #define OBD_FAIL_LFSCK_FATAL2 0x1609
860 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
861 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
862 mdd.${MDT_DEV}.lfsck_namespace |
863 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
865 error "(10) unexpected status"
868 #define OBD_FAIL_LFSCK_DELAY1 0x1600
869 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
870 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
872 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
873 [ "$STATUS" == "scanning-phase1" ] ||
874 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
876 #define OBD_FAIL_LFSCK_CRASH 0x160a
877 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
880 echo "stop $SINGLEMDS"
881 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
883 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
884 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
886 echo "start $SINGLEMDS"
887 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
888 error "(14) Fail to start MDS!"
890 local timeout=$(max_recovery_time)
893 while [ $timer -lt $timeout ]; do
894 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
895 mdt.${MDT_DEV}.recovery_status |
896 awk '/^status/ { print \\\$2 }'")
897 [ "$STATUS" != "RECOVERING" ] && break;
902 [ $timer != $timeout ] ||
903 error "(14.1) recovery timeout"
905 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
906 [ "$STATUS" == "crashed" ] ||
907 error "(15) Expect 'crashed', but got '$STATUS'"
909 #define OBD_FAIL_LFSCK_DELAY2 0x1601
910 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
911 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
913 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
914 [ "$STATUS" == "scanning-phase1" ] ||
915 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
917 echo "stop $SINGLEMDS"
918 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
920 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
921 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
923 echo "start $SINGLEMDS"
924 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
925 error "(19) Fail to start MDS!"
928 while [ $timer -lt $timeout ]; do
929 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
930 mdt.${MDT_DEV}.recovery_status |
931 awk '/^status/ { print \\\$2 }'")
932 [ "$STATUS" != "RECOVERING" ] && break;
937 [ $timer != $timeout ] ||
938 error "(19.1) recovery timeout"
940 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
941 [ "$STATUS" == "paused" ] ||
942 error "(20) Expect 'paused', but got '$STATUS'"
944 echo "stop $SINGLEMDS"
945 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
947 echo "start $SINGLEMDS without resume LFSCK"
948 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
949 error "(20.2) Fail to start MDS!"
952 while [ $timer -lt $timeout ]; do
953 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
954 mdt.${MDT_DEV}.recovery_status |
955 awk '/^status/ { print \\\$2 }'")
956 [ "$STATUS" != "RECOVERING" ] && break;
961 [ $timer != $timeout ] ||
962 error "(20.3) recovery timeout"
964 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
965 [ "$STATUS" == "paused" ] ||
966 error "(20.4) Expect 'paused', but got '$STATUS'"
968 #define OBD_FAIL_LFSCK_DELAY3 0x1602
969 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
971 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
972 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
973 mdd.${MDT_DEV}.lfsck_namespace |
974 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
976 error "(22) unexpected status"
979 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
980 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
981 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
983 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
984 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
985 mdd.${MDT_DEV}.lfsck_namespace |
986 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
988 error "(24) unexpected status"
991 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
992 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
994 run_test 8 "LFSCK state machine"
# Body of the test registered below as test 9a (function header not visible
# in this excerpt). Calls skip when /proc/cpuinfo has no line matching
# "processor.*: 1", and when $server_version is below 2.7.50.
# Creates 5000 files under $DIR/$tdir/lfsck, starts layout LFSCK with
# "-r -s $BASE_SPEED1" (100), and checks average_speed_phase1 against
# MAX_SPEED. It then sets lfsck_speed_limit to $BASE_SPEED2 (300) and checks
# the speed against MIN_SPEED and MAX_SPEED. Finally it sets the limit to 0
# and waits for status 'completed'.
# NOTE(review): the assignments of server_version, RUN_TIME1, RUN_TIME2 and
# TIME_DIFF, and the sleeps between measurements, are not visible in this
# excerpt; STATUS is assigned without 'local' here.
997 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
998 skip "Testing on UP system, the speed may be inaccurate."
1002 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1003 { skip "Need MDS version >= 2.7.50"; return; }
1005 check_mount_and_prep
1006 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1007 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1008 createmany -o $DIR/$tdir/lfsck/f 5000
1010 local BASE_SPEED1=100
1012 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
1015 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1016 [ "$STATUS" == "scanning-phase1" ] ||
1017 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1019 local SPEED=$($SHOW_LAYOUT |
1020 awk '/^average_speed_phase1/ { print $2 }')
1022 # There may be some timing error; normally it should be less than 2 seconds.
1023 # We allow another 20% of scheduling error on top of that.
1025 # MAX_MARGIN = 1.2 = 12 / 10
1026 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1027 RUN_TIME1 * 12 / 10))
1028 [ $SPEED -lt $MAX_SPEED ] ||
1029 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
1031 # adjust speed limit
1032 local BASE_SPEED2=300
1034 do_facet $SINGLEMDS \
1035 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1038 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1039 # MIN_MARGIN = 0.8 = 8 / 10
1040 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1041 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1042 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
1043 [ $SPEED -gt $MIN_SPEED ] || {
1044 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1045 error_ignore LU-5624 \
1046 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1049 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1053 # MAX_MARGIN = 1.2 = 12 / 10
1054 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1055 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1056 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1057 [ $SPEED -lt $MAX_SPEED ] ||
1058 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
1060 do_facet $SINGLEMDS \
1061 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1063 wait_update_facet $SINGLEMDS \
1064 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1065 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1066 error "(7) Failed to get expected 'completed'"
1068 run_test 9a "LFSCK speed control (1)"
# test_9b: same speed-limit check as test 9a, but for the namespace LFSCK
# and using the phase-2 average speed (average_speed_phase2).
# No "processor : 1" entry in /proc/cpuinfo is treated as a UP system: skip.
1071 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1072 skip "Testing on UP system, the speed may be inaccurate."
1076 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1077 { skip "Need MDS version >= 2.7.50"; return; }
# Create 50 directories with 50 files each (plus 50 top-level files) while
# fail_loc 0x1604 is set on the MDS.
1081 echo "Preparing another 50 * 50 files (with error) at $(date)."
1082 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1083 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1084 createmany -d $DIR/$tdir/d 50
1085 createmany -m $DIR/$tdir/f 50
1086 for ((i = 0; i < 50; i++)); do
1087 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# First run, with fail_loc 0x160c set: wait for it to end as 'stopped'.
1090 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1091 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1092 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1093 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1094 mdd.${MDT_DEV}.lfsck_namespace |
1095 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1097 error "(5) unexpected status"
1100 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1101 echo "Prepared at $(date)."
# Second run: started without -r and with -s $BASE_SPEED1; it is expected
# to be found in scanning-phase2.
1103 local BASE_SPEED1=50
1105 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1108 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1109 [ "$STATUS" == "scanning-phase2" ] ||
1110 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1112 local SPEED=$($SHOW_NAMESPACE |
1113 awk '/^average_speed_phase2/ { print $2 }')
1114 # There may be a timing error; normally it should be less than 2 seconds.
1115 # We allow another 20% schedule error.
1117 # MAX_MARGIN = 1.2 = 12 / 10
# Integer shell arithmetic: the 1.2 factor is applied as "* 12 / 10".
1118 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1119 RUN_TIME1 * 12 / 10))
1120 [ $SPEED -lt $MAX_SPEED ] ||
1121 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1123 # adjust speed limit
1124 local BASE_SPEED2=150
1126 do_facet $SINGLEMDS \
1127 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Sample again; the bounds below weight BASE_SPEED1 over RUN_TIME1 and
# BASE_SPEED2 over RUN_TIME2.
1130 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1131 # MIN_MARGIN = 0.8 = 8 / 10
1132 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1133 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1134 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
1135 [ $SPEED -gt $MIN_SPEED ] || {
# On a non-ldiskfs MDT a too-low speed is reported via error_ignore (LU-5624).
1136 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1137 error_ignore LU-5624 \
1138 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1141 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1145 # MAX_MARGIN = 1.2 = 12 / 10
1146 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1147 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1148 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1149 [ $SPEED -lt $MAX_SPEED ] ||
1150 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
# Set the limit to 0, then wait for the scan to report 'completed'.
1152 do_facet $SINGLEMDS \
1153 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1154 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1155 mdd.${MDT_DEV}.lfsck_namespace |
1156 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1158 error "(11) unexpected status"
1161 run_test 9b "LFSCK speed control (2)"
# test_10: ordinary client operations must keep working while the namespace
# LFSCK is in scanning-phase1. Skipped unless the MDT backend is ldiskfs.
1165 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1166 skip "lookup(..)/linkea on ZFS issue" && return
# Even-numbered d<i>/f<i> entries are created with fail_loc 0x1603 set,
# odd-numbered ones with fail_loc 0x1604 set.
1170 echo "Preparing more files with error at $(date)."
1171 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1172 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1174 for ((i = 0; i < 1000; i = $((i+2)))); do
1175 mkdir -p $DIR/$tdir/d${i}
1176 touch $DIR/$tdir/f${i}
1177 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1180 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1181 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1183 for ((i = 1; i < 1000; i = $((i+2)))); do
1184 mkdir -p $DIR/$tdir/d${i}
1185 touch $DIR/$tdir/f${i}
1186 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1189 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1190 echo "Prepared at $(date)."
# Give f200 a second name, then remount the client before scanning.
1192 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
1194 umount_client $MOUNT
1195 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the scan with a speed limit of 100 so it is still in phase 1 while
# the operations below run.
1197 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1200 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1201 [ "$STATUS" == "scanning-phase1" ] ||
1202 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# One operation of each kind: list, create, mkdir, unlink, remove a
# directory tree, rename, hard link, symbolic link.
1204 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1206 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1208 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1210 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1212 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1214 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1216 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1218 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1219 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after the operations.
1221 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1222 [ "$STATUS" == "scanning-phase1" ] ||
1223 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set the limit to 0, then wait for the scan to report 'completed'.
1225 do_facet $SINGLEMDS \
1226 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1227 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1228 mdd.${MDT_DEV}.lfsck_namespace |
1229 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1231 error "(16) unexpected status"
1234 run_test 10 "System is available during LFSCK scanning"
# ost_remove_lastid: mount an OST backend locally, delete the LAST_ID file
# (and object d0/0) under O/<idx>/, then unmount it again.
# NOTE(review): 'ost' and 'idx' are not assigned in the lines visible here;
# presumably they come from $1 and $2 (the caller passes "1 0") - confirm.
# Returns 1 if the local mount fails, 2 if the unmount fails.
1237 ost_remove_lastid() {
1240 local rcmd="do_facet ost${ost}"
1242 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1244 # step 1: local mount
1245 mount_fstype ost${ost} || return 1
1246 # step 2: remove the specified LAST_ID
# Brace expansion yields two paths: .../O/<idx>/LAST_ID and .../O/<idx>/d0/0.
1247 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1249 unmount_fstype ost${ost} || return 2
# test_11a: remove LAST_ID on ost1, then let the layout LFSCK running on the
# OST rebuild it.
# Create 64 single-stripe files on OST index 0.
1253 check_mount_and_prep
1254 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1255 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1260 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1262 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1263 error "(2) Fail to start ost1"
# fail_loc 0x160e with fail_val=3 is set on ost1 before the scan starts.
1265 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1266 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1268 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1269 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# While the fail_loc is set, the flags field should show 'crashed_lastid'.
1271 wait_update_facet ost1 "$LCTL get_param -n \
1272 obdfilter.${OST_DEV}.lfsck_layout |
1273 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1275 error "(5) unexpected status"
# Clear the fail_loc and wait for the scan to report 'completed'.
1278 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1280 wait_update_facet ost1 "$LCTL get_param -n \
1281 obdfilter.${OST_DEV}.lfsck_layout |
1282 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1284 error "(6) unexpected status"
# After completion the flags field must be empty again.
1287 echo "the LAST_ID(s) should have been rebuilt"
1288 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1289 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1291 run_test 11a "LFSCK can rebuild lost last_id"
# test_11b: make the on-disk LAST_ID on ost1 fall behind the objects actually
# created, then check that the layout LFSCK on the OST rebuilds it.
1294 check_mount_and_prep
1295 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1297 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1298 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1299 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 files more than the current precreated object count.
1301 local count=$(precreated_ost_obj_count 0 0)
1303 createmany -o $DIR/$tdir/f $((count + 32))
# lastid1: the last_id value reported by ost1 for sequence $seq (the field
# after ':' on the matching line) before the restart.
1305 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1306 local seq=$(do_facet mds1 $LCTL get_param -n \
1307 osp.${proc_path}.prealloc_last_seq)
1308 local lastid1=$(do_facet ost1 "lctl get_param -n \
1309 obdfilter.${ost1_svc}.last_id" | grep $seq |
1310 awk -F: '{ print $2 }')
1312 umount_client $MOUNT
1313 stop ost1 || error "(1) Fail to stop ost1"
1315 #define OBD_FAIL_OST_ENOSPC 0x215
1316 do_facet ost1 $LCTL set_param fail_loc=0x215
1318 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1319 error "(2) Fail to start ost1"
# Poll up to 60 times until ost1 reports a last_id for $seq again.
# NOTE(review): lastid2 is not declared 'local' in the lines visible here.
1321 for ((i = 0; i < 60; i++)); do
1322 lastid2=$(do_facet ost1 "lctl get_param -n \
1323 obdfilter.${ost1_svc}.last_id" | grep $seq |
1324 awk -F: '{ print $2 }')
1325 [ ! -z $lastid2 ] && break;
1329 echo "the on-disk LAST_ID should be smaller than the expected one"
1330 [ $lastid1 -gt $lastid2 ] ||
1331 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1333 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1334 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1336 wait_update_facet ost1 "$LCTL get_param -n \
1337 obdfilter.${OST_DEV}.lfsck_layout |
1338 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1340 error "(6) unexpected status"
# Restart ost1 so that the last_id checked below is read after a fresh start.
1343 stop ost1 || error "(7) Fail to stop ost1"
1345 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1346 error "(8) Fail to start ost1"
1348 echo "the on-disk LAST_ID should have been rebuilt"
# The reported last_id must come back to the value saved in lastid1.
1349 wait_update_facet ost1 "$LCTL get_param -n \
1350 obdfilter.${ost1_svc}.last_id | grep $seq |
1351 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1352 do_facet ost1 $LCTL get_param -n \
1353 obdfilter.${ost1_svc}.last_id
1354 error "(9) expect lastid1 $seq:$lastid1"
1357 do_facet ost1 $LCTL set_param fail_loc=0
1358 stopall || error "(10) Fail to stopall"
1360 run_test 11b "LFSCK can rebuild crashed last_id"
# test_12: one 'lctl lfsck_start -A' / 'lfsck_stop -A' issued on mds1 must
# start / stop the LFSCK on every target. Checked first for the namespace
# LFSCK, then for the layout LFSCK. Needs at least 2 MDSes.
1363 [ $MDSCOUNT -lt 2 ] &&
1364 skip "We need at least 2 MDSes for test_12" && return
# One directory per MDT (index k - 1), each holding 100 files.
1366 check_mount_and_prep
1367 for k in $(seq $MDSCOUNT); do
1368 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1369 createmany -o $DIR/$tdir/${k}/f 100 ||
1370 error "(0) Fail to create 100 files."
1373 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1374 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1375 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1377 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1378 for k in $(seq $MDSCOUNT); do
1379 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1380 mdd.$(facet_svc mds${k}).lfsck_namespace |
1381 awk '/^status/ { print $2 }')
1382 [ "$STATUS" == "scanning-phase1" ] ||
1383 error "(3) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1386 echo "Stop namespace LFSCK on all targets by single lctl command."
1387 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1388 error "(4) Fail to stop LFSCK on all devices!"
1390 echo "All the LFSCK targets should be in 'stopped' status."
1391 for k in $(seq $MDSCOUNT); do
1392 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1393 mdd.$(facet_svc mds${k}).lfsck_namespace |
1394 awk '/^status/ { print $2 }')
1395 [ "$STATUS" == "stopped" ] ||
1396 error "(5) MDS${k} Expect 'stopped', but got '$STATUS'"
1399 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1400 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1401 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1403 echo "All the LFSCK targets should be in 'completed' status."
1404 for k in $(seq $MDSCOUNT); do
1405 wait_update_facet mds${k} "$LCTL get_param -n \
1406 mdd.$(facet_svc mds${k}).lfsck_namespace |
1407 awk '/^status/ { print \\\$2 }'" "completed" 8 ||
1408 error "(7) MDS${k} is not the expected 'completed'"
# Second half: the same start / stop / restart sequence for the layout
# LFSCK, wrapped in start/stop_full_debug_logging.
1411 start_full_debug_logging
1413 echo "Start layout LFSCK on all targets by single command (-s 1)."
1414 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1415 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1417 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1418 for k in $(seq $MDSCOUNT); do
1419 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1420 mdd.$(facet_svc mds${k}).lfsck_layout |
1421 awk '/^status/ { print $2 }')
1422 [ "$STATUS" == "scanning-phase1" ] ||
1423 error "(9) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1426 echo "Stop layout LFSCK on all targets by single lctl command."
1427 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1428 error "(10) Fail to stop LFSCK on all devices!"
1430 echo "All the LFSCK targets should be in 'stopped' status."
1431 for k in $(seq $MDSCOUNT); do
1432 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1433 mdd.$(facet_svc mds${k}).lfsck_layout |
1434 awk '/^status/ { print $2 }')
1435 [ "$STATUS" == "stopped" ] ||
1436 error "(11) MDS${k} Expect 'stopped', but got '$STATUS'"
# For the layout LFSCK the OSTs are checked as well (obdfilter.* parameter).
1439 for k in $(seq $OSTCOUNT); do
1440 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1441 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1442 awk '/^status/ { print $2 }')
1443 [ "$STATUS" == "stopped" ] ||
1444 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1447 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1448 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1449 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1451 echo "All the LFSCK targets should be in 'completed' status."
1452 for k in $(seq $MDSCOUNT); do
1453 # The LFSCK status query interval is 30 seconds. For the case
1454 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1455 # time to guarantee the status sync up.
1456 wait_update_facet mds${k} "$LCTL get_param -n \
1457 mdd.$(facet_svc mds${k}).lfsck_layout |
1458 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1459 error "(14) MDS${k} is not the expected 'completed'"
1462 stop_full_debug_logging
1464 run_test 12 "single command to trigger LFSCK on all devices"
# test_13: create 32 files while fail_loc 0x160f is set on the MDS, then
# expect the layout LFSCK to report exactly 32 'repaired_others'.
1468 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1469 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1470 echo "MDT-object FID."
1473 check_mount_and_prep
1475 echo "Inject failure stub to simulate bad lmm_oi"
1476 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1477 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1478 createmany -o $DIR/$tdir/f 32
1479 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1481 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1482 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1484 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1485 mdd.${MDT_DEV}.lfsck_layout |
1486 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1488 error "(2) unexpected status"
# One repair is expected per file created above.
1491 local repaired=$($SHOW_LAYOUT |
1492 awk '/^repaired_others/ { print $2 }')
1493 [ $repaired -eq 32 ] ||
1494 error "(3) Fail to repair crashed lmm_oi: $repaired"
1496 run_test 13 "LFSCK can repair crashed lmm_oi"
# test_14: files created while fail_loc 0x1610 is set on ost1 end up with
# dangling references. A plain layout LFSCK run counts them; a run started
# with -c makes the files usable again.
1500 echo "The OST-object referenced by the MDT-object should be there;"
1501 echo "otherwise, the LFSCK should re-create the missing OST-object."
1504 check_mount_and_prep
1505 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1507 echo "Inject failure stub to simulate dangling referenced MDT-object"
1508 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1509 do_facet ost1 $LCTL set_param fail_loc=0x1610
1510 local count=$(precreated_ost_obj_count 0 0)
# Create 31 files beyond the precreated count plus 'guard' while the
# fail_loc is set.
1512 createmany -o $DIR/$tdir/f $((count + 31))
1513 touch $DIR/$tdir/guard
1514 do_facet ost1 $LCTL set_param fail_loc=0
1516 start_full_debug_logging
1518 # exhaust other pre-created dangling cases
1519 count=$(precreated_ost_obj_count 0 0)
1520 createmany -o $DIR/$tdir/a $count ||
1521 error "(0) Fail to create $count files."
1523 echo "'ls' should fail because of dangling referenced MDT-object"
1524 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1526 echo "Trigger layout LFSCK to find out dangling reference"
1527 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1529 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1530 mdd.${MDT_DEV}.lfsck_layout |
1531 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1533 error "(3) unexpected status"
# At least the 32 files created under the fail_loc must be counted.
1536 local repaired=$($SHOW_LAYOUT |
1537 awk '/^repaired_dangling/ { print $2 }')
1538 [ $repaired -ge 32 ] ||
1539 error "(4) Fail to repair dangling reference: $repaired"
1541 echo "'stat' should fail because of not repair dangling by default"
1542 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second run, this time with -c.
1544 echo "Trigger layout LFSCK to repair dangling reference"
1545 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1547 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1548 mdd.${MDT_DEV}.lfsck_layout |
1549 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1551 error "(7) unexpected status"
1554 # There may be some async LFSCK updates in processing, wait for
1555 # a while until the target repair has been done. LU-4970.
1557 echo "'stat' should success after layout LFSCK repairing"
# Poll until 'stat' on guard reports a Size of 0.
1558 wait_update_facet client "stat $DIR/$tdir/guard |
1559 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1560 stat $DIR/$tdir/guard
1562 error "(8) unexpected size"
1565 repaired=$($SHOW_LAYOUT |
1566 awk '/^repaired_dangling/ { print $2 }')
1567 [ $repaired -ge 32 ] ||
1568 error "(9) Fail to repair dangling reference: $repaired"
1570 stop_full_debug_logging
1572 run_test 14 "LFSCK can repair MDT-object with dangling reference"
# test_15a: write one file while fail_loc 0x1611 is set on ost1, then expect
# the layout LFSCK to report exactly one 'repaired_unmatched_pair'.
1576 echo "If the OST-object referenced by the MDT-object back points"
1577 echo "to some non-exist MDT-object, then the LFSCK should repair"
1578 echo "the OST-object to back point to the right MDT-object."
1581 check_mount_and_prep
1582 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1584 echo "Inject failure stub to make the OST-object to back point to"
1585 echo "non-exist MDT-object."
1586 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1588 do_facet ost1 $LCTL set_param fail_loc=0x1611
# Write 1 MiB and drop the osc locks before the fail_loc is cleared.
1589 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1590 cancel_lru_locks osc
1591 do_facet ost1 $LCTL set_param fail_loc=0
1593 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1594 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1596 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1597 mdd.${MDT_DEV}.lfsck_layout |
1598 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1600 error "(2) unexpected status"
# Only f0 was written under the fail_loc, so one repair is expected.
1603 local repaired=$($SHOW_LAYOUT |
1604 awk '/^repaired_unmatched_pair/ { print $2 }')
1605 [ $repaired -eq 1 ] ||
1606 error "(3) Fail to repair unmatched pair: $repaired"
1608 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# test_15b: like test 15a, but with fail_loc 0x1612 and a 'guard' file that
# is written before the fail_loc is set.
1612 echo "If the OST-object referenced by the MDT-object back points"
1613 echo "to other MDT-object that doesn't recognize the OST-object,"
1614 echo "then the LFSCK should repair it to back point to the right"
1615 echo "MDT-object (the first one)."
1618 check_mount_and_prep
1619 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# 'guard' is written with no fail_loc set.
1620 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1621 cancel_lru_locks osc
1623 echo "Inject failure stub to make the OST-object to back point to"
1624 echo "other MDT-object"
1626 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1627 do_facet ost1 $LCTL set_param fail_loc=0x1612
1628 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1629 cancel_lru_locks osc
1630 do_facet ost1 $LCTL set_param fail_loc=0
1632 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1633 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1635 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1636 mdd.${MDT_DEV}.lfsck_layout |
1637 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1639 error "(2) unexpected status"
# Only f0 was written under the fail_loc, so one repair is expected.
1642 local repaired=$($SHOW_LAYOUT |
1643 awk '/^repaired_unmatched_pair/ { print $2 }')
1644 [ $repaired -eq 1 ] ||
1645 error "(3) Fail to repair unmatched pair: $repaired"
1647 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# test_15c: run the layout LFSCK on all targets while a delayed directory
# migration is in flight, and check that it does not report any
# 'repaired_multiple_referenced'. Needs 2 MDSes; skipped from 2.7.55 on.
1650 [ $MDSCOUNT -lt 2 ] &&
1651 skip "We need at least 2 MDSes for this test" && return
1653 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1654 skip "Skip the test after 2.7.55 see LU-6437" && return
1657 echo "According to current metadata migration implementation,"
1658 echo "before the old MDT-object is removed, both the new MDT-object"
1659 echo "and old MDT-object will reference the same LOV layout. Then if"
1660 echo "the layout LFSCK finds the new MDT-object by race, it will"
1661 echo "regard related OST-object(s) as multiple referenced case, and"
1662 echo "will try to create new OST-object(s) for the new MDT-object."
1663 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1664 echo "MDT-object before confirm the multiple referenced case."
# Directory a1 is created with 'lfs mkdir -i 1' and holds one 1 MiB file.
1667 check_mount_and_prep
1668 $LFS mkdir -i 1 $DIR/$tdir/a1
1669 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1670 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1671 cancel_lru_locks osc
1673 echo "Inject failure stub on MDT1 to delay the migration"
1675 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1676 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1677 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
# The migration runs in the background so the LFSCK can start meanwhile.
1678 $LFS migrate -m 0 $DIR/$tdir/a1 &
1681 echo "Trigger layout LFSCK to race with the migration"
1682 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1684 for k in $(seq $MDSCOUNT); do
1685 # The LFSCK status query interval is 30 seconds. For the case
1686 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1687 # time to guarantee the status sync up.
1688 wait_update_facet mds${k} "$LCTL get_param -n \
1689 mdd.$(facet_svc mds${k}).lfsck_layout |
1690 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1691 error "(2) MDS${k} is not the expected 'completed'"
1694 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Expected counters: one unmatched pair, no multiple-referenced repair.
1695 local repaired=$($SHOW_LAYOUT |
1696 awk '/^repaired_unmatched_pair/ { print $2 }')
1697 [ $repaired -eq 1 ] ||
1698 error "(3) Fail to repair unmatched pair: $repaired"
1700 repaired=$($SHOW_LAYOUT |
1701 awk '/^repaired_multiple_referenced/ { print $2 }')
1702 [ $repaired -eq 0 ] ||
1703 error "(4) Unexpectedly repaird multiple references: $repaired"
1705 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# test_16: chown a file while fail_loc 0x1613 is set on the MDS, then expect
# the layout LFSCK to report exactly one 'repaired_inconsistent_owner'.
1709 echo "If the OST-object's owner information does not match the owner"
1710 echo "information stored in the MDT-object, then the LFSCK trust the"
1711 echo "MDT-object and update the OST-object's owner information."
1714 check_mount_and_prep
1715 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1716 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1717 cancel_lru_locks osc
1719 echo "Inject failure stub to skip OST-object owner changing"
1720 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1721 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
# The ownership change (to 1.1) is made while the fail_loc is set.
1722 chown 1.1 $DIR/$tdir/f0
1723 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1725 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1728 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1730 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1731 mdd.${MDT_DEV}.lfsck_layout |
1732 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1734 error "(2) unexpected status"
# Only f0 was chowned under the fail_loc, so one repair is expected.
1737 local repaired=$($SHOW_LAYOUT |
1738 awk '/^repaired_inconsistent_owner/ { print $2 }')
1739 [ $repaired -eq 1 ] ||
1740 error "(3) Fail to repair inconsistent owner: $repaired"
1742 run_test 17_placeholder_do_not_use
# test_17: with fail_loc 0x1614 set on the MDS, 'guard' and 'f0' end up
# sharing an OST-object; the layout LFSCK must report one
# 'repaired_multiple_referenced' and the files must become independent.
1746 echo "If more than one MDT-objects reference the same OST-object,"
1747 echo "and the OST-object only recognizes one MDT-object, then the"
1748 echo "LFSCK should create new OST-objects for such non-recognized"
1752 check_mount_and_prep
1753 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1755 echo "Inject failure stub to make two MDT-objects to refernce"
1756 echo "the OST-object"
1758 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1759 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# 'guard' gets 1 MiB of data; 'f0' is only created, never written.
1761 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1762 cancel_lru_locks osc
1764 createmany -o $DIR/$tdir/f 1
1766 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1768 cancel_lru_locks mdc
1769 cancel_lru_locks osc
1771 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
# Field 5 of 'ls -l' is the size; f0 showing guard's 1048576 bytes is the
# symptom of the shared object.
1772 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1773 [ $size -eq 1048576 ] ||
1774 error "(1) f0 (wrong) size should be 1048576, but got $size"
1776 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1779 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1781 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1782 mdd.${MDT_DEV}.lfsck_layout |
1783 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1785 error "(3) unexpected status"
1788 local repaired=$($SHOW_LAYOUT |
1789 awk '/^repaired_multiple_referenced/ { print $2 }')
1790 [ $repaired -eq 1 ] ||
1791 error "(4) Fail to repair multiple references: $repaired"
1793 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
# Writing 2 MiB to f0 must leave guard's size at 1 MiB.
1794 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1795 error "(5) Fail to write f0."
1796 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1797 [ $size -eq 1048576 ] ||
1798 error "(6) guard size should be 1048576, but got $size"
1800 run_test 17 "LFSCK can repair multiple references"
# Add 'cache' to the debug setting via lctl on the node running this script
# (no do_facet wrapper); the command's output is discarded.
1802 $LCTL set_param debug=+cache > /dev/null
# test_18a: files whose layout EA is lost (chown under fail_loc 0x1615) must
# get their stripes back from a layout LFSCK started with -o, so that the
# file sizes are correct again. A second file on mds2 is used when
# MDSCOUNT >= 2.
1806 echo "The target MDT-object is there, but related stripe information"
1807 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1808 echo "layout EA entries."
1811 check_mount_and_prep
1812 $LFS mkdir -i 0 $DIR/$tdir/a1
1813 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1814 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With 'ls -il' the inode number is field 1, so the size is field 6.
1816 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1818 $LFS path2fid $DIR/$tdir/a1/f1
1819 $LFS getstripe $DIR/$tdir/a1/f1
# Second file: directory made with 'mkdir -i 1', two stripes, same 2 MiB size.
1821 if [ $MDSCOUNT -ge 2 ]; then
1822 $LFS mkdir -i 1 $DIR/$tdir/a2
1823 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1824 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1825 $LFS path2fid $DIR/$tdir/a2/f2
1826 $LFS getstripe $DIR/$tdir/a2/f2
1829 cancel_lru_locks osc
1831 echo "Inject failure, to make the MDT-object lost its layout EA"
1832 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1833 do_facet mds1 $LCTL set_param fail_loc=0x1615
# The chown is issued while the fail_loc is set.
1834 chown 1.1 $DIR/$tdir/a1/f1
1836 if [ $MDSCOUNT -ge 2 ]; then
1837 do_facet mds2 $LCTL set_param fail_loc=0x1615
1838 chown 1.1 $DIR/$tdir/a2/f2
1844 do_facet mds1 $LCTL set_param fail_loc=0
1845 if [ $MDSCOUNT -ge 2 ]; then
1846 do_facet mds2 $LCTL set_param fail_loc=0
1849 cancel_lru_locks mdc
1850 cancel_lru_locks osc
1852 echo "The file size should be incorrect since layout EA is lost"
1853 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1854 [ "$cur_size" != "$saved_size" ] ||
1855 error "(1) Expect incorrect file1 size"
1857 if [ $MDSCOUNT -ge 2 ]; then
1858 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1859 [ "$cur_size" != "$saved_size" ] ||
1860 error "(2) Expect incorrect file2 size"
1863 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1864 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
1866 for k in $(seq $MDSCOUNT); do
1867 # The LFSCK status query interval is 30 seconds. For the case
1868 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1869 # time to guarantee the status sync up.
1870 wait_update_facet mds${k} "$LCTL get_param -n \
1871 mdd.$(facet_svc mds${k}).lfsck_layout |
1872 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1873 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling.
1876 for k in $(seq $OSTCOUNT); do
1877 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1878 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1879 awk '/^status/ { print $2 }')
1880 [ "$cur_status" == "completed" ] ||
1881 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan: 1 on mds1 (f1 has one stripe), 2 on mds2 (f2
# has two stripes).
1884 local repaired=$(do_facet mds1 $LCTL get_param -n \
1885 mdd.$(facet_svc mds1).lfsck_layout |
1886 awk '/^repaired_orphan/ { print $2 }')
1887 [ $repaired -eq 1 ] ||
1888 error "(6.1) Expect 1 fixed on mds1, but got: $repaired"
1890 if [ $MDSCOUNT -ge 2 ]; then
1891 repaired=$(do_facet mds2 $LCTL get_param -n \
1892 mdd.$(facet_svc mds2).lfsck_layout |
1893 awk '/^repaired_orphan/ { print $2 }')
1894 [ $repaired -eq 2 ] ||
1895 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
1898 $LFS path2fid $DIR/$tdir/a1/f1
1899 $LFS getstripe $DIR/$tdir/a1/f1
1901 if [ $MDSCOUNT -ge 2 ]; then
1902 $LFS path2fid $DIR/$tdir/a2/f2
1903 $LFS getstripe $DIR/$tdir/a2/f2
1906 echo "The file size should be correct after layout LFSCK scanning"
1907 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1908 [ "$cur_size" == "$saved_size" ] ||
1909 error "(7) Expect file1 size $saved_size, but got $cur_size"
1911 if [ $MDSCOUNT -ge 2 ]; then
1912 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1913 [ "$cur_size" == "$saved_size" ] ||
1914 error "(8) Expect file2 size $saved_size, but got $cur_size"
1917 run_test 18a "Find out orphan OST-object and repair it (1)"
# test_18b: files are removed while fail_loc 0x1616 is set on the MDS. A
# layout LFSCK started with -o must make them appear under
# .lustre/lost+found/MDTxxxx as <fid>-R-0, from where they are moved back
# and must show the original size. A second file on mds2 is used when
# MDSCOUNT >= 2.
1921 echo "The target MDT-object is lost. The LFSCK should re-create the"
1922 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
1923 echo "can move it back to normal namespace manually."
1926 check_mount_and_prep
1927 $LFS mkdir -i 0 $DIR/$tdir/a1
1928 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1929 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With 'ls -il' the inode number is field 1, so the size is field 6.
1930 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
# The FID is saved because it forms the lost+found entry name used below.
1931 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
1933 $LFS getstripe $DIR/$tdir/a1/f1
1935 if [ $MDSCOUNT -ge 2 ]; then
1936 $LFS mkdir -i 1 $DIR/$tdir/a2
1937 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1938 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
# NOTE(review): fid2 is not declared 'local' in the lines visible here.
1939 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
1941 $LFS getstripe $DIR/$tdir/a2/f2
1944 cancel_lru_locks osc
1946 echo "Inject failure, to simulate the case of missing the MDT-object"
1947 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1948 do_facet mds1 $LCTL set_param fail_loc=0x1616
# The removal is issued while the fail_loc is set.
1949 rm -f $DIR/$tdir/a1/f1
1951 if [ $MDSCOUNT -ge 2 ]; then
1952 do_facet mds2 $LCTL set_param fail_loc=0x1616
1953 rm -f $DIR/$tdir/a2/f2
1959 do_facet mds1 $LCTL set_param fail_loc=0
1960 if [ $MDSCOUNT -ge 2 ]; then
1961 do_facet mds2 $LCTL set_param fail_loc=0
1964 cancel_lru_locks mdc
1965 cancel_lru_locks osc
1967 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1968 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1970 for k in $(seq $MDSCOUNT); do
1971 # The LFSCK status query interval is 30 seconds. For the case
1972 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1973 # time to guarantee the status sync up.
1974 wait_update_facet mds${k} "$LCTL get_param -n \
1975 mdd.$(facet_svc mds${k}).lfsck_layout |
1976 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1977 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling.
1980 for k in $(seq $OSTCOUNT); do
1981 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1982 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1983 awk '/^status/ { print $2 }')
1984 [ "$cur_status" == "completed" ] ||
1985 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan: 1 on mds1 (f1 has one stripe), 2 on mds2 (f2
# has two stripes).
1988 local repaired=$(do_facet mds1 $LCTL get_param -n \
1989 mdd.$(facet_svc mds1).lfsck_layout |
1990 awk '/^repaired_orphan/ { print $2 }')
1991 [ $repaired -eq 1 ] ||
1992 error "(4.1) Expect 1 fixed on mds1, but got: $repaired"
1994 if [ $MDSCOUNT -ge 2 ]; then
1995 repaired=$(do_facet mds2 $LCTL get_param -n \
1996 mdd.$(facet_svc mds2).lfsck_layout |
1997 awk '/^repaired_orphan/ { print $2 }')
1998 [ $repaired -eq 2 ] ||
1999 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
2002 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2003 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2004 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2006 if [ $MDSCOUNT -ge 2 ]; then
2007 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2008 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
2011 $LFS path2fid $DIR/$tdir/a1/f1
2012 $LFS getstripe $DIR/$tdir/a1/f1
2014 if [ $MDSCOUNT -ge 2 ]; then
2015 $LFS path2fid $DIR/$tdir/a2/f2
2016 $LFS getstripe $DIR/$tdir/a2/f2
2019 echo "The file size should be correct after layout LFSCK scanning"
2020 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2021 [ "$cur_size" == "$saved_size" ] ||
2022 error "(7) Expect file1 size $saved_size, but got $cur_size"
2024 if [ $MDSCOUNT -ge 2 ]; then
2025 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2026 [ "$cur_size" == "$saved_size" ] ||
2027 error "(8) Expect file2 size $saved_size, but got $cur_size"
2030 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test 18c (registered by the run_test line that ends this span).
# Scenario, as exercised by the statements below: files are created while
# fail_loc 0x1617 is set on ost1, then removed while fail_loc 0x1616 is set
# on the MDS(es). A layout LFSCK started with "-r -o" is then expected to
# report repaired orphans on mds1 and leave "*-N-*" stubs under
# .lustre/lost+found/MDT0000 (and none under MDT0001).
2034 echo "The target MDT-object is lost, and the OST-object FID is missing."
2035 echo "The LFSCK should re-create the MDT-object with new FID under the "
2036 echo "directory .lustre/lost+found/MDTxxxx."
2039 check_mount_and_prep
2040 $LFS mkdir -i 0 $DIR/$tdir/a1
2041 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2043 echo "Inject failure, to simulate the case of missing parent FID"
2044 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2045 do_facet ost1 $LCTL set_param fail_loc=0x1617
# Files written while the fail_loc above is active on ost1.
2047 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2048 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDSes, repeat the setup in a directory created with -i 1.
2050 if [ $MDSCOUNT -ge 2 ]; then
2051 $LFS mkdir -i 1 $DIR/$tdir/a2
2052 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a2
2053 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2054 $LFS getstripe $DIR/$tdir/a2/f2
2057 cancel_lru_locks osc
2059 echo "Inject failure, to simulate the case of missing the MDT-object"
2060 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2061 do_facet mds1 $LCTL set_param fail_loc=0x1616
2062 rm -f $DIR/$tdir/a1/f1
2064 if [ $MDSCOUNT -ge 2 ]; then
2065 do_facet mds2 $LCTL set_param fail_loc=0x1616
2066 rm -f $DIR/$tdir/a2/f2
# Clear the injected failures before starting the scan.
2072 do_facet mds1 $LCTL set_param fail_loc=0
2073 if [ $MDSCOUNT -ge 2 ]; then
2074 do_facet mds2 $LCTL set_param fail_loc=0
2077 cancel_lru_locks mdc
2078 cancel_lru_locks osc
2080 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2081 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2083 for k in $(seq $MDSCOUNT); do
2084 # The LFSCK status query interval is 30 seconds. For the case
2085 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2086 # time to guarantee the status sync up.
2087 wait_update_facet mds${k} "$LCTL get_param -n \
2088 mdd.$(facet_svc mds${k}).lfsck_layout |
2089 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2090 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must also report status 'completed' (checked once, no waiting).
2093 for k in $(seq $OSTCOUNT); do
2094 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2095 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2096 awk '/^status/ { print $2 }')
2097 [ "$cur_status" == "completed" ] ||
2098 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): $expected is compared below but its assignment is not among
# the lines visible here; presumably it is set inside this MDSCOUNT branch.
# Confirm against the full file.
2101 if [ $MDSCOUNT -ge 2 ]; then
2107 local repaired=$(do_facet mds1 $LCTL get_param -n \
2108 mdd.$(facet_svc mds1).lfsck_layout |
2109 awk '/^repaired_orphan/ { print $2 }')
2110 [ $repaired -eq $expected ] ||
2111 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# mds2 is expected to report zero repaired orphans.
2113 if [ $MDSCOUNT -ge 2 ]; then
2114 repaired=$(do_facet mds2 $LCTL get_param -n \
2115 mdd.$(facet_svc mds2).lfsck_layout |
2116 awk '/^repaired_orphan/ { print $2 }')
2117 [ $repaired -eq 0 ] ||
2118 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2121 ls -ail $MOUNT/.lustre/lost+found/
2123 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2124 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
# NOTE(review): the -name pattern is unquoted, so the shell may expand it
# against the current directory before find receives it; consider quoting.
2125 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name *-N-*)
2127 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2130 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2131 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2132 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# NOTE(review): same unquoted -name pattern as above.
2134 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-N-*)
2135 [ ! -z "$cname" ] ||
2136 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2138 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d (registered by the run_test line that ends this span).
# Flow visible below: create f1 and f2, record f2's size, set fail_loc
# 0x1618 on $SINGLEMDS while chown-ing f2 and removing f1, restart via
# setupall, confirm f2's size no longer matches, then run layout LFSCK
# with "-r -o -c" and verify one repaired orphan and that f2's size is
# back to the recorded value.
2142 echo "The target MDT-object layout EA slot is occpuied by some new"
2143 echo "created OST-object when repair dangling reference case. Such"
2144 echo "conflict OST-object has never been modified. Then when found"
2145 echo "the orphan OST-object, LFSCK will replace it with the orphan"
2149 check_mount_and_prep
2151 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2152 echo "guard" > $DIR/$tdir/a1/f1
2153 echo "foo" > $DIR/$tdir/a1/f2
# Field 6 of 'ls -il' output is taken as the file size; compared again later.
2154 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2155 $LFS path2fid $DIR/$tdir/a1/f1
2156 $LFS getstripe $DIR/$tdir/a1/f1
2157 $LFS path2fid $DIR/$tdir/a1/f2
2158 $LFS getstripe $DIR/$tdir/a1/f2
2159 cancel_lru_locks osc
2161 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2162 echo "to reference the same OST-object (which is f1's OST-obejct)."
2163 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2164 echo "dangling reference case, but f2's old OST-object is there."
2167 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2168 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2169 chown 1.1 $DIR/$tdir/a1/f2
2170 rm -f $DIR/$tdir/a1/f1
2173 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2175 echo "stopall to cleanup object cache"
2178 setupall > /dev/null
2180 echo "The file size should be incorrect since dangling referenced"
2181 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2182 [ "$cur_size" != "$saved_size" ] ||
2183 error "(1) Expect incorrect file2 size"
2185 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2186 do_facet $SINGLEMDS $LCTL set_param fail_val=5 fail_loc=0x1602
2188 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2189 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait until mds1 reports 'scanning-phase2' before clearing the delay.
2191 wait_update_facet mds1 "$LCTL get_param -n \
2192 mdd.$(facet_svc mds1).lfsck_layout |
2193 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2194 error "(3.0) MDS1 is not the expected 'scanning-phase2'"
2196 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2198 for k in $(seq $MDSCOUNT); do
2199 # The LFSCK status query interval is 30 seconds. For the case
2200 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2201 # time to guarantee the status sync up.
2202 wait_update_facet mds${k} "$LCTL get_param -n \
2203 mdd.$(facet_svc mds${k}).lfsck_layout |
2204 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2205 error "(3) MDS${k} is not the expected 'completed'"
# Every OST must also report status 'completed' (checked once, no waiting).
2208 for k in $(seq $OSTCOUNT); do
2209 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2210 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2211 awk '/^status/ { print $2 }')
2212 [ "$cur_status" == "completed" ] ||
2213 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
2216 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2217 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2218 awk '/^repaired_orphan/ { print $2 }')
2219 [ $repaired -eq 1 ] ||
2220 error "(5) Expect 1 orphan has been fixed, but got: $repaired"
2222 echo "The file size should be correct after layout LFSCK scanning"
2223 cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2224 [ "$cur_size" == "$saved_size" ] ||
2225 error "(6) Expect file2 size $saved_size, but got $cur_size"
2227 echo "The LFSCK should find back the original data."
2228 cat $DIR/$tdir/a1/f2
2229 $LFS path2fid $DIR/$tdir/a1/f2
2230 $LFS getstripe $DIR/$tdir/a1/f2
2232 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e (registered by the run_test line that ends this span).
# Same setup as the preceding test up to the LFSCK start, but here "dummy"
# is appended to f2 while mds1 reports 'scanning-phase2'. Afterwards the
# test expects one repaired orphan and a "*-C-*" stub under
# .lustre/lost+found/MDT0000 whose size equals f2's originally recorded size.
2236 echo "The target MDT-object layout EA slot is occpuied by some new"
2237 echo "created OST-object when repair dangling reference case. Such"
2238 echo "conflict OST-object has been modified by others. To keep the"
2239 echo "new data, the LFSCK will create a new file to refernece this"
2240 echo "old orphan OST-object."
2243 check_mount_and_prep
2245 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2246 echo "guard" > $DIR/$tdir/a1/f1
2247 echo "foo" > $DIR/$tdir/a1/f2
# Field 6 of 'ls -il' output is taken as the file size; compared again later.
2248 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2249 $LFS path2fid $DIR/$tdir/a1/f1
2250 $LFS getstripe $DIR/$tdir/a1/f1
2251 $LFS path2fid $DIR/$tdir/a1/f2
2252 $LFS getstripe $DIR/$tdir/a1/f2
2253 cancel_lru_locks osc
2255 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2256 echo "to reference the same OST-object (which is f1's OST-obejct)."
2257 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2258 echo "dangling reference case, but f2's old OST-object is there."
2261 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2262 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2263 chown 1.1 $DIR/$tdir/a1/f2
2264 rm -f $DIR/$tdir/a1/f1
2267 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2269 echo "stopall to cleanup object cache"
2272 setupall > /dev/null
2274 echo "The file size should be incorrect since dangling referenced"
2275 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2276 [ "$cur_size" != "$saved_size" ] ||
2277 error "(1) Expect incorrect file2 size"
2279 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2280 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2282 start_full_debug_logging
2284 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2285 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait until mds1 reports 'scanning-phase2' before modifying f2.
2287 wait_update_facet mds1 "$LCTL get_param -n \
2288 mdd.$(facet_svc mds1).lfsck_layout |
2289 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2290 error "(3) MDS1 is not the expected 'scanning-phase2'"
2292 # to guarantee all updates are synced.
2296 echo "Write new data to f2 to modify the new created OST-object."
2297 echo "dummy" >> $DIR/$tdir/a1/f2
2299 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2301 for k in $(seq $MDSCOUNT); do
2302 # The LFSCK status query interval is 30 seconds. For the case
2303 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2304 # time to guarantee the status sync up.
2305 wait_update_facet mds${k} "$LCTL get_param -n \
2306 mdd.$(facet_svc mds${k}).lfsck_layout |
2307 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2308 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must also report status 'completed' (checked once, no waiting).
2311 for k in $(seq $OSTCOUNT); do
2312 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2313 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2314 awk '/^status/ { print $2 }')
2315 [ "$cur_status" == "completed" ] ||
2316 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2319 stop_full_debug_logging
2321 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2322 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2323 awk '/^repaired_orphan/ { print $2 }')
2324 [ $repaired -eq 1 ] ||
2325 error "(6) Expect 1 orphan has been fixed, but got: $repaired"
2327 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2328 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2329 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# NOTE(review): the -name pattern is unquoted, so the shell may expand it
# against the current directory before find receives it; consider quoting.
2331 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-C-*)
2332 [ ! -z "$cname" ] ||
2333 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2335 echo "The stub file should keep the original f2 data"
2336 cur_size=$(ls -il $cname | awk '{ print $6 }')
2337 [ "$cur_size" == "$saved_size" ] ||
2338 error "(9) Expect file2 size $saved_size, but got $cur_size"
2341 $LFS path2fid $cname
2342 $LFS getstripe $cname
2344 echo "The f2 should contains new data."
2345 cat $DIR/$tdir/a1/f2
2346 $LFS path2fid $DIR/$tdir/a1/f2
2347 $LFS getstripe $DIR/$tdir/a1/f2
2349 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f (registered by the run_test line that ends this span).
# Requires at least 2 OSTs. Files are removed while fail_loc 0x1616 is set
# on the MDS(es). The first layout LFSCK runs with fail_loc 0x161c on mds1
# and is expected to end 'partial' on the MDSes and ost1 but 'completed' on
# ost2, with 1 repaired orphan per MDS. A second run without the failure is
# expected to end 'completed' everywhere with 2 repaired orphans per MDS.
2352 [ $OSTCOUNT -lt 2 ] &&
2353 skip "The test needs at least 2 OSTs" && return
2356 echo "The target MDT-object is lost. The LFSCK should re-create the"
2357 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2358 echo "to verify some OST-object(s) during the first stage-scanning,"
2359 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2360 echo "should not be affected."
2363 check_mount_and_prep
# a1 uses a stripe count of 1 and a2 a stripe count of 2, both starting at
# OST index 0.
2364 $LFS mkdir -i 0 $DIR/$tdir/a1
2365 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2366 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2367 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2368 $LFS mkdir -i 0 $DIR/$tdir/a2
2369 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2370 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2371 $LFS getstripe $DIR/$tdir/a1/f1
2372 $LFS getstripe $DIR/$tdir/a2/f2
# With two or more MDSes, mirror the same layout in directories created
# with -i 1.
2374 if [ $MDSCOUNT -ge 2 ]; then
2375 $LFS mkdir -i 1 $DIR/$tdir/a3
2376 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a3
2377 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2378 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2379 $LFS mkdir -i 1 $DIR/$tdir/a4
2380 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2381 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2382 $LFS getstripe $DIR/$tdir/a3/f3
2383 $LFS getstripe $DIR/$tdir/a4/f4
2386 cancel_lru_locks osc
2388 echo "Inject failure, to simulate the case of missing the MDT-object"
2389 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2390 do_facet mds1 $LCTL set_param fail_loc=0x1616
2391 rm -f $DIR/$tdir/a1/f1
2392 rm -f $DIR/$tdir/a2/f2
2394 if [ $MDSCOUNT -ge 2 ]; then
2395 do_facet mds2 $LCTL set_param fail_loc=0x1616
2396 rm -f $DIR/$tdir/a3/f3
2397 rm -f $DIR/$tdir/a4/f4
# Clear the injected failures before starting the scan.
2403 do_facet mds1 $LCTL set_param fail_loc=0
2404 if [ $MDSCOUNT -ge 2 ]; then
2405 do_facet mds2 $LCTL set_param fail_loc=0
2408 cancel_lru_locks mdc
2409 cancel_lru_locks osc
2411 echo "Inject failure, to simulate the OST0 fail to handle"
2412 echo "MDT0 LFSCK request during the first-stage scanning."
2413 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2414 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2416 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2417 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2419 for k in $(seq $MDSCOUNT); do
2420 # The LFSCK status query interval is 30 seconds. For the case
2421 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2422 # time to guarantee the status sync up.
2423 wait_update_facet mds${k} "$LCTL get_param -n \
2424 mdd.$(facet_svc mds${k}).lfsck_layout |
2425 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2426 error "(2) MDS${k} is not the expected 'partial'"
# ost1 is expected to end 'partial'; ost2 is expected to end 'completed'.
2429 wait_update_facet ost1 "$LCTL get_param -n \
2430 obdfilter.$(facet_svc ost1).lfsck_layout |
2431 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2432 error "(3) OST1 is not the expected 'partial'"
2435 wait_update_facet ost2 "$LCTL get_param -n \
2436 obdfilter.$(facet_svc ost2).lfsck_layout |
2437 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2438 error "(4) OST2 is not the expected 'completed'"
2441 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the first (partial) run each MDS should report 1 repaired orphan.
2443 local repaired=$(do_facet mds1 $LCTL get_param -n \
2444 mdd.$(facet_svc mds1).lfsck_layout |
2445 awk '/^repaired_orphan/ { print $2 }')
2446 [ $repaired -eq 1 ] ||
2447 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2449 if [ $MDSCOUNT -ge 2 ]; then
2450 repaired=$(do_facet mds2 $LCTL get_param -n \
2451 mdd.$(facet_svc mds2).lfsck_layout |
2452 awk '/^repaired_orphan/ { print $2 }')
2453 [ $repaired -eq 1 ] ||
2454 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
2457 echo "Trigger layout LFSCK on all devices again to cleanup"
2458 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2460 for k in $(seq $MDSCOUNT); do
2461 # The LFSCK status query interval is 30 seconds. For the case
2462 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2463 # time to guarantee the status sync up.
2464 wait_update_facet mds${k} "$LCTL get_param -n \
2465 mdd.$(facet_svc mds${k}).lfsck_layout |
2466 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2467 error "(8) MDS${k} is not the expected 'completed'"
# NOTE(review): cur_status is assigned here without a 'local' declaration
# in the lines visible in this span; confirm it is declared elsewhere.
2470 for k in $(seq $OSTCOUNT); do
2471 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2472 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2473 awk '/^status/ { print $2 }')
2474 [ "$cur_status" == "completed" ] ||
2475 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# After the second run each MDS should report 2 repaired orphans.
2479 local repaired=$(do_facet mds1 $LCTL get_param -n \
2480 mdd.$(facet_svc mds1).lfsck_layout |
2481 awk '/^repaired_orphan/ { print $2 }')
2482 [ $repaired -eq 2 ] ||
2483 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2485 if [ $MDSCOUNT -ge 2 ]; then
2486 repaired=$(do_facet mds2 $LCTL get_param -n \
2487 mdd.$(facet_svc mds2).lfsck_layout |
2488 awk '/^repaired_orphan/ { print $2 }')
2489 [ $repaired -eq 2 ] ||
2490 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2493 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Top-level statement executed between tests; output is discarded.
# NOTE(review): presumably removes the 'cache' flag from the local debug
# mask -- confirm against the lctl documentation.
2495 $LCTL set_param debug=-cache > /dev/null
# Test 19a (registered by the run_test line that ends this span).
# Enables lfsck_verify_pfid on OST0000, sets fail_loc 0x1619 on the local
# node, and expects reading a0 to fail.
2498 check_mount_and_prep
2499 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2501 echo "foo" > $DIR/$tdir/a0
2502 echo "guard" > $DIR/$tdir/a1
2503 cancel_lru_locks osc
2505 echo "Inject failure, then client will offer wrong parent FID when read"
2506 do_facet ost1 $LCTL set_param -n \
2507 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2508 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2509 $LCTL set_param fail_loc=0x1619
2511 echo "Read RPC with wrong parent FID should be denied"
# A successful cat is the failure case for this test.
2512 cat $DIR/$tdir/a0 && error "(3) Read should be denied!"
2513 $LCTL set_param fail_loc=0
2515 run_test 19a "OST-object inconsistency self detect"
# Test 19b (registered by the run_test line that ends this span).
# Writes f0 while fail_loc 0x1611 is set on ost1, checks that the
# 'repaired' counter read from lfsck_verify_pfid is 0, then enables
# lfsck_verify_pfid, reads f0, and expects the counter to become 1.
2518 check_mount_and_prep
2519 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2521 echo "Inject failure stub to make the OST-object to back point to"
2522 echo "non-exist MDT-object"
2524 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2525 do_facet ost1 $LCTL set_param fail_loc=0x1611
2526 echo "foo" > $DIR/$tdir/f0
2527 cancel_lru_locks osc
2528 do_facet ost1 $LCTL set_param fail_loc=0
2530 echo "Nothing should be fixed since self detect and repair is disabled"
2531 local repaired=$(do_facet ost1 $LCTL get_param -n \
2532 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2533 awk '/^repaired/ { print $2 }')
2534 [ $repaired -eq 0 ] ||
2535 error "(1) Expected 0 repaired, but got $repaired"
2537 echo "Read RPC with right parent FID should be accepted,"
2538 echo "and cause parent FID on OST to be fixed"
2540 do_facet ost1 $LCTL set_param -n \
2541 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
# The read below is what is expected to bump the 'repaired' counter.
2542 cat $DIR/$tdir/f0 || error "(2) Read should not be denied!"
2544 repaired=$(do_facet ost1 $LCTL get_param -n \
2545 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2546 awk '/^repaired/ { print $2 }')
2547 [ $repaired -eq 1 ] ||
2548 error "(3) Expected 1 repaired, but got $repaired"
2550 run_test 19b "OST-object inconsistency self repair"
# Test 20 (registered by the run_test line that ends this span).
# Requires at least 2 OSTs. Files f0..f2 (plus f3 when OSTCOUNT > 2) are
# created and then removed under fail_loc 0x1616 / 0x161a with different
# fail_val settings. After a layout LFSCK run with "-r -o", each
# ${fidN}-R-0 entry under .lustre/lost+found/MDT0000 is checked for its
# pattern flag (LOV_PATTERN_F_HOLE), stripe count, size, and read/write/
# chown/touch/unlink behaviour.
# NOTE(review): $bcount is used throughout but its assignment is not among
# the lines visible in this span; confirm against the full file.
2553 [ $OSTCOUNT -lt 2 ] &&
2554 skip "The test needs at least 2 OSTs" && return
2557 echo "The target MDT-object and some of its OST-object are lost."
2558 echo "The LFSCK should find out the left OST-objects and re-create"
2559 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2560 echo "with the partial OST-objects (LOV EA hole)."
2562 echo "New client can access the file with LOV EA hole via normal"
2563 echo "system tools or commands without crash the system."
2565 echo "For old client, even though it cannot access the file with"
2566 echo "LOV EA hole, it should not cause the system crash."
2569 check_mount_and_prep
2570 $LFS mkdir -i 0 $DIR/$tdir/a1
# Stripe count is 3 when more than 2 OSTs are available, otherwise 2.
2571 if [ $OSTCOUNT -gt 2 ]; then
2572 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
2575 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
2579 # 256 blocks on the stripe0.
2580 # 1 block on the stripe1 for 2 OSTs case.
2581 # 256 blocks on the stripe1 for other cases.
2582 # 1 block on the stripe2 if OSTs > 2
2583 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2584 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2585 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# FIDs are recorded now because the lost+found entries checked later are
# named ${fidN}-R-0.
2587 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2588 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2589 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2592 $LFS getstripe $DIR/$tdir/a1/f0
2594 $LFS getstripe $DIR/$tdir/a1/f1
2596 $LFS getstripe $DIR/$tdir/a1/f2
2598 if [ $OSTCOUNT -gt 2 ]; then
2599 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
2600 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2602 $LFS getstripe $DIR/$tdir/a1/f3
2605 cancel_lru_locks osc
2607 echo "Inject failure..."
2608 echo "To simulate f0 lost MDT-object"
2609 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2610 do_facet mds1 $LCTL set_param fail_loc=0x1616
2611 rm -f $DIR/$tdir/a1/f0
2613 echo "To simulate f1 lost MDT-object and OST-object0"
2614 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
2615 do_facet mds1 $LCTL set_param fail_loc=0x161a
2616 rm -f $DIR/$tdir/a1/f1
2618 echo "To simulate f2 lost MDT-object and OST-object1"
2619 do_facet mds1 $LCTL set_param fail_val=1
2620 rm -f $DIR/$tdir/a1/f2
2622 if [ $OSTCOUNT -gt 2 ]; then
2623 echo "To simulate f3 lost MDT-object and OST-object2"
2624 do_facet mds1 $LCTL set_param fail_val=2
2625 rm -f $DIR/$tdir/a1/f3
# The client is unmounted here and re-mounted after the LFSCK checks below.
2628 umount_client $MOUNT
2631 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2633 echo "Inject failure to slow down the LFSCK on OST0"
2634 #define OBD_FAIL_LFSCK_DELAY5 0x161b
2635 do_facet ost1 $LCTL set_param fail_loc=0x161b
2637 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2638 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2641 do_facet ost1 $LCTL set_param fail_loc=0
2643 for k in $(seq $MDSCOUNT); do
2644 # The LFSCK status query interval is 30 seconds. For the case
2645 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2646 # time to guarantee the status sync up.
2647 wait_update_facet mds${k} "$LCTL get_param -n \
2648 mdd.$(facet_svc mds${k}).lfsck_layout |
2649 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2650 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must also report status 'completed' (checked once, no waiting).
2653 for k in $(seq $OSTCOUNT); do
2654 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2655 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2656 awk '/^status/ { print $2 }')
2657 [ "$cur_status" == "completed" ] ||
2658 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan on mds1: 9 with more than 2 OSTs, otherwise 4.
2661 local repaired=$(do_facet mds1 $LCTL get_param -n \
2662 mdd.$(facet_svc mds1).lfsck_layout |
2663 awk '/^repaired_orphan/ { print $2 }')
2664 if [ $OSTCOUNT -gt 2 ]; then
2665 [ $repaired -eq 9 ] ||
2666 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
2668 [ $repaired -eq 4 ] ||
2669 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
2672 mount_client $MOUNT || error "(5.0) Fail to start client!"
# Bit mask tested against the value printed by 'lfs getstripe -L' below.
2674 LOV_PATTERN_F_HOLE=0x40000000
2677 # ${fid0}-R-0 is the old f0
2679 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
2680 echo "Check $name, which is the old f0"
2682 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
# The getstripe -L output is prefixed with 0x so the arithmetic test treats
# it as hexadecimal.
2684 local pattern=0x$($LFS getstripe -L $name)
2685 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2686 error "(5.2) NOT expect pattern flag hole, but got $pattern"
2688 local stripes=$($LFS getstripe -c $name)
2689 if [ $OSTCOUNT -gt 2 ]; then
2690 [ $stripes -eq 3 ] ||
2691 error "(5.3.1) expect the stripe count is 3, but got $stripes"
2693 [ $stripes -eq 2 ] ||
2694 error "(5.3.2) expect the stripe count is 2, but got $stripes"
2697 local size=$(stat $name | awk '/Size:/ { print $2 }')
2698 [ $size -eq $((4096 * $bcount)) ] ||
2699 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
2701 cat $name > /dev/null || error "(5.5) cannot read $name"
2703 echo "dummy" >> $name || error "(5.6) cannot write $name"
2705 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
2707 touch $name || error "(5.8) cannot touch $name"
2709 rm -f $name || error "(5.9) cannot unlink $name"
2712 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
2714 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2715 if [ $OSTCOUNT -gt 2 ]; then
2716 echo "Check $name, it contains the old f1's stripe1 and stripe2"
2718 echo "Check $name, it contains the old f1's stripe1"
2721 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
2723 pattern=0x$($LFS getstripe -L $name)
2724 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2725 error "(6.2) expect pattern flag hole, but got $pattern"
2727 stripes=$($LFS getstripe -c $name)
2728 if [ $OSTCOUNT -gt 2 ]; then
2729 [ $stripes -eq 3 ] ||
2730 error "(6.3.1) expect the stripe count is 3, but got $stripes"
2732 [ $stripes -eq 2 ] ||
2733 error "(6.3.2) expect the stripe count is 2, but got $stripes"
2736 size=$(stat $name | awk '/Size:/ { print $2 }')
2737 [ $size -eq $((4096 * $bcount)) ] ||
2738 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
# A plain read is expected to fail for this file.
2740 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# Count the "Input/output error" lines dd prints while copying with
# conv=sync,noerror; 256 such failures are expected.
2742 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2743 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2746 [ $failures -eq 256 ] ||
2747 error "(6.6) expect 256 IO failures, but get $failures"
2749 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2750 [ $size -eq $((4096 * $bcount)) ] ||
2751 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Writing at offset 0 must fail; writing at block 300 must succeed.
2753 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
2754 error "(6.8) write to the LOV EA hole should fail"
2756 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
2757 error "(6.9) write to normal stripe should NOT fail"
2759 echo "foo" >> $name && error "(6.10) append write $name should fail"
2761 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
2763 touch $name || error "(6.12) cannot touch $name"
2765 rm -f $name || error "(6.13) cannot unlink $name"
2768 # ${fid2}-R-0 contains the old f2's stripe0 (and stripe2 if OSTs > 2)
2770 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
2771 if [ $OSTCOUNT -gt 2 ]; then
2772 echo "Check $name, it contains the old f2's stripe0 and stripe2"
2774 echo "Check $name, it contains the old f2's stripe0"
2777 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
2779 pattern=0x$($LFS getstripe -L $name)
2780 stripes=$($LFS getstripe -c $name)
2781 size=$(stat $name | awk '/Size:/ { print $2 }')
# With more than 2 OSTs the file is expected to carry the hole flag; the
# checks numbered *.2 further down expect no hole flag and a stripe count
# of 1.
2782 if [ $OSTCOUNT -gt 2 ]; then
2783 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2784 error "(7.2.1) expect pattern flag hole, but got $pattern"
2786 [ $stripes -eq 3 ] ||
2787 error "(7.3.1) expect the stripe count is 3, but got $stripes"
2789 [ $size -eq $((4096 * $bcount)) ] ||
2790 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
2792 cat $name > /dev/null &&
2793 error "(7.5.1) normal read $name should fail"
2795 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2796 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2798 [ $failures -eq 256 ] ||
2799 error "(7.6) expect 256 IO failures, but get $failures"
2801 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2802 [ $size -eq $((4096 * $bcount)) ] ||
2803 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
# Here the roles are reversed relative to the f1 checks: the write at
# block 300 must fail and the write at offset 0 must succeed.
2805 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
2806 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
2808 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
2809 error "(7.8.1) write to normal stripe should NOT fail"
2811 echo "foo" >> $name &&
2812 error "(7.8.3) append write $name should fail"
2814 chown $RUNAS_ID:$RUNAS_GID $name ||
2815 error "(7.9.1) cannot chown on $name"
2817 touch $name || error "(7.10.1) cannot touch $name"
2819 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2820 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
2822 [ $stripes -eq 1 ] ||
2823 error "(7.3.2) expect the stripe count is 1, but got $stripes"
2826 [ $size -eq $((4096 * (256 + 0))) ] ||
2827 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
2829 cat $name > /dev/null || error "(7.5.2) cannot read $name"
2831 echo "dummy" >> $name || error "(7.8.2) cannot write $name"
2833 chown $RUNAS_ID:$RUNAS_GID $name ||
2834 error "(7.9.2) cannot chown on $name"
2836 touch $name || error "(7.10.2) cannot touch $name"
2839 rm -f $name || error "(7.11) cannot unlink $name"
# The remaining f3 checks only apply when more than 2 OSTs are in use.
2841 [ $OSTCOUNT -le 2 ] && return
2844 # ${fid3}-R-0 should contain the old f3's stripe0 and stripe1
2846 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2847 echo "Check $name, which contains the old f3's stripe0 and stripe1"
2849 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
2851 pattern=0x$($LFS getstripe -L $name)
2852 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2853 error "(8.2) NOT expect pattern flag hole, but got $pattern"
2855 stripes=$($LFS getstripe -c $name)
2856 # LFSCK does not know the old f3 had 3 stripes.
2857 # It only tries to find as much as possible.
2858 # The stripe count depends on the last stripe's offset.
2859 [ $stripes -eq 2 ] ||
2860 error "(8.3) expect the stripe count is 2, but got $stripes"
2862 size=$(stat $name | awk '/Size:/ { print $2 }')
2864 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
2865 error "(8.4) expect the size $((4096 * 512)), but got $size"
2867 cat $name > /dev/null || error "(8.5) cannot read $name"
2869 echo "dummy" >> $name || error "(8.6) cannot write $name"
2871 chown $RUNAS_ID:$RUNAS_GID $name ||
2872 error "(8.7) cannot chown on $name"
2874 touch $name || error "(8.8) cannot touch $name"
2876 rm -f $name || error "(8.9) cannot unlink $name"
2878 run_test 20 "Handle the orphan with dummy LOV EA slot properly"
# Test 21 (registered by the run_test line that ends this span).
# Skipped when the MDS version is below 2.5.59. Starts LFSCK on MDT0000
# without a "-t" type argument (with "-s 1 -r"), expects both the namespace
# and layout components to report 'scanning-phase1', then stops LFSCK.
2881 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
2882 skip "ignore the test if MDS is older than 2.5.59" && return
2884 check_mount_and_prep
2885 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
2887 echo "Start all LFSCK components by default (-s 1)"
2888 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
2889 error "Fail to start LFSCK"
2891 echo "namespace LFSCK should be in 'scanning-phase1' status"
# $SHOW_NAMESPACE and $SHOW_LAYOUT are defined outside this span.
2892 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
2893 [ "$STATUS" == "scanning-phase1" ] ||
2894 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
2896 echo "layout LFSCK should be in 'scanning-phase1' status"
2897 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
2898 [ "$STATUS" == "scanning-phase1" ] ||
2899 error "Expect layout 'scanning-phase1', but got '$STATUS'"
2901 echo "Stop all LFSCK components by default"
2902 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
2903 error "Fail to stop LFSCK"
2905 run_test 21 "run all LFSCK components by default"
# Test 22a (registered by the run_test line that ends this span).
# Requires at least 2 MDSes. Creates 'guard' and 'foo' with -i 1, creates
# foo/dummy with -i 0 while fail_loc 0x161e is set on $SINGLEMDS, removes
# 'guard', then runs namespace LFSCK with "-A -r" and expects exactly one
# unmatched pair to be repaired and 'ls' on foo/dummy to succeed.
2908 [ $MDSCOUNT -lt 2 ] &&
2909 skip "We need at least 2 MDSes for this test" && return
2912 echo "The parent_A references the child directory via some name entry,"
2913 echo "but the child directory back references another parent_B via its"
2914 echo "".." name entry. The parent_B does not exist. Then the namespace"
2915 echo "LFSCK will repair the child directory's ".." name entry."
2918 check_mount_and_prep
2920 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2921 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2923 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2924 echo "The dummy's dotdot name entry references the guard."
2925 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2926 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2927 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2928 error "(3) Fail to mkdir on MDT0"
2929 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing 'guard' leaves no directory for the bad dotdot entry to point at.
2931 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
2933 echo "Trigger namespace LFSCK to repair unmatched pairs"
2934 $START_NAMESPACE -A -r ||
2935 error "(5) Fail to start LFSCK for namespace"
# Wait up to 32 (the last wait_update_facet argument) for 'completed'.
2937 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2938 mdd.${MDT_DEV}.lfsck_namespace |
2939 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2941 error "(6) unexpected status"
2944 local repaired=$($SHOW_NAMESPACE |
2945 awk '/^unmatched_pairs_repaired/ { print $2 }')
2946 [ $repaired -eq 1 ] ||
2947 error "(7) Fail to repair unmatched pairs: $repaired"
2949 echo "'ls' should success after namespace LFSCK repairing"
2950 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
2951 error "(8) ls should success."
2953 run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b: the namespace LFSCK repairs a child directory whose ".." name
# entry and linkEA reference a parent that exists but holds no matching name
# entry.
#
# Requires at least 2 MDTs. "guard" and "foo" are created on MDT1, then
# "foo/dummy" is created on MDT0 while fail_loc 0x161e
# (OBD_FAIL_LFSCK_BAD_PARENT) is set on $SINGLEMDS. Before the scan, fid2path
# on dummy's FID must NOT resolve to $DIR/$tdir/foo/dummy; after the scan
# completes with exactly one "unmatched_pairs_repaired", it must.
test_22b() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "We need at least 2 MDSes for this test" && return

	# The inner quotes are escaped so that ".." is printed with its quotes
	# instead of silently terminating the string.
	echo "The parent_A references the child directory via the name entry_B,"
	echo "but the child directory back references another parent_C via its"
	echo "\"..\" name entry. The parent_C exists, but there is no the name"
	echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
	echo "the child directory's \"..\" name entry and its linkEA."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "and bad linkEA. The dummy's dotdot name entry references the"
	echo "guard. The dummy's linkEA references n non-exist name entry."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Remember the FID now; it is resolved again after the repair.
	local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
	echo "fid2path should NOT work on the dummy's FID $dummyfid"
	local dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
		error "(4) fid2path works unexpectedly."

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	# Poll the namespace LFSCK status on $SINGLEMDS until it is "completed".
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(6) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
	local dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
		error "(8) fid2path does not work"
}
run_test 22b "LFSCK can repair unmatched pairs (2)"
# test_23a: dangling name entry whose MDT-object is gone.
#
# Requires at least 2 MDTs. d0 is created on MDT0 and d0/d1 on MDT1; d1 is
# then removed while fail_loc 0x1620 (OBD_FAIL_LFSCK_DANGLING2) is set on
# mds2. The namespace LFSCK is run twice: first with "-A -r", after which
# 'ls' on d1 must still fail, then with "-A -r -C", after which 'ls' must
# succeed. Both runs must report exactly one "dangling_repaired".
test_23a() {
	if [ $MDSCOUNT -lt 2 ]; then
		skip "We need at least 2 MDSes for this test" && return
	fi

	echo "The name entry is there, but the MDT-object for such name "
	echo "entry does not exist. The namespace LFSCK should find out "
	echo "and repair the inconsistency as required."

	check_mount_and_prep

	if ! $LFS mkdir -i 0 $DIR/$tdir/d0; then
		error "(1) Fail to mkdir d0 on MDT0"
	fi
	if ! $LFS mkdir -i 1 $DIR/$tdir/d0/d1; then
		error "(2) Fail to mkdir d1 on MDT1"
	fi

	echo "Inject failure stub on MDT1 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING2	0x1620
	do_facet mds2 $LCTL set_param fail_loc=0x1620
	if ! rmdir $DIR/$tdir/d0/d1; then
		error "(3) Fail to rmdir d1"
	fi
	do_facet mds2 $LCTL set_param fail_loc=0

	echo "'ls' should fail because of dangling name entry"
	if ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1; then
		error "(4) ls should fail."
	fi

	# First pass: started without -C.
	echo "Trigger namespace LFSCK to find out dangling name entry"
	if ! $START_NAMESPACE -A -r; then
		error "(5) Fail to start LFSCK for namespace"
	fi

	if ! wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32; then
		error "(6) unexpected status"
	fi

	local fixed=$($SHOW_NAMESPACE |
		      awk '/^dangling_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(7) Fail to repair dangling name entry: $fixed"
	fi

	echo "'ls' should fail because not re-create MDT-object by default"
	if ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1; then
		error "(8) ls should fail."
	fi

	# Second pass: same scan, this time with -C added.
	echo "Trigger namespace LFSCK again to repair dangling name entry"
	if ! $START_NAMESPACE -A -r -C; then
		error "(9) Fail to start LFSCK for namespace"
	fi

	if ! wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32; then
		error "(10) unexpected status"
	fi

	fixed=$($SHOW_NAMESPACE |
		awk '/^dangling_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(11) Fail to repair dangling name entry: $fixed"
	fi

	echo "'ls' should success after namespace LFSCK repairing"
	if ! ls -ail $DIR/$tdir/d0/d1 > /dev/null; then
		error "(12) ls should success."
	fi
}
run_test 23a "LFSCK can repair dangling name entry (1)"
# test_23b: dangling name entry that really belongs to a multi-linked file.
#
# f0 ("dummy") and f1 ("dead") are created under d0 on MDT0. The hard link
# "foo" to f0 is created while fail_loc 0x1621 (OBD_FAIL_LFSCK_DANGLING3) is
# set on $SINGLEMDS, and f1 is removed. After a namespace LFSCK started with
# "-r -C" completes, the test expects one "dangling_repaired", one
# "multiple_linked_repaired", and "foo" to read back "dummy".
test_23b() {
	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. When the LFSCK"
	echo "comes to the second-stage scanning, it will find that the"
	echo "former re-creating object_C is not proper, and will try to"
	echo "replace the object_C with the real object_A."

	check_mount_and_prep

	if ! $LFS mkdir -i 0 $DIR/$tdir/d0; then
		error "(1) Fail to mkdir d0 on MDT0"
	fi
	echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
	echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
	if ! ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo; then
		error "(4) Fail to hard link"
	fi
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	if ! rm -f $DIR/$tdir/d0/f1; then
		error "(5) Fail to unlink $DIR/$tdir/d0/f1"
	fi

	echo "'ls' should fail because of dangling name entry"
	if ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1; then
		error "(6) ls should fail."
	fi

	echo "Trigger namespace LFSCK to find out dangling name entry"
	if ! $START_NAMESPACE -r -C; then
		error "(7) Fail to start LFSCK for namespace"
	fi

	if ! wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32; then
		error "(8) unexpected status"
	fi

	# Two counters are checked in turn, reusing the same local.
	local fixed=$($SHOW_NAMESPACE |
		      awk '/^dangling_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(9) Fail to repair dangling name entry: $fixed"
	fi

	fixed=$($SHOW_NAMESPACE |
		awk '/^multiple_linked_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(10) Fail to drop the former created object: $fixed"
	fi

	# The link must expose f0's content again.
	local content=$(cat $DIR/$tdir/d0/foo)
	if ! [ "$content" == "dummy" ]; then
		error "(11) The $DIR/$tdir/d0/foo is not recovered: $content"
	fi
}
run_test 23b "LFSCK can repair dangling name entry (2)"
# test_23c: a re-created dangling object that is modified before the second
# scanning stage must not be replaced.
#
# Same setup as the multi-link dangling case: f0 ("dummy") and f1 ("dead")
# under d0 on MDT0, the hard link "foo" created while fail_loc 0x1621
# (OBD_FAIL_LFSCK_DANGLING3) is set, and f1 removed. The scan is started with
# "-r -C" while fail_loc 0x1602 (OBD_FAIL_LFSCK_DELAY3) with fail_val=10 is
# set. Once "foo" is visible with size 0, the test appends to it, clears the
# fail_loc, and waits for completion. It expects one "dangling_repaired" and
# "foo" NOT to read back "dummy".
test_23c() {
	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. And then others"
	echo "modified the re-created object_C. When the LFSCK comes to the"
	echo "second-stage scanning, it will find that the former re-creating"
	echo "object_C maybe wrong and try to replace the object_C with the"
	echo "real object_A. But because object_C has been modified, so the"
	echo "LFSCK cannot replace it."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
	echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
	echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
	ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"

	echo "'ls' should fail because of dangling name entry"
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(6) ls should fail."

	#define OBD_FAIL_LFSCK_DELAY3		0x1602
	do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -r -C ||
		error "(7) Fail to start LFSCK for namespace"

	# Wait until the client sees "foo" with size 0. On timeout, dump the
	# stat of the file being polled (the original stat'ed a "guard" path
	# that this test never creates).
	wait_update_facet client "stat $DIR/$tdir/d0/foo |
		awk '/Size/ { print \\\$2 }'" "0" 32 || {
		stat $DIR/$tdir/d0/foo
		error "(8) unexpected size"
	}

	# Modify the re-created object while the scan is still delayed.
	echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
	cancel_lru_locks osc

	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(10) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^dangling_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(11) Fail to repair dangling name entry: $repaired"

	local data=$(cat $DIR/$tdir/d0/foo)
	[ "$data" != "dummy" ] ||
		error "(12) The $DIR/$tdir/d0/foo should not be recovered"
}
run_test 23c "LFSCK can repair dangling name entry (3)"
# test_24: two MDT-objects back-reference the same name entry.
#
# Requires at least 2 MDTs. d0 is created on MDT1 with children "guard" and
# "dummy", and "guard/foo" is touched. "dummy/foo" is created on MDT0 and
# removed again while fail_loc 0x1622 (OBD_FAIL_LFSCK_MUL_REF) is set on
# $SINGLEMDS. After the scan completes, the test expects one
# "multiple_referenced_repaired" and an orphan named "<cfid>-<pfid>-D-0"
# under .lustre/lost+found/MDT0000/.
test_24() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "We need at least 2 MDSes for this test" && return

	echo "Two MDT-objects back reference the same name entry via their"
	echo "each own linkEA entry, but the name entry only references one"
	echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
	echo "for the MDT-object that is not recognized. If such MDT-object"
	echo "has no other linkEA entry after the removing, then the LFSCK"
	echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"

	mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
	$LFS path2fid $DIR/$tdir/d0/guard

	mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
	$LFS path2fid $DIR/$tdir/d0/dummy

	# The parent FID used in the expected orphan name depends on the MDT
	# backend. The two assignments are alternatives, so they must sit in
	# separate branches; pfid is kept local to the test.
	# NOTE(review): why each backend maps to this particular parent is not
	# visible here - confirm against the OBD_FAIL_LFSCK_MUL_REF stub.
	local pfid
	if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
		pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
	else
		pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
	fi

	touch $DIR/$tdir/d0/guard/foo ||
		error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
	echo "that references $DIR/$tdir/d0/guard/foo."
	echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
	echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
	echo "there with the same linkEA entry as another MDT-object"
	echo "$DIR/$tdir/d0/guard/foo has"

	#define OBD_FAIL_LFSCK_MUL_REF		0x1622
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
	$LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
		error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
	$LFS path2fid $DIR/$tdir/d0/dummy/foo
	# Capture the child's FID before its name entry is removed.
	local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
	rmdir $DIR/$tdir/d0/dummy/foo ||
		error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "stat $DIR/$tdir/d0/dummy/foo should fail"
	stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
		error "(6) stat successfully unexpectedly"

	echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
	$START_NAMESPACE -A -r ||
		error "(7) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(8) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^multiple_referenced_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
	error "(9) Fail to repair multiple referenced name entry: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	local cname="$cfid-$pfid-D-0"
	ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
		error "(11) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 24 "LFSCK can repair multiple-referenced name entry"
# test_25: wrong file type stored in a name entry.
#
# Only runs when the $SINGLEMDS backend is ldiskfs. "d0/foo" is touched while
# fail_loc 0x1623 (OBD_FAIL_LFSCK_BAD_TYPE) is set on $SINGLEMDS. After a
# namespace LFSCK started with "-r" completes, the test expects exactly one
# "bad_file_type_repaired" and a successful 'ls' of d0.
test_25() {
	if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
		skip "Only support to inject failure on ldiskfs" && return
	fi

	echo "The file type in the name entry does not match the file type"
	echo "claimed by the referenced object. Then the LFSCK will update"
	echo "the file type in the name entry."

	check_mount_and_prep

	if ! $LFS mkdir -i 0 $DIR/$tdir/d0; then
		error "(1) Fail to mkdir d0"
	fi

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "the file type stored in the name entry is wrong."

	#define OBD_FAIL_LFSCK_BAD_TYPE		0x1623
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
	if ! touch $DIR/$tdir/d0/foo; then
		error "(2) Fail to touch $DIR/$tdir/d0/foo"
	fi
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to repair bad file type in the name entry"
	if ! $START_NAMESPACE -r; then
		error "(3) Fail to start LFSCK for namespace"
	fi

	if ! wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32; then
		error "(4) unexpected status"
	fi

	local fixed=$($SHOW_NAMESPACE |
		      awk '/^bad_file_type_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
	error "(5) Fail to repair bad file type in name entry: $fixed"
	fi

	if ! ls -ail $DIR/$tdir/d0; then
		error "(6) Fail to 'ls' the $DIR/$tdir/d0"
	fi
}
run_test 25 "LFSCK can repair bad file type in the name entry"
# test_26a: a local name entry referenced by an object's linkEA is lost.
#
# "d0/foo" is created on MDT0 with a second hard link "d0/dummy". "foo" is
# unlinked while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set on
# $SINGLEMDS. After the scan completes, the test expects one
# "lost_dirent_repaired", "foo" to be listable again, and its FID unchanged.
test_26a() {
	echo "The local name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing local name entry back"
	echo "to the normal namespace."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	# Recorded so the repaired entry can be checked against the same object.
	local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)

	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# The original dropped "error" here, so an unexpected 'ls' success only
	# produced a "command not found" and the test carried on.
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(5) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(7) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $DIR/$tdir/d0/foo ||
		error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"

	local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
	[ "$foofid" == "$foofid2" ] ||
		error "(10) foo's FID changed: $foofid, $foofid2"
}
run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# test_26b: a remote name entry referenced by an object's linkEA is lost.
#
# Requires at least 2 MDTs. d0 is created on MDT1 and "d0/foo" on MDT0.
# "foo" is removed while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set
# on $SINGLEMDS. After the scan completes, the test expects one
# "lost_dirent_repaired", "foo" to be listable again, and its FID unchanged.
test_26b() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "We need at least 2 MDSes for this test" && return

	echo "The remote name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing remote name entry back"
	echo "to the normal namespace."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
	# Recorded so the repaired entry can be checked against the same object.
	local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# The original dropped "error" here, so an unexpected 'ls' success only
	# produced a "command not found" and the test carried on.
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(4) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(6) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair lost dirent: $repaired"

	ls -ail $DIR/$tdir/d0/foo ||
		error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"

	local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
	[ "$foofid" == "$foofid2" ] ||
		error "(9) foo's FID changed: $foofid, $foofid2"
}
run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# test_27a: the local parent referenced by an object's linkEA is lost.
#
# "d0/foo" is created on MDT0 with a hard link "d0/dummy". Both names are
# unlinked while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set on
# $SINGLEMDS, then d0 itself is removed. After the scan completes, the test
# expects one "lost_dirent_repaired" and an entry matching "*-P-*" under
# .lustre/lost+found/MDT0000/.
test_27a() {
	echo "The local parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "another hard link and the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo ||
		error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	rm -f $DIR/$tdir/d0/dummy ||
		error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
	# The original dropped "error" here, so an unexpected 'ls' success only
	# produced a "command not found" and the test carried on.
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 &&
		error "(6) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the lost parent"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(7) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	# The pattern is quoted so find receives it verbatim instead of the
	# shell expanding it against the current directory first.
	local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-P-*")
	[ ! -z "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# test_27b: the remote parent referenced by an object's linkEA is lost.
#
# Requires at least 2 MDTs. d0 is created on MDT1 and "d0/foo" on MDT0.
# "foo" is removed while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set
# on $SINGLEMDS, then d0 is removed. After the scan completes, the test
# expects one "lost_dirent_repaired" and an entry matching "*-P-*" under
# .lustre/lost+found/MDT0001/.
test_27b() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "We need at least 2 MDSes for this test" && return

	echo "The remote parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"

	$LFS path2fid $DIR/$tdir/d0

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
	# The original dropped "error" here, so an unexpected 'ls' success only
	# produced a "command not found" and the test carried on.
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 &&
		error "(5) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(7) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "There should be an orphan under .lustre/lost+found/MDT0001/"
	[ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0001/

	# The pattern is quoted so find receives it verbatim instead of the
	# shell expanding it against the current directory first.
	local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-P-*")
	[ ! -z "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0001/ should not be empty"
}
run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# test_28: orphan handling is skipped on an MDT that failed during the
# first scanning stage.
#
# Requires at least 2 MDTs. d1 (MDT0) holds a1/a2 on MDT1, and d2 (MDT1)
# holds a1/a2 on MDT0. d1/a1 and d2/a2 are removed while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set on mds1 and mds2. The first scan runs
# with fail_loc 0x161c (OBD_FAIL_LFSCK_BAD_NETWORK) set on mds2 and must end
# "partial" on mds1 and "completed" on mds2, with "lost_dirent_repaired" of 0
# and 1. A second scan must complete on every MDS, with counters 1 and 0.
test_28() {
	if [ $MDSCOUNT -lt 2 ]; then
		skip "The test needs at least 2 MDTs" && return
	fi

	echo "The target name entry is lost. The LFSCK should insert the"
	echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
	echo "the MDT (on which the orphan MDT-object resides) has ever"
	echo "failed to respond some name entry verification during the"
	echo "first stage-scanning, then the LFSCK should skip to handle"
	echo "orphan MDT-object on this MDT. But other MDTs should not"

	check_mount_and_prep
	# Cross-MDT layout: each parent's children live on the other MDT.
	$LFS mkdir -i 0 $DIR/$tdir/d1
	$LFS mkdir -i 1 $DIR/$tdir/d1/a1
	$LFS mkdir -i 1 $DIR/$tdir/d1/a2

	$LFS mkdir -i 1 $DIR/$tdir/d2
	$LFS mkdir -i 0 $DIR/$tdir/d2/a1
	$LFS mkdir -i 0 $DIR/$tdir/d2/a2

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "d1/a1's name entry will be removed, but the d1/a1's object"
	echo "and its linkEA are kept in the system. And the case that"
	echo "d2/a2's name entry will be removed, but the d2/a2's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet mds1 $LCTL set_param fail_loc=0x1624
	do_facet mds2 $LCTL set_param fail_loc=0x1624
	if ! rmdir $DIR/$tdir/d1/a1; then
		error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
	fi
	if ! rmdir $DIR/$tdir/d2/a2; then
		error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
	fi
	do_facet mds1 $LCTL set_param fail_loc=0
	do_facet mds2 $LCTL set_param fail_loc=0

	cancel_lru_locks mdc
	cancel_lru_locks osc

	echo "Inject failure, to simulate the MDT0 fail to handle"
	echo "MDT1 LFSCK request during the first-stage scanning."
	#define OBD_FAIL_LFSCK_BAD_NETWORK	0x161c
	do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0

	echo "Trigger namespace LFSCK on all devices to find out orphan object"
	if ! $START_NAMESPACE -r -A; then
		error "(3) Fail to start LFSCK for namespace"
	fi

	if ! wait_update_facet mds1 "$LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "partial" 32; then
		error "(4) mds1 is not the expected 'partial'"
	fi

	if ! wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32; then
		error "(5) mds2 is not the expected 'completed'"
	fi

	do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0

	# Counters after the first (disturbed) scan.
	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_namespace |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	if ! [ $repaired -eq 0 ]; then
		error "(6) Expect 0 fixed on mds1, but got: $repaired"
	fi

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	if ! [ $repaired -eq 1 ]; then
		error "(7) Expect 1 fixed on mds2, but got: $repaired"
	fi

	echo "Trigger namespace LFSCK on all devices again to cleanup"
	if ! $START_NAMESPACE -r -A; then
		error "(8) Fail to start LFSCK for namespace"
	fi

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		if ! wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_namespace |
			awk '/^status/ { print \\\$2 }'" "completed" 32; then
			error "(9) MDS${k} is not the expected 'completed'"
		fi
	done

	# Counters after the second (clean) scan.
	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_namespace |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	if ! [ $repaired -eq 1 ]; then
		error "(10) Expect 1 fixed on mds1, but got: $repaired"
	fi

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	if ! [ $repaired -eq 0 ]; then
		error "(11) Expect 0 fixed on mds2, but got: $repaired"
	fi
}
run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# test_29a: nlink attribute larger than the known name entries count.
#
# "d0/foo" is created on MDT0 and hard-linked as "d0/h1" while fail_loc
# 0x1625 (OBD_FAIL_LFSCK_MORE_NLINK) is set on $SINGLEMDS; the test then
# requires stat to report 3 links. After the scan completes, it expects one
# "nlinks_repaired" and stat to report 2 links.
test_29a() {
	echo "The object's nlink attribute is larger than the object's known"
	echo "name entries count. The LFSCK will repair the object's nlink"
	echo "attribute to match the known name entries count"

	check_mount_and_prep

	if ! $LFS mkdir -i 0 $DIR/$tdir/d0; then
		error "(1) Fail to mkdir d0"
	fi
	if ! touch $DIR/$tdir/d0/foo; then
		error "(2) Fail to create foo"
	fi

	echo "Inject failure stub on MDT0 to simulate the case that foo's"
	echo "nlink attribute is larger than its name entries count."

	#define OBD_FAIL_LFSCK_MORE_NLINK	0x1625
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
	if ! ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1; then
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	fi
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Client MDC locks are dropped before each stat of the link count.
	cancel_lru_locks mdc
	local links=$(stat --format=%h $DIR/$tdir/d0/foo)
	if ! [ $links -eq 3 ]; then
		error "(4) Cannot inject error: $links"
	fi

	echo "Trigger namespace LFSCK to repair the nlink count"
	if ! $START_NAMESPACE -r -A; then
		error "(5) Fail to start LFSCK for namespace"
	fi

	if ! wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32; then
		error "(6) unexpected status"
	fi

	local fixed=$($SHOW_NAMESPACE |
		      awk '/^nlinks_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(7) Fail to repair nlink count: $fixed"
	fi

	cancel_lru_locks mdc
	links=$(stat --format=%h $DIR/$tdir/d0/foo)
	if ! [ $links -eq 2 ]; then
		error "(8) Fail to repair nlink count: $links"
	fi
}
run_test 29a "LFSCK can repair bad nlink count (1)"
# test_29b: nlink attribute smaller than the known name entries count.
#
# "d0/foo" is created on MDT0 and hard-linked as "d0/h1" while fail_loc
# 0x1626 (OBD_FAIL_LFSCK_LESS_NLINK) is set on $SINGLEMDS; the test then
# requires stat to report 1 link. After the scan completes, it expects one
# "nlinks_repaired" and stat to report 2 links.
test_29b() {
	echo "The object's nlink attribute is smaller than the object's known"
	echo "name entries count. The LFSCK will repair the object's nlink"
	echo "attribute to match the known name entries count"

	check_mount_and_prep

	if ! $LFS mkdir -i 0 $DIR/$tdir/d0; then
		error "(1) Fail to mkdir d0"
	fi
	if ! touch $DIR/$tdir/d0/foo; then
		error "(2) Fail to create foo"
	fi

	echo "Inject failure stub on MDT0 to simulate the case that foo's"
	echo "nlink attribute is smaller than its name entries count."

	#define OBD_FAIL_LFSCK_LESS_NLINK	0x1626
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
	if ! ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1; then
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	fi
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Client MDC locks are dropped before each stat of the link count.
	cancel_lru_locks mdc
	local links=$(stat --format=%h $DIR/$tdir/d0/foo)
	if ! [ $links -eq 1 ]; then
		error "(4) Cannot inject error: $links"
	fi

	echo "Trigger namespace LFSCK to repair the nlink count"
	if ! $START_NAMESPACE -r -A; then
		error "(5) Fail to start LFSCK for namespace"
	fi

	if ! wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32; then
		error "(6) unexpected status"
	fi

	local fixed=$($SHOW_NAMESPACE |
		      awk '/^nlinks_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(7) Fail to repair nlink count: $fixed"
	fi

	cancel_lru_locks mdc
	links=$(stat --format=%h $DIR/$tdir/d0/foo)
	if ! [ $links -eq 2 ]; then
		error "(8) Fail to repair nlink count: $links"
	fi
}
run_test 29b "LFSCK can repair bad nlink count (2)"
# test_29c: nlink verification is skipped when the linkEA has overflowed.
#
# "d0/foo" is created on MDT0 and hard-linked as "d0/h1"; a further link
# "d0/h2" is added while fail_loc 0x1627 (OBD_FAIL_LFSCK_LINKEA_OVERFLOW) is
# set on $SINGLEMDS. The test requires stat to report 3 links while fid2path
# lists only 2 paths. The fail_loc stays set during the scan and is cleared
# once the scan has completed. Afterwards "nlinks_repaired" must be 0 and
# both counts must be unchanged.
test_29c() {
	echo "There are too many hard links to the object, and exceeds the"
	echo "object's linkEA limitation, as to NOT all the known name entries"
	echo "will be recorded in the linkEA. Under such case, LFSCK should"
	echo "skip the nlink verification for this object."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's hard links exceed the object's linkEA limitation."

	#define OBD_FAIL_LFSCK_LINKEA_OVERFLOW	0x1627
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1627
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h2 ||
		error "(4) Fail to hard link to $DIR/$tdir/d0/foo"

	cancel_lru_locks mdc

	# count1 is the link count from stat; count2 is the number of paths
	# that fid2path can resolve for the same FID.
	local count1=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ $count1 -eq 3 ] || error "(5) Stat failure: $count1"

	local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
	$LFS fid2path $DIR $foofid
	local count2=$($LFS fid2path $DIR $foofid | wc -l)
	# The original dropped "error" here, so a failed injection only
	# produced a "command not found" and the test carried on.
	[ $count2 -eq 2 ] || error "(6) Fail to inject error: $count2"

	echo "Trigger namespace LFSCK to repair the nlink count"
	$START_NAMESPACE -r -A ||
		error "(7) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(8) unexpected status"
	}

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0
	local repaired=$($SHOW_NAMESPACE |
			 awk '/^nlinks_repaired/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(9) Repair nlink count unexpcetedly: $repaired"

	cancel_lru_locks mdc

	count1=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ $count1 -eq 3 ] || error "(10) Stat failure: $count1"

	count2=$($LFS fid2path $DIR $foofid | wc -l)
	[ $count2 -eq 2 ] ||
		error "(11) Repaired something unexpectedly: $count2"
}
run_test 29c "Not verify nlink attr if hark links exceed linkEA limitation"
# Test 30 (registered by the run_test line that ends this block): objects that
# e2fsck moved into the backend /lost+found must be brought back by a
# namespace LFSCK. The test only runs when the $SINGLEMDS backend is ldiskfs.
# NOTE(review): the enclosing function header and closing brace are not part
# of this listing; run_test 30 presumably dispatches to test_30 -- confirm.
3800 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3801 skip "Only support backend /lost+found for ldiskfs" && return
3804 echo "The namespace LFSCK will move the orphans from backend"
3805 echo "/lost+found directory to normal client visible namespace"
3806 echo "or to global visible .lustre/lost+found/MDTxxxx/ directory"
3809 check_mount_and_prep
# foo/ and foo/f0 are created with no fault injected.
3811 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
3812 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
3814 echo "Inject failure stub on MDT0 to simulate the case that"
3815 echo "directory d0 has no linkEA entry, then the LFSCK will"
3816 echo "move it into .lustre/lost+found/MDTxxxx/ later."
# Only the mkdir of d0 runs with fail_loc 0x161d armed; it is cleared right
# after, so f1 and d1 below are created normally.
3818 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
3819 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
3820 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
3821 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3823 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
3824 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
3826 echo "Inject failure stub on MDT0 to simulate the case that the"
3827 echo "object's name entry will be removed, but not destroy the"
3828 echo "object. Then backend e2fsck will handle it as orphan and"
3829 echo "add them into the backend /lost+found directory."
# Every name created above is removed while fail_loc 0x1624 is armed.
3831 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3832 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3833 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
3834 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
3835 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
3836 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
3837 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The client is unmounted and the MDT stopped before e2fsck -y is run against
# the MDT device; the MDT is then started again with $MOUNT_OPTS_NOSCRUB.
3839 umount_client $MOUNT || error "(10) Fail to stop client!"
3841 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
3844 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
3845 error "(12) Fail to run e2fsck"
3847 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
3848 error "(13) Fail to start MDT0"
3850 echo "Trigger namespace LFSCK to recover backend orphans"
3851 $START_NAMESPACE -r -A ||
3852 error "(14) Fail to start LFSCK for namespace"
# Poll the "status" line of mdd.${MDT_DEV}.lfsck_namespace on $SINGLEMDS until
# it reads "completed" (32 is presumably a timeout in seconds -- confirm in
# test-framework.sh).
3854 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3855 mdd.${MDT_DEV}.lfsck_namespace |
3856 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3858 error "(15) unexpected status"
# NOTE(review): the `|| {` group opened above is never closed in this listing.
# The local_lost_found_moved counter must report at least 4 objects
# (presumably f0, d0, f1 and d1 -- confirm).
3861 local repaired=$($SHOW_NAMESPACE |
3862 awk '/^local_lost_found_moved/ { print $2 }')
3863 [ $repaired -ge 4 ] ||
3864 error "(16) Fail to recover backend orphans: $repaired"
3866 mount_client $MOUNT || error "(17) Fail to start client!"
# f0 must be reachable again through its original path; a failing stat has to
# fail the test, hence the explicit error call.
3868 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
3870 ls -ail $MOUNT/.lustre/lost+found/
3872 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
3873 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3874 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3876 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find, not the calling shell, does the matching;
# cname is kept local like the other variables of this test.
3878 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-*-D-*')
3879 [ ! -z "$cname" ] || error "(20) d0 is not recovered"
# d1 and f1 are expected inside the recovered d0.
3881 stat ${cname}/d1 || error "(21) d1 is not recovered"
3882 stat ${cname}/f1 || error "(22) f1 is not recovered"
3884 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# Test 31a (registered by the run_test line that ends this block): name entries
# placed in the wrong shard of a striped directory must be repaired by a
# namespace LFSCK. Skipped when $MDSCOUNT is below 2.
# NOTE(review): the enclosing function header and closing brace are not part
# of this listing; run_test 31a presumably dispatches to test_31a -- confirm.
3887 [ $MDSCOUNT -lt 2 ] &&
3888 skip "The test needs at least 2 MDTs" && return
3891 echo "For the name entry under a striped directory, if the name"
3892 echo "hash does not match the shard, then the LFSCK will repair"
3893 echo "the bad name entry"
3896 check_mount_and_prep
# Striped directory created with -i 0 -c $MDSCOUNT (presumably: first stripe
# on MDT index 0, one stripe per MDT -- confirm against lfs setdirstripe).
3898 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
3899 error "(1) Fail to create striped directory"
3901 echo "Inject failure stub on client to simulate the case that"
3902 echo "some name entry should be inserted into other non-first"
3903 echo "shard, but inserted into the first shard by wrong"
# fail_loc is set with plain $LCTL (no do_facet), i.e. on the node running this
# script; this variant uses fail_val=0. createmany makes $MDSCOUNT directories
# named d<N>, which the loop near the end stats as d0 .. d($MDSCOUNT-1).
3905 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
3906 $LCTL set_param fail_loc=0x1628 fail_val=0
3907 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
3908 error "(2) Fail to create file under striped directory"
3909 $LCTL set_param fail_loc=0 fail_val=0
3911 echo "Trigger namespace LFSCK to repair bad name hash"
3912 $START_NAMESPACE -r -A ||
3913 error "(3) Fail to start LFSCK for namespace"
# Poll the "status" line of mdd.${MDT_DEV}.lfsck_namespace on $SINGLEMDS until
# it reads "completed" (32 is presumably a timeout in seconds -- confirm).
3915 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3916 mdd.${MDT_DEV}.lfsck_namespace |
3917 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3919 error "(4) unexpected status"
# NOTE(review): the `|| {` group opened above is never closed in this listing.
# At least one repair must be counted in name_hash_repaired.
3922 local repaired=$($SHOW_NAMESPACE |
3923 awk '/^name_hash_repaired/ { print $2 }')
3924 [ $repaired -ge 1 ] ||
3925 error "(5) Fail to repair bad name hash: $repaired"
# The client is remounted before the entries are looked up again (presumably
# to drop client-side cached state -- confirm).
3927 umount_client $MOUNT || error "(6) umount failed"
3928 mount_client $MOUNT || error "(7) mount failed"
# Each created directory must be stat-able and removable by name.
# NOTE(review): no `done` for this loop is visible in the listing, and the
# loop index i is not declared local.
3930 for ((i = 0; i < $MDSCOUNT; i++)); do
3931 stat $DIR/$tdir/striped_dir/d$i ||
3932 error "(8) Fail to stat d$i after LFSCK"
3933 rmdir $DIR/$tdir/striped_dir/d$i ||
3934 error "(9) Fail to unlink d$i after LFSCK"
3937 rmdir $DIR/$tdir/striped_dir ||
3938 error "(10) Fail to remove the striped directory after LFSCK"
3940 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# Test 31b (registered by the run_test line that ends this block): same
# scenario as the bad-name-hash case with fail_val=0, but here fail_val=1 and
# the LFSCK result is read from mds2. Skipped when $MDSCOUNT is below 2.
# NOTE(review): the enclosing function header and closing brace are not part
# of this listing; run_test 31b presumably dispatches to test_31b -- confirm.
3943 [ $MDSCOUNT -lt 2 ] &&
3944 skip "The test needs at least 2 MDTs" && return
3947 echo "For the name entry under a striped directory, if the name"
3948 echo "hash does not match the shard, then the LFSCK will repair"
3949 echo "the bad name entry"
3952 check_mount_and_prep
3954 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
3955 error "(1) Fail to create striped directory"
3957 echo "Inject failure stub on client to simulate the case that"
3958 echo "some name entry should be inserted into other non-second"
3959 echo "shard, but inserted into the secod shard by wrong"
# fail_loc is set with plain $LCTL (no do_facet), i.e. on the node running this
# script; fail_val=1 presumably selects the second shard -- confirm.
3961 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
3962 $LCTL set_param fail_loc=0x1628 fail_val=1
3963 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
3964 error "(2) Fail to create file under striped directory"
3965 $LCTL set_param fail_loc=0 fail_val=0
3967 echo "Trigger namespace LFSCK to repair bad name hash"
3968 $START_NAMESPACE -r -A ||
3969 error "(3) Fail to start LFSCK for namespace"
# Status is polled on mds2 (parameter name built from `facet_svc mds2`), not
# on $SINGLEMDS; 32 is presumably a timeout in seconds -- confirm.
3971 wait_update_facet mds2 "$LCTL get_param -n \
3972 mdd.$(facet_svc mds2).lfsck_namespace |
3973 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3974 error "(4) unexpected status"
# The name_hash_repaired counter is likewise read from mds2; at least one
# repair is required.
3976 local repaired=$(do_facet mds2 $LCTL get_param -n \
3977 mdd.$(facet_svc mds2).lfsck_namespace |
3978 awk '/^name_hash_repaired/ { print $2 }')
3979 [ $repaired -ge 1 ] ||
3980 error "(5) Fail to repair bad name hash: $repaired"
# The client is remounted before the entries are looked up again (presumably
# to drop client-side cached state -- confirm).
3982 umount_client $MOUNT || error "(6) umount failed"
3983 mount_client $MOUNT || error "(7) mount failed"
# Each created directory must be stat-able and removable by name.
# NOTE(review): no `done` for this loop is visible in the listing, and the
# loop index i is not declared local.
3985 for ((i = 0; i < $MDSCOUNT; i++)); do
3986 stat $DIR/$tdir/striped_dir/d$i ||
3987 error "(8) Fail to stat d$i after LFSCK"
3988 rmdir $DIR/$tdir/striped_dir/d$i ||
3989 error "(9) Fail to unlink d$i after LFSCK"
3992 rmdir $DIR/$tdir/striped_dir ||
3993 error "(10) Fail to remove the striped directory after LFSCK"
3995 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Test 31c (registered by the run_test line that ends this block): a striped
# directory whose master object lost its LMV EA, with nothing created under it
# afterwards, must get the master LMV EA re-generated by a namespace LFSCK.
# Skipped when $MDSCOUNT is below 2.
# NOTE(review): the enclosing function header and closing brace are not part
# of this listing; run_test 31c presumably dispatches to test_31c -- confirm.
3998 [ $MDSCOUNT -lt 2 ] &&
3999 skip "The test needs at least 2 MDTs" && return
4002 echo "For some reason, the master MDT-object of the striped directory"
4003 echo "may lost its master LMV EA. If nobody created files under the"
4004 echo "master directly after the master LMV EA lost, then the LFSCK"
4005 echo "should re-generate the master LMV EA."
4008 check_mount_and_prep
4010 echo "Inject failure stub on MDT0 to simulate the case that the"
4011 echo "master MDT-object of the striped directory lost the LMV EA."
# fail_loc 0x1629 is armed on $SINGLEMDS only while the striped directory is
# being created, then cleared.
4013 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4014 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4015 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4016 error "(1) Fail to create striped directory"
4017 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4019 echo "Trigger namespace LFSCK to re-generate master LMV EA"
4020 $START_NAMESPACE -r -A ||
4021 error "(2) Fail to start LFSCK for namespace"
# Poll the "status" line of mdd.${MDT_DEV}.lfsck_namespace on $SINGLEMDS until
# it reads "completed" (32 is presumably a timeout in seconds -- confirm).
4023 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4024 mdd.${MDT_DEV}.lfsck_namespace |
4025 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4027 error "(3) unexpected status"
# NOTE(review): the `|| {` group opened above is never closed in this listing.
# Exactly one repair must be counted in striped_dirs_repaired.
4030 local repaired=$($SHOW_NAMESPACE |
4031 awk '/^striped_dirs_repaired/ { print $2 }')
4032 [ $repaired -eq 1 ] ||
4033 error "(4) Fail to re-generate master LMV EA: $repaired"
4035 umount_client $MOUNT || error "(5) umount failed"
4036 mount_client $MOUNT || error "(6) mount failed"
# After the remount the directory must list as empty and be removable.
4038 local empty=$(ls $DIR/$tdir/striped_dir/)
4039 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
4041 rmdir $DIR/$tdir/striped_dir ||
4042 error "(8) Fail to remove the striped directory after LFSCK"
4044 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Test 31d (registered by the run_test line that ends this block): a striped
# directory whose master object lost its LMV EA and was then modified must NOT
# get the EA re-generated; the LFSCK is expected to leave it read-only instead.
# Skipped when $MDSCOUNT is below 2.
# NOTE(review): the enclosing function header and closing brace are not part
# of this listing; run_test 31d presumably dispatches to test_31d -- confirm.
4047 [ $MDSCOUNT -lt 2 ] &&
4048 skip "The test needs at least 2 MDTs" && return
4051 echo "For some reason, the master MDT-object of the striped directory"
4052 echo "may lost its master LMV EA. If somebody created files under the"
4053 echo "master directly after the master LMV EA lost, then the LFSCK"
4054 echo "should NOT re-generate the master LMV EA, instead, it should"
4055 echo "change the broken striped dirctory as read-only to prevent"
4056 echo "further damage"
4059 check_mount_and_prep
4061 echo "Inject failure stub on MDT0 to simulate the case that the"
4062 echo "master MDT-object of the striped directory lost the LMV EA."
# fail_loc 0x1629 is armed on $SINGLEMDS only while the striped directory is
# being created, then cleared.
4064 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4065 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4066 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4067 error "(1) Fail to create striped directory"
4068 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# The client is remounted and a file is created under the broken directory
# BEFORE the LFSCK runs; this is the "modified after broken" condition.
4070 umount_client $MOUNT || error "(2) umount failed"
4071 mount_client $MOUNT || error "(3) mount failed"
4073 touch $DIR/$tdir/striped_dir/dummy ||
4074 error "(4) Fail to touch under broken striped directory"
4076 echo "Trigger namespace LFSCK to find out the inconsistency"
4077 $START_NAMESPACE -r -A ||
4078 error "(5) Fail to start LFSCK for namespace"
# Poll the "status" line of mdd.${MDT_DEV}.lfsck_namespace on $SINGLEMDS until
# it reads "completed" (32 is presumably a timeout in seconds -- confirm).
4080 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4081 mdd.${MDT_DEV}.lfsck_namespace |
4082 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4084 error "(6) unexpected status"
# NOTE(review): the `|| {` group opened above is never closed in this listing.
# striped_dirs_repaired must stay at 0: no re-generation is expected here.
4087 local repaired=$($SHOW_NAMESPACE |
4088 awk '/^striped_dirs_repaired/ { print $2 }')
4089 [ $repaired -eq 0 ] ||
4090 error "(7) Re-generate master LMV EA unexpected: $repaired"
# The existing file must still be visible ...
4092 stat $DIR/$tdir/striped_dir/dummy ||
4093 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# ... but creating a new one must fail: error (9) fires only if touch succeeds.
4095 touch $DIR/$tdir/striped_dir/foo &&
4096 error "(9) The broken striped directory should be read-only"
# chattr -i runs before the final rmdir (presumably clearing an immutable flag
# that the LFSCK set to make the directory read-only -- confirm).
4098 chattr -i $DIR/$tdir/striped_dir ||
4099 error "(10) Fail to chattr on the broken striped directory"
4101 rmdir $DIR/$tdir/striped_dir ||
4102 error "(11) Fail to remove the striped directory after LFSCK"
4104 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Test 31e (registered by the run_test line that ends this block): a shard of
# a striped directory that lost its slave LMV EA must get it re-generated by a
# namespace LFSCK. This variant injects with fail_val=0 and checks the result
# on $SINGLEMDS. Skipped when $MDSCOUNT is below 2.
# NOTE(review): the enclosing function header and closing brace are not part
# of this listing; run_test 31e presumably dispatches to test_31e -- confirm.
4107 [ $MDSCOUNT -lt 2 ] &&
4108 skip "The test needs at least 2 MDTs" && return
4111 echo "For some reason, the slave MDT-object of the striped directory"
4112 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4113 echo "slave LMV EA."
4116 check_mount_and_prep
4118 echo "Inject failure stub on MDT0 to simulate the case that the"
4119 echo "slave MDT-object (that resides on the same MDT as the master"
4120 echo "MDT-object resides on) lost the LMV EA."
# fail_loc 0x162a with fail_val=0 is armed on $SINGLEMDS only while the striped
# directory is being created, then both values are reset to 0.
4122 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4123 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
4124 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4125 error "(1) Fail to create striped directory"
4126 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4128 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4129 $START_NAMESPACE -r -A ||
4130 error "(2) Fail to start LFSCK for namespace"
# Poll the "status" line of mdd.${MDT_DEV}.lfsck_namespace on $SINGLEMDS until
# it reads "completed" (32 is presumably a timeout in seconds -- confirm).
4132 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4133 mdd.${MDT_DEV}.lfsck_namespace |
4134 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4136 error "(3) unexpected status"
# NOTE(review): the `|| {` group opened above is never closed in this listing.
# Exactly one repair must be counted in striped_shards_repaired.
4139 local repaired=$($SHOW_NAMESPACE |
4140 awk '/^striped_shards_repaired/ { print $2 }')
4141 [ $repaired -eq 1 ] ||
4142 error "(4) Fail to re-generate slave LMV EA: $repaired"
4144 rmdir $DIR/$tdir/striped_dir ||
4145 error "(5) Fail to remove the striped directory after LFSCK"
4147 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Test 31f (registered by the run_test line that ends this block): a shard of
# a striped directory that lost its slave LMV EA must get it re-generated by a
# namespace LFSCK. This variant injects with fail_val=1 and checks the result
# on mds2. Skipped when $MDSCOUNT is below 2.
# NOTE(review): the enclosing function header and closing brace are not part
# of this listing; run_test 31f presumably dispatches to test_31f -- confirm.
4150 [ $MDSCOUNT -lt 2 ] &&
4151 skip "The test needs at least 2 MDTs" && return
4154 echo "For some reason, the slave MDT-object of the striped directory"
4155 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4156 echo "slave LMV EA."
4159 check_mount_and_prep
4161 echo "Inject failure stub on MDT0 to simulate the case that the"
4162 echo "slave MDT-object (that resides on different MDT as the master"
4163 echo "MDT-object resides on) lost the LMV EA."
# fail_loc 0x162a with fail_val=1 is armed on $SINGLEMDS only while the striped
# directory is being created, then both values are reset to 0.
4165 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4166 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
4167 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4168 error "(1) Fail to create striped directory"
4169 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4171 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4172 $START_NAMESPACE -r -A ||
4173 error "(2) Fail to start LFSCK for namespace"
# Status is polled on mds2 (parameter name built from `facet_svc mds2`), not
# on $SINGLEMDS; 32 is presumably a timeout in seconds -- confirm.
4175 wait_update_facet mds2 "$LCTL get_param -n \
4176 mdd.$(facet_svc mds2).lfsck_namespace |
4177 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
4178 error "(3) unexpected status"
# The striped_shards_repaired counter is likewise read from mds2 and must be
# exactly 1.
4180 local repaired=$(do_facet mds2 $LCTL get_param -n \
4181 mdd.$(facet_svc mds2).lfsck_namespace |
4182 awk '/^striped_shards_repaired/ { print $2 }')
4183 [ $repaired -eq 1 ] ||
4184 error "(4) Fail to re-generate slave LMV EA: $repaired"
4186 rmdir $DIR/$tdir/striped_dir ||
4187 error "(5) Fail to remove the striped directory after LFSCK"
4189 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Test 31g (registered by the run_test line that ends this block): a slave LMV
# EA carrying a corrupted stripe index must be repaired by a namespace LFSCK,
# after which the striped directory has to be usable again.
# Skipped when $MDSCOUNT is below 2.
# NOTE(review): the enclosing function header and closing brace are not part
# of this listing; run_test 31g presumably dispatches to test_31g -- confirm.
4192 [ $MDSCOUNT -lt 2 ] &&
4193 skip "The test needs at least 2 MDTs" && return
4196 echo "For some reason, the stripe index in the slave LMV EA is"
4197 echo "corrupted. The LFSCK should repair the slave LMV EA."
4200 check_mount_and_prep
4202 echo "Inject failure stub on MDT0 to simulate the case that the"
4203 echo "slave LMV EA on the first shard of the striped directory"
4204 echo "claims the same index as the second shard claims"
# fail_loc 0x162b with fail_val=0 is armed on $SINGLEMDS only while the striped
# directory is being created, then both values are reset to 0.
4206 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
4207 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
4208 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4209 error "(1) Fail to create striped directory"
4210 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4212 echo "Trigger namespace LFSCK to repair the slave LMV EA"
4213 $START_NAMESPACE -r -A ||
4214 error "(2) Fail to start LFSCK for namespace"
# Poll the "status" line of mdd.${MDT_DEV}.lfsck_namespace on $SINGLEMDS until
# it reads "completed" (32 is presumably a timeout in seconds -- confirm).
4216 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4217 mdd.${MDT_DEV}.lfsck_namespace |
4218 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4220 error "(3) unexpected status"
# NOTE(review): the `|| {` group opened above is never closed in this listing.
# Exactly one repair must be counted in striped_shards_repaired.
4223 local repaired=$($SHOW_NAMESPACE |
4224 awk '/^striped_shards_repaired/ { print $2 }')
4225 [ $repaired -eq 1 ] ||
4226 error "(4) Fail to repair slave LMV EA: $repaired"
4228 umount_client $MOUNT || error "(5) umount failed"
4229 mount_client $MOUNT || error "(6) mount failed"
# After the remount, create/unlink inside the directory and its removal must
# all succeed.
4231 touch $DIR/$tdir/striped_dir/foo ||
4232 error "(7) Fail to touch file after the LFSCK"
4234 rm -f $DIR/$tdir/striped_dir/foo ||
4235 error "(8) Fail to unlink file after the LFSCK"
4237 rmdir $DIR/$tdir/striped_dir ||
4238 error "(9) Fail to remove the striped directory after LFSCK"
4240 run_test 31g "Repair the corrupted slave LMV EA"
# Test 31h (registered by the run_test line that ends this block): a corrupted
# shard name entry inside a striped directory must be repaired by a namespace
# LFSCK, after which the striped directory has to be usable again.
# Skipped when $MDSCOUNT is below 2.
# NOTE(review): the enclosing function header and closing brace are not part
# of this listing; run_test 31h presumably dispatches to test_31h -- confirm.
4243 [ $MDSCOUNT -lt 2 ] &&
4244 skip "The test needs at least 2 MDTs" && return
4247 echo "For some reason, the shard's name entry in the striped"
4248 echo "directory may be corrupted. The LFSCK should repair the"
4249 echo "bad shard's name entry."
4252 check_mount_and_prep
4254 echo "Inject failure stub on MDT0 to simulate the case that the"
4255 echo "first shard's name entry in the striped directory claims"
4256 echo "the same index as the second shard's name entry claims."
# fail_loc 0x162c with fail_val=0 is armed on $SINGLEMDS only while the striped
# directory is being created, then both values are reset to 0.
4258 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
4259 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
4260 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4261 error "(1) Fail to create striped directory"
4262 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4264 echo "Trigger namespace LFSCK to repair the shard's name entry"
4265 $START_NAMESPACE -r -A ||
4266 error "(2) Fail to start LFSCK for namespace"
# Poll the "status" line of mdd.${MDT_DEV}.lfsck_namespace on $SINGLEMDS until
# it reads "completed" (32 is presumably a timeout in seconds -- confirm).
4268 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4269 mdd.${MDT_DEV}.lfsck_namespace |
4270 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4272 error "(3) unexpected status"
# NOTE(review): the `|| {` group opened above is never closed in this listing.
# Exactly one repair must be counted in dirent_repaired.
4275 local repaired=$($SHOW_NAMESPACE |
4276 awk '/^dirent_repaired/ { print $2 }')
4277 [ $repaired -eq 1 ] ||
4278 error "(4) Fail to repair shard's name entry: $repaired"
4280 umount_client $MOUNT || error "(5) umount failed"
4281 mount_client $MOUNT || error "(6) mount failed"
# After the remount, create/unlink inside the directory and its removal must
# all succeed.
4283 touch $DIR/$tdir/striped_dir/foo ||
4284 error "(7) Fail to touch file after the LFSCK"
4286 rm -f $DIR/$tdir/striped_dir/foo ||
4287 error "(8) Fail to unlink file after the LFSCK"
4289 rmdir $DIR/$tdir/striped_dir ||
4290 error "(9) Fail to remove the striped directory after LFSCK"
4292 run_test 31h "Repair the corrupted shard's name entry"
# Test 32 (registered by the run_test line that ends this block): a layout
# LFSCK must still be stoppable after an OST has gone away.
# NOTE(review): the function header, closing brace and whatever precedes the
# umount are not part of this listing (error numbering starts at "(2)");
# run_test 32 presumably dispatches to test_32 -- confirm.
# The umount result is not checked here.
4297 umount_client $MOUNT
# fail_val=3 / fail_loc=0x162d are set on $SINGLEMDS before the layout LFSCK is
# started with -r.
4299 #define OBD_FAIL_LFSCK_ASSISTANT_DIRECT 0x162d
4300 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
4301 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
# The "status" line of $SHOW_LAYOUT output must read scanning-phase1 at this
# point, i.e. the scan must not have finished yet.
4303 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
4304 [ "$STATUS" == "scanning-phase1" ] ||
4305 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# ost1 is stopped while the LFSCK is in that state; the fault injection is
# cleared only afterwards.
4308 stop ost1 > /dev/null || error "(4) Fail to stop OST1!"
4310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The actual assertion of this test: stopping the LFSCK must succeed.
4314 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
4316 run_test 32 "stop LFSCK when some OST failed"
# Put back the MDSSIZE/OSTSIZE/OSTCOUNT values that were stashed in the
# SAVED_* variables near the top of this file, where OSTCOUNT is capped at 4
# and (per the comments there) small MDS/OST sizes are used for the tests.
4318 # restore MDS/OST size
4319 MDSSIZE=${SAVED_MDSSIZE}
4320 OSTSIZE=${SAVED_OSTSIZE}
4321 OSTCOUNT=${SAVED_OSTCOUNT}
4323 # finally, clean up the system