3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
# Suite setup: seed the exclusion list, load the test framework and the
# configuration file, then exclude tests the running servers cannot support.
# NOTE(review): this excerpt does not show every original line, so a few
# statements below continue onto lines that are not visible here.
10 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
11 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
12 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# Default LUSTRE to the parent of this script's directory, then source the
# framework and the file named by $CONFIG (defaulting to cfg/$NAME.sh).
14 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
15 . $LUSTRE/tests/test-framework.sh
17 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Leave with status 0 when require_dsh_mds fails.
20 require_dsh_mds || exit 0
# Save the incoming values before they are overridden (OSTCOUNT is capped at 4
# below; any MDSSIZE/OSTSIZE override is not visible in this excerpt).
24 SAVED_MDSSIZE=${MDSSIZE}
25 SAVED_OSTSIZE=${OSTSIZE}
26 SAVED_OSTCOUNT=${OSTCOUNT}
27 # use small MDS + OST size to speed formatting time
28 # do not use too small MDSSIZE/OSTSIZE, which affect the default journal size
31 # no need too many OSTs, to reduce the format/start/stop overhead
32 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
34 # build up a clean test environment.
# Skip when the MDS is older than 2.3.60.
# NOTE(review): the chain below ends in a trailing `&&`; the command that
# completes it is not visible in this excerpt.
38 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
39 skip "Need MDS version at least 2.3.60" && check_and_cleanup_lustre &&
# Exclude test 2c when the MDS version is <= 2.4.90.
42 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
43 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
# Exclude tests 11-21 when ost1 is older than 2.5.55.
45 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
46 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
# Exclude tests 2d, 2e, 3 and 22-31 when the MDS is older than 2.6.50.
48 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
49 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
51 # DNE does not support striped directory on zfs-based backend yet.
52 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
53 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Device names and pre-built command strings shared by the tests below.
# The START_*/STOP_*/SHOW_* strings are later expanded unquoted as commands
# (e.g. `$START_NAMESPACE -r`), so they rely on shell word splitting.
57 MDT_DEV="${FSNAME}-MDT0000"
58 OST_DEV="${FSNAME}-OST0000"
# Device argument handed to `start $SINGLEMDS ...` when tests restart the MDS.
59 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Start an LFSCK of the given type (-t namespace | layout) on the MDT / OST.
60 START_NAMESPACE="do_facet $SINGLEMDS \
61 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
62 START_LAYOUT="do_facet $SINGLEMDS \
63 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
64 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
65 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Dump the lfsck_namespace / lfsck_layout parameter; tests pipe this through
# awk to extract single fields such as `status` or `flags`.
66 SHOW_NAMESPACE="do_facet $SINGLEMDS \
67 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
68 SHOW_LAYOUT="do_facet $SINGLEMDS \
69 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
70 SHOW_LAYOUT_ON_OST="do_facet ost1 \
71 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# MDS mount options; the NOSCRUB variant differs only by the extra `noscrub`.
72 MOUNT_OPTS_SCRUB="-o user_xattr"
73 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
# Fragment of a helper that populates $DIR/$tdir with test files. Its header
# and several closing lines (the matching fi/done) are not visible in this
# excerpt; $ndirs, $nfiles and $igif are set on lines that are not shown.
82 echo "preparing... $nfiles * $ndirs files will be created $(date)."
# NOTE(review): $igif is unquoted. When it is empty the test collapses to
# `[ ! -z ]`, which evaluates false and so happens to give the intended
# result; `[ -n "$igif" ]` would state the intent directly.
83 if [ ! -z $igif ]; then
84 #define OBD_FAIL_FID_IGIF 0x1504
85 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
# Copy the test scripts in, then create $ndirs directories d<N> and $ndirs
# files f<N>, and $nfiles files inside each d<N>.
88 cp $LUSTRE/tests/*.sh $DIR/$tdir/
89 if [ $ndirs -gt 0 ]; then
90 createmany -d $DIR/$tdir/d $ndirs
91 createmany -m $DIR/$tdir/f $ndirs
92 if [ $nfiles -gt 0 ]; then
93 for ((i = 0; i < $ndirs; i++)); do
94 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
95 /dev/null || error "createmany $nfiles"
98 createmany -d $DIR/$tdir/e $ndirs
# In the $igif case, create `dummy` and then clear fail_loc again.
101 if [ ! -z $igif ]; then
102 touch $DIR/$tdir/dummy
103 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
106 echo "prepared $(date)."
# Visible body of the test registered below as test 0 ("Control LFSCK
# manually"); the function header and some lines are not shown here.
# Start a namespace LFSCK with reset (-r) while fail_loc 0x1600 / fail_val=3
# is set, and expect to observe it in scanning-phase1.
112 #define OBD_FAIL_LFSCK_DELAY1 0x1600
113 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
114 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
116 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
# NOTE(review): `local VAR=$(...)` discards the pipeline's exit status, so a
# failing get_param only shows up as an empty/unexpected value below.
118 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
119 [ "$STATUS" == "scanning-phase1" ] ||
120 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
# Stop it and expect status 'stopped'.
122 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
124 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
125 [ "$STATUS" == "stopped" ] ||
126 error "(6) Expect 'stopped', but got '$STATUS'"
# Start again, this time without -r, and expect scanning-phase1 once more.
128 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
130 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
131 [ "$STATUS" == "scanning-phase1" ] ||
132 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail_loc/fail_val and wait for the status to become 'completed'.
134 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
135 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
136 mdd.${MDT_DEV}.lfsck_namespace |
137 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
139 error "(9) unexpected status"
# Nothing was corrupted in this test, so updated_phase1 must be 0.
142 local repaired=$($SHOW_NAMESPACE |
143 awk '/^updated_phase1/ { print $2 }')
144 [ $repaired -eq 0 ] ||
145 error "(10) Expect nothing to be repaired, but got: $repaired"
# A second complete run (started with -r) must raise success_count by one.
147 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
148 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
149 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
150 mdd.${MDT_DEV}.lfsck_namespace |
151 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
153 error "(12) unexpected status"
156 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
157 [ $((scanned1 + 1)) -eq $scanned2 ] ||
158 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
160 echo "stopall, should NOT crash LU-3649"
161 stopall || error "(14) Fail to stopall"
163 run_test 0 "Control LFSCK manually"
# Visible body of the test registered below as test 1a (repair of a crashed
# FID-in-dirent); the function header and some lines are not shown here.
# Only runs when the MDS backend is ldiskfs.
166 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
167 skip "OI Scrub not implemented for ZFS" && return
# Create `dummy` while fail_loc 0x1501 is set, then clear the fail_loc.
171 #define OBD_FAIL_FID_INDIR 0x1501
172 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
173 touch $DIR/$tdir/dummy
175 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a namespace LFSCK with reset and wait for it to complete.
177 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
178 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
179 mdd.${MDT_DEV}.lfsck_namespace |
180 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
182 error "(4) unexpected status"
# Exactly one repair is expected; fall back to the updated_phase1 counter
# when dirent_repaired is absent from the output.
185 local repaired=$($SHOW_NAMESPACE |
186 awk '/^dirent_repaired/ { print $2 }')
187 # for interop with old server
188 [ -z "$repaired" ] &&
189 repaired=$($SHOW_NAMESPACE |
190 awk '/^updated_phase1/ { print $2 }')
192 [ $repaired -eq 1 ] ||
193 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
# Mount the client and list the directory with fail_loc 0x1505 set; the
# listing must still succeed.
195 mount_client $MOUNT || error "(6) Fail to start client!"
197 #define OBD_FAIL_FID_LOOKUP 0x1505
198 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
199 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
201 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
203 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Visible body of the test registered below as test 1b (repair of a missing
# FID-in-LMA); the function header and some lines are not shown here.
# Only runs when the MDS backend is ldiskfs.
207 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
208 skip "OI Scrub not implemented for ZFS" && return
# Create `dummy` while fail_loc 0x1502 is set, then clear the fail_loc.
212 #define OBD_FAIL_FID_INLMA 0x1502
213 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
214 touch $DIR/$tdir/dummy
216 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run the namespace LFSCK with fail_loc 0x1506 set and wait for completion.
218 #define OBD_FAIL_FID_NOLMA 0x1506
219 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
220 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
221 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
222 mdd.${MDT_DEV}.lfsck_namespace |
223 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
225 error "(4) unexpected status"
# Exactly one repair is expected; fall back to the updated_phase1 counter
# when dirent_repaired is absent from the output.
228 local repaired=$($SHOW_NAMESPACE |
229 awk '/^dirent_repaired/ { print $2 }')
230 # for interop with old server
231 [ -z "$repaired" ] &&
232 repaired=$($SHOW_NAMESPACE |
233 awk '/^updated_phase1/ { print $2 }')
235 [ $repaired -eq 1 ] ||
236 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
# Clear the fail_loc, mount the client, and stat `dummy` with fail_loc 0x1505
# set; the stat must still succeed.
238 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
239 mount_client $MOUNT || error "(6) Fail to start client!"
241 #define OBD_FAIL_FID_LOOKUP 0x1505
242 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
243 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
245 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
247 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Visible body of the test registered below as test 2a (repair of a crashed
# linkEA entry); the function header and some lines are not shown here.
# Create `dummy` while fail_loc 0x1603 is set, then clear the fail_loc.
252 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
254 touch $DIR/$tdir/dummy
256 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a namespace LFSCK with reset and wait for it to complete.
258 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
259 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
260 mdd.${MDT_DEV}.lfsck_namespace |
261 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
263 error "(4) unexpected status"
# Exactly one repair is expected; fall back to the updated_phase2 counter
# when linkea_repaired is absent from the output.
266 local repaired=$($SHOW_NAMESPACE |
267 awk '/^linkea_repaired/ { print $2 }')
268 # for interop with old server
269 [ -z "$repaired" ] &&
270 repaired=$($SHOW_NAMESPACE |
271 awk '/^updated_phase2/ { print $2 }')
273 [ $repaired -eq 1 ] ||
274 error "(5) Fail to repair crashed linkEA: $repaired"
# From the client: `dummy` must report a link count of 1, and its FID must
# map back to the same path via fid2path.
276 mount_client $MOUNT || error "(6) Fail to start client!"
278 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
279 error "(7) Fail to stat $DIR/$tdir/dummy"
281 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
282 local dummyname=$($LFS fid2path $DIR $dummyfid)
283 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
284 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
286 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Visible body of the test registered below as test 2b (removal of an invalid
# linkEA entry); the function header and some lines are not shown here.
# Create `dummy` while fail_loc 0x1604 is set, then clear the fail_loc.
292 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
293 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
294 touch $DIR/$tdir/dummy
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a namespace LFSCK with reset and wait for it to complete.
298 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
299 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
300 mdd.${MDT_DEV}.lfsck_namespace |
301 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
303 error "(4) unexpected status"
# The updated_phase2 counter must report exactly one repair.
306 local repaired=$($SHOW_NAMESPACE |
307 awk '/^updated_phase2/ { print $2 }')
308 [ $repaired -eq 1 ] ||
309 error "(5) Fail to repair crashed linkEA: $repaired"
# From the client: `dummy` must report a link count of 1, and its FID must
# map back to the same path via fid2path.
311 mount_client $MOUNT || error "(6) Fail to start client!"
313 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
314 error "(7) Fail to stat $DIR/$tdir/dummy"
316 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
317 local dummyname=$($LFS fid2path $DIR $dummyfid)
318 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
319 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
321 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Visible body of the test registered below as test 2c (removal of a repeated
# linkEA entry); the function header and some lines are not shown here.
# Create `dummy` while fail_loc 0x1605 is set, then clear the fail_loc.
327 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
328 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
329 touch $DIR/$tdir/dummy
331 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a namespace LFSCK with reset and wait for it to complete.
333 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
334 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
335 mdd.${MDT_DEV}.lfsck_namespace |
336 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
338 error "(4) unexpected status"
# The updated_phase2 counter must report exactly one repair.
341 local repaired=$($SHOW_NAMESPACE |
342 awk '/^updated_phase2/ { print $2 }')
343 [ $repaired -eq 1 ] ||
344 error "(5) Fail to repair crashed linkEA: $repaired"
# From the client: `dummy` must report a link count of 1, and its FID must
# map back to the same path via fid2path.
346 mount_client $MOUNT || error "(6) Fail to start client!"
348 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
349 error "(7) Fail to stat $DIR/$tdir/dummy"
351 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
352 local dummyname=$($LFS fid2path $DIR $dummyfid)
353 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
354 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
356 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Visible body of the test registered below as test 2d (recovery of a missing
# linkEA entry); the function header and some lines are not shown here.
# Create `dummy` while fail_loc 0x161d is set, then clear the fail_loc.
362 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
363 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
364 touch $DIR/$tdir/dummy
366 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a namespace LFSCK with reset and wait for it to complete.
368 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
369 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
370 mdd.${MDT_DEV}.lfsck_namespace |
371 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
373 error "(4) unexpected status"
# The linkea_repaired counter must report exactly one repair (no fallback to
# an older counter name here, unlike test 2a).
376 local repaired=$($SHOW_NAMESPACE |
377 awk '/^linkea_repaired/ { print $2 }')
378 [ $repaired -eq 1 ] ||
379 error "(5) Fail to repair crashed linkEA: $repaired"
# From the client: `dummy` must report a link count of 1, and its FID must
# map back to the same path via fid2path.
381 mount_client $MOUNT || error "(6) Fail to start client!"
383 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
384 error "(7) Fail to stat $DIR/$tdir/dummy"
386 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
387 local dummyname=$($LFS fid2path $DIR $dummyfid)
388 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
389 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
391 run_test 2d "LFSCK can recover the missing linkEA entry"
# Visible body of the test registered below as test 2e (linkEA verification
# for a remote object); the function header and some lines are not shown here.
# Requires at least two MDSes.
395 [ $MDSCOUNT -lt 2 ] &&
396 skip "We need at least 2 MDSes for this test" && return
# Create d0 with `-i 1` and, while fail_loc 0x1603 is set, d0/d1 with `-i 0`.
400 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
402 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
403 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
404 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
405 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run the namespace LFSCK with `-r -A` and wait for it to complete.
407 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
408 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
409 mdd.${MDT_DEV}.lfsck_namespace |
410 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
412 error "(4) unexpected status"
# Exactly one linkEA repair is expected, and d1's FID must map back to its
# path via fid2path.
415 local repaired=$($SHOW_NAMESPACE |
416 awk '/^linkea_repaired/ { print $2 }')
417 [ $repaired -eq 1 ] ||
418 error "(5) Fail to repair crashed linkEA: $repaired"
420 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
421 local name=$($LFS fid2path $DIR $fid)
422 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
423 error "(6) Fail to repair linkEA: $fid $name"
425 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Visible body of the test registered below as test 3 (verification of
# multiple-linked objects); the function header and some lines are not shown.
# Hard-link two existing files (d0/f0, d0/f1) into a new `dummy` directory.
431 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
432 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
433 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
# Create edir with two files, then hard-link each of them under a different
# fail_loc (0x1603 for f0 -> w0, 0x1604 for f1 -> w1).
435 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
436 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
437 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
439 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
440 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
441 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
443 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
444 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
445 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
447 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a namespace LFSCK with reset and wait for it to complete.
449 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
450 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
451 mdd.${MDT_DEV}.lfsck_namespace |
452 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
454 error "(10) unexpected status"
# Expect at least 4 objects checked in phase 2 and at least 2 repaired.
457 local checked=$($SHOW_NAMESPACE |
458 awk '/^checked_phase2/ { print $2 }')
459 [ $checked -ge 4 ] ||
460 error "(11) Fail to check multiple-linked object: $checked"
462 local repaired=$($SHOW_NAMESPACE |
463 awk '/^multiple_linked_repaired/ { print $2 }')
464 [ $repaired -ge 2 ] ||
465 error "(12) Fail to repair multiple-linked object: $repaired"
467 run_test 3 "LFSCK can verify multiple-linked objects"
# Visible body of the test registered below as test 4 (rebuilding
# FID-in-dirent after an MDT file-level backup/restore); the function header
# and some lines are not shown here. Only runs on an ldiskfs MDS.
471 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
472 skip "OI Scrub not implemented for ZFS" && return
# Stop the client and MDS, run mds_backup_restore, then restart the MDS with
# the `noscrub` mount options.
475 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
476 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
478 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
479 echo "start $SINGLEMDS with disabling OI scrub"
480 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
481 error "(2) Fail to start MDS!"
# Start the namespace LFSCK with fail_loc 0x1601 / fail_val=1 set and wait
# until its `flags` field reads 'inconsistent' while still in phase 1.
483 #define OBD_FAIL_LFSCK_DELAY2 0x1601
484 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
485 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
486 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
487 mdd.${MDT_DEV}.lfsck_namespace |
488 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
490 error "(5) unexpected status"
493 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
494 [ "$STATUS" == "scanning-phase1" ] ||
495 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail_loc/fail_val and wait for completion.
497 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
498 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
499 mdd.${MDT_DEV}.lfsck_namespace |
500 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
502 error "(7) unexpected status"
# After completion the flags field must be empty.
# NOTE(review): FLAGS is assigned without a visible `local` here, whereas the
# body of test 8 declares `local FLAGS`; confirm against the full file.
505 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
506 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# At least 9 repairs are expected; fall back to the updated_phase1 counter
# when dirent_repaired is absent from the output.
508 local repaired=$($SHOW_NAMESPACE |
509 awk '/^dirent_repaired/ { print $2 }')
510 # for interop with old server
511 [ -z "$repaired" ] &&
512 repaired=$($SHOW_NAMESPACE |
513 awk '/^updated_phase1/ { print $2 }')
515 [ $repaired -ge 9 ] ||
516 error "(9) Fail to re-generate FID-in-dirent: $repaired"
# Mount the client and list the directory with fail_loc 0x1505 set.
518 mount_client $MOUNT || error "(10) Fail to start client!"
520 #define OBD_FAIL_FID_LOOKUP 0x1505
521 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
522 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
523 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
525 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Visible body of the test registered below as test 5 (IGIF object
# upgrading); the function header and some lines are not shown here.
# Only runs on an ldiskfs MDS.
529 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
530 skip "OI Scrub not implemented for ZFS" && return
# Stop the client and MDS, run mds_backup_restore with an extra `1` argument,
# then restart the MDS with the `noscrub` mount options.
533 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
534 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
536 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
537 echo "start $SINGLEMDS with disabling OI scrub"
538 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
539 error "(2) Fail to start MDS!"
# Start the namespace LFSCK with fail_loc 0x1601 / fail_val=1 set and wait
# until its `flags` field reads 'inconsistent,upgrade' while in phase 1.
541 #define OBD_FAIL_LFSCK_DELAY2 0x1601
542 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
543 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
544 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
545 mdd.${MDT_DEV}.lfsck_namespace |
546 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
548 error "(5) unexpected status"
551 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
552 [ "$STATUS" == "scanning-phase1" ] ||
553 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail_loc/fail_val and wait for completion.
555 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
556 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
557 mdd.${MDT_DEV}.lfsck_namespace |
558 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
560 error "(7) unexpected status"
# After completion the flags field must be empty.
# NOTE(review): FLAGS is assigned without a visible `local` here, whereas the
# body of test 8 declares `local FLAGS`; confirm against the full file.
563 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
564 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# At least 2 repairs are expected; fall back to the updated_phase1 counter
# when dirent_repaired is absent from the output.
566 local repaired=$($SHOW_NAMESPACE |
567 awk '/^dirent_repaired/ { print $2 }')
568 # for interop with old server
569 [ -z "$repaired" ] &&
570 repaired=$($SHOW_NAMESPACE |
571 awk '/^updated_phase1/ { print $2 }')
573 [ $repaired -ge 2 ] ||
574 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
# Mount the client; with fail_loc 0x1505 set, both stat of `dummy` and a
# directory listing must succeed.
576 mount_client $MOUNT || error "(10) Fail to start client!"
578 #define OBD_FAIL_FID_LOOKUP 0x1505
579 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
580 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
582 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
# With the fail_loc cleared, dummy's FID must map back to its path.
584 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
585 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
586 local dummyname=$($LFS fid2path $DIR $dummyfid)
587 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
588 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
590 run_test 5 "LFSCK can handle IGIF object upgrading"
# Visible body of the test registered below as test 6a (resume from the last
# checkpoint, case 1); the function header and some lines are not shown here.
# Start the namespace LFSCK with fail_loc 0x1600 / fail_val=1 set and expect
# scanning-phase1.
595 #define OBD_FAIL_LFSCK_DELAY1 0x1600
596 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
597 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
599 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
600 [ "$STATUS" == "scanning-phase1" ] ||
601 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# NOTE(review): the sleep this comment refers to is not visible here.
603 # Sleep 3 sec to guarantee at least one object processed by LFSCK
605 # Fail the LFSCK to guarantee there is at least one checkpoint
606 #define OBD_FAIL_LFSCK_FATAL1 0x1608
607 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
608 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
609 mdd.${MDT_DEV}.lfsck_namespace |
610 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
612 error "(4) unexpected status"
# Record the checkpoint position after the failure.
# NOTE(review): this pipeline ends in `|`; its final stage is not visible.
615 local POS0=$($SHOW_NAMESPACE |
616 awk '/^last_checkpoint_position/ { print $2 }' |
619 #define OBD_FAIL_LFSCK_DELAY1 0x1600
# Restart without -r and expect scanning-phase1 again.
620 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
621 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
623 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
624 [ "$STATUS" == "scanning-phase1" ] ||
625 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# The new start position must be strictly greater than the saved checkpoint.
# NOTE(review): this pipeline also ends in `|` with its tail not visible.
627 local POS1=$($SHOW_NAMESPACE |
628 awk '/^latest_start_position/ { print $2 }' |
630 [[ $POS0 -lt $POS1 ]] ||
631 error "(7) Expect larger than: $POS0, but got $POS1"
# Clear the fail_loc/fail_val and wait for completion.
633 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
634 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
635 mdd.${MDT_DEV}.lfsck_namespace |
636 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
638 error "(8) unexpected status"
641 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Visible body of the test registered below as test 6b (resume from the last
# checkpoint, case 2); the function header and some lines are not shown here.
# Start the namespace LFSCK with fail_loc 0x1601 / fail_val=1 set and expect
# scanning-phase1.
646 #define OBD_FAIL_LFSCK_DELAY2 0x1601
647 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
648 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
650 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
651 [ "$STATUS" == "scanning-phase1" ] ||
652 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# NOTE(review): the sleep this comment refers to is not visible here.
654 # Sleep 5 sec to guarantee that we are in the directory scanning
656 # Fail the LFSCK to guarantee there is at least one checkpoint
657 #define OBD_FAIL_LFSCK_FATAL2 0x1609
658 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
659 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
660 mdd.${MDT_DEV}.lfsck_namespace |
661 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
663 error "(4) unexpected status"
# Record two fields of last_checkpoint_position: field 2 (O_POS0) and field 4
# (D_POS0). NOTE(review): the O_POS0 pipeline ends in `|`; its tail is not
# visible in this excerpt.
666 local O_POS0=$($SHOW_NAMESPACE |
667 awk '/^last_checkpoint_position/ { print $2 }' |
670 local D_POS0=$($SHOW_NAMESPACE |
671 awk '/^last_checkpoint_position/ { print $4 }')
# Restart without -r and expect scanning-phase1 again.
673 #define OBD_FAIL_LFSCK_DELAY2 0x1601
674 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
675 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
677 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
678 [ "$STATUS" == "scanning-phase1" ] ||
679 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Same two fields, read from latest_start_position after the restart.
681 local O_POS1=$($SHOW_NAMESPACE |
682 awk '/^latest_start_position/ { print $2 }' |
684 local D_POS1=$($SHOW_NAMESPACE |
685 awk '/^latest_start_position/ { print $4 }')
# When either D_POS value is "N/A", compare the O_POS values (7.1); the D_POS
# comparison (7.2) is presumably the else branch, whose `else`/`fi` lines are
# not visible here.
# NOTE(review): `-o` inside `[ ]` is marked obsolescent by POSIX; two tests
# joined with `||` are the unambiguous form.
687 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
688 [[ $O_POS0 -lt $O_POS1 ]] ||
689 error "(7.1) $O_POS1 is not larger than $O_POS0"
691 [[ $D_POS0 -lt $D_POS1 ]] ||
692 error "(7.2) $D_POS1 is not larger than $D_POS0"
# Clear the fail_loc/fail_val and wait for completion.
695 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
696 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
697 mdd.${MDT_DEV}.lfsck_namespace |
698 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
700 error "(8) unexpected status"
703 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Visible body of the test registered below as test 7a (LFSCK auto-restart
# after an MDS remount, case 1); the function header and some lines are not
# shown here.
# Start the namespace LFSCK with fail_loc 0x1601 / fail_val=1 set and expect
# scanning-phase1.
710 #define OBD_FAIL_LFSCK_DELAY2 0x1601
711 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
712 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
714 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
715 [ "$STATUS" == "scanning-phase1" ] ||
716 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# NOTE(review): the sleep this comment refers to is not visible here.
718 # Sleep 3 sec to guarantee at least one object processed by LFSCK
# Stop the MDS while the scan is in progress, clear the fail_loc/fail_val,
# and start the MDS again; no lfsck_start is issued after the restart.
720 echo "stop $SINGLEMDS"
721 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
723 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
724 echo "start $SINGLEMDS"
725 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
726 error "(5) Fail to start MDS!"
# The status is nevertheless expected to reach 'completed'.
728 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
729 mdd.${MDT_DEV}.lfsck_namespace |
730 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
732 error "(6) unexpected status"
735 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Visible body of the test registered below as test 7b (LFSCK auto-restart
# after an MDS remount, case 2); the function header and some lines (e.g. the
# loop's `done`) are not shown here.
# Create dummy0..dummy19 while fail_loc 0x1604 is set.
741 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
742 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
743 for ((i = 0; i < 20; i++)); do
744 touch $DIR/$tdir/dummy${i}
# Start the namespace LFSCK with fail_loc 0x1602 / fail_val=1 set and wait
# until it reports scanning-phase2.
747 #define OBD_FAIL_LFSCK_DELAY3 0x1602
748 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
749 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
750 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
751 mdd.${MDT_DEV}.lfsck_namespace |
752 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
754 error "(4) unexpected status"
# Stop the MDS during phase 2, clear the fail_loc/fail_val, and start the MDS
# again; no lfsck_start is issued after the restart.
758 echo "stop $SINGLEMDS"
759 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
761 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
762 echo "start $SINGLEMDS"
763 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
764 error "(6) Fail to start MDS!"
# The status is nevertheless expected to reach 'completed'.
766 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
767 mdd.${MDT_DEV}.lfsck_namespace |
768 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
770 error "(7) unexpected status"
773 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Visible body of the test registered below as test 8 ("LFSCK state
# machine"); the function header and a number of lines (setup after
# formatall, loop bodies, closing braces) are not shown in this excerpt.
# After a fresh format the namespace LFSCK status must be 'init'.
778 formatall > /dev/null
784 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
785 [ "$STATUS" == "init" ] ||
786 error "(2) Expect 'init', but got '$STATUS'"
# Create one directory under fail_loc 0x1603 and dummy0..dummy4 under
# fail_loc 0x1604, then unmount the client.
788 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
789 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
790 mkdir $DIR/$tdir/crashed
792 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
793 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
794 for ((i = 0; i < 5; i++)); do
795 touch $DIR/$tdir/dummy${i}
798 umount_client $MOUNT || error "(3) Fail to stop client!"
# init -> scanning-phase1: start with fail_loc 0x1601 / fail_val=2 set.
800 #define OBD_FAIL_LFSCK_DELAY2 0x1601
801 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
802 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
804 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
805 [ "$STATUS" == "scanning-phase1" ] ||
806 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
# scanning-phase1 -> stopped via lfsck_stop.
808 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
810 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
811 [ "$STATUS" == "stopped" ] ||
812 error "(7) Expect 'stopped', but got '$STATUS'"
# stopped -> scanning-phase1 by starting again.
814 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
816 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
817 [ "$STATUS" == "scanning-phase1" ] ||
818 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
# scanning-phase1 -> failed: set fail_loc 0x80001609 and wait for 'failed'.
820 #define OBD_FAIL_LFSCK_FATAL2 0x1609
821 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
822 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
823 mdd.${MDT_DEV}.lfsck_namespace |
824 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
826 error "(10) unexpected status"
# failed -> scanning-phase1: start again with fail_loc 0x1600 set.
829 #define OBD_FAIL_LFSCK_DELAY1 0x1600
830 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
831 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
833 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
834 [ "$STATUS" == "scanning-phase1" ] ||
835 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
# scanning-phase1 -> crashed: set fail_loc 0x160a, stop the MDS, set fail_loc
# 0x160b, and start the MDS again.
837 #define OBD_FAIL_LFSCK_CRASH 0x160a
838 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
841 echo "stop $SINGLEMDS"
842 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
844 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
845 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
847 echo "start $SINGLEMDS"
848 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
849 error "(14) Fail to start MDS!"
# Poll mdt recovery_status until it is no longer RECOVERING or $timeout is
# reached. NOTE(review): $timer is initialised and advanced, and the loop is
# closed, on lines that are not visible in this excerpt.
851 local timeout=$(max_recovery_time)
854 while [ $timer -lt $timeout ]; do
855 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
856 mdt.${MDT_DEV}.recovery_status |
857 awk '/^status/ { print \\\$2 }'")
858 [ "$STATUS" != "RECOVERING" ] && break;
863 [ $timer != $timeout ] ||
864 error "(14.1) recovery timeout"
866 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
867 [ "$STATUS" == "crashed" ] ||
868 error "(15) Expect 'crashed', but got '$STATUS'"
# crashed -> scanning-phase1: start again with fail_loc 0x1601 set.
870 #define OBD_FAIL_LFSCK_DELAY2 0x1601
871 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
872 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
874 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
875 [ "$STATUS" == "scanning-phase1" ] ||
876 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
# scanning-phase1 -> paused: stop the MDS, set fail_loc 0x160b, start the MDS
# again, and wait out recovery as above.
878 echo "stop $SINGLEMDS"
879 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
881 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
882 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
884 echo "start $SINGLEMDS"
885 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
886 error "(19) Fail to start MDS!"
889 while [ $timer -lt $timeout ]; do
890 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
891 mdt.${MDT_DEV}.recovery_status |
892 awk '/^status/ { print \\\$2 }'")
893 [ "$STATUS" != "RECOVERING" ] && break;
898 [ $timer != $timeout ] ||
899 error "(19.1) recovery timeout"
901 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
902 [ "$STATUS" == "paused" ] ||
903 error "(20) Expect 'paused', but got '$STATUS'"
# paused -> scanning-phase2: start with fail_loc 0x1602 / fail_val=2 set and
# check that flags read 'scanned-once,inconsistent'.
905 #define OBD_FAIL_LFSCK_DELAY3 0x1602
906 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
908 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
909 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
910 mdd.${MDT_DEV}.lfsck_namespace |
911 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
913 error "(22) unexpected status"
916 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
917 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
918 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
# scanning-phase2 -> completed: clear the fail_loc/fail_val, wait, and check
# that flags are empty afterwards.
920 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
921 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
922 mdd.${MDT_DEV}.lfsck_namespace |
923 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
925 error "(24) unexpected status"
928 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
929 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
931 run_test 8 "LFSCK state machine"
# Visible body of the test registered below as test 9a ("LFSCK speed control
# (1)"); the function header and several lines are not shown here.
# RUN_TIME1, RUN_TIME2 and TIME_DIFF are set on lines that are not visible.
# Skip when /proc/cpuinfo has no line matching "processor.*: 1".
934 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
935 skip "Testing on UP system, the speed may be inaccurate."
# Start the namespace LFSCK with `-s 100` and expect scanning-phase1.
941 local BASE_SPEED1=100
943 $START_NAMESPACE -r -s $BASE_SPEED1 || error "(3) Fail to start LFSCK!"
# NOTE(review): STATUS has no visible `local` in this body; confirm against
# the full file.
946 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
947 [ "$STATUS" == "scanning-phase1" ] ||
948 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
950 local SPEED=$($SHOW_NAMESPACE |
951 awk '/^average_speed_phase1/ { print $2 }')
953 # There may be time error, normally it should be less than 2 seconds.
954 # We allow another 20% schedule error.
# Upper bound: BASE_SPEED1 scaled by (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 and
# then by 12/10, in integer arithmetic evaluated left to right.
956 # MAX_MARGIN = 1.2 = 12 / 10
957 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
958 RUN_TIME1 * 12 / 10))
959 [ $SPEED -lt $MAX_SPEED ] ||
960 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
# Raise lfsck_speed_limit to 300 while the scan is running and re-read the
# average speed.
963 local BASE_SPEED2=300
965 do_facet $SINGLEMDS \
966 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
969 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase1/ { print $2 }')
# Lower bound: time-weighted mean of the two limits (each run time shortened
# by TIME_DIFF) scaled by 8/10.
970 # MIN_MARGIN = 0.8 = 8 / 10
971 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
972 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
973 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# A too-low speed is reported through error_ignore (LU-5624) on a non-ldiskfs
# MDS; the (5.2) message presumably belongs to an else branch whose
# surrounding lines are not visible here.
974 [ $SPEED -gt $MIN_SPEED ] || {
975 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
976 error_ignore LU-5624 \
977 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
980 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
# Upper bound for the combined run: same weighted mean with each run time
# lengthened by TIME_DIFF, scaled by 12/10.
984 # MAX_MARGIN = 1.2 = 12 / 10
985 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
986 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
987 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
988 [ $SPEED -lt $MAX_SPEED ] ||
989 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
# Set lfsck_speed_limit to 0 and wait for the scan to complete.
991 do_facet $SINGLEMDS \
992 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
994 wait_update_facet $SINGLEMDS \
995 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace|\
996 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
997 error "(7) Failed to get expected 'completed'"
999 run_test 9a "LFSCK speed control (1)"
# Visible body of the test registered below as test 9b ("LFSCK speed control
# (2)"); the function header and several lines are not shown here.
# RUN_TIME1, RUN_TIME2 and TIME_DIFF are set on lines that are not visible.
# Skip when /proc/cpuinfo has no line matching "processor.*: 1".
1002 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1003 skip "Testing on UP system, the speed may be inaccurate."
# Create 50 directories, 50 files, and 50 files per directory while fail_loc
# 0x1604 is set.
1009 echo "Preparing another 50 * 50 files (with error) at $(date)."
1010 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1011 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1012 createmany -d $DIR/$tdir/d 50
1013 createmany -m $DIR/$tdir/f 50
1014 for ((i = 0; i < 50; i++)); do
1015 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# Run the namespace LFSCK with fail_loc 0x160c set and wait for the status
# 'stopped', then clear the fail_loc.
1018 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1019 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1020 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1021 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1022 mdd.${MDT_DEV}.lfsck_namespace |
1023 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1025 error "(5) unexpected status"
1028 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1029 echo "Prepared at $(date)."
# Start again (no -r) with `-s 50`; the scan is expected to be in
# scanning-phase2.
1031 local BASE_SPEED1=50
1033 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
# NOTE(review): STATUS has no visible `local` in this body; confirm against
# the full file.
1036 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1037 [ "$STATUS" == "scanning-phase2" ] ||
1038 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1040 local SPEED=$($SHOW_NAMESPACE |
1041 awk '/^average_speed_phase2/ { print $2 }')
1042 # There may be time error, normally it should be less than 2 seconds.
1043 # We allow another 20% schedule error.
# Upper bound: BASE_SPEED1 scaled by (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 and
# then by 12/10, in integer arithmetic evaluated left to right.
1045 # MAX_MARGIN = 1.2 = 12 / 10
1046 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1047 RUN_TIME1 * 12 / 10))
1048 [ $SPEED -lt $MAX_SPEED ] ||
1049 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1051 # adjust speed limit
1052 local BASE_SPEED2=150
1054 do_facet $SINGLEMDS \
1055 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1058 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
# Lower bound: time-weighted mean of the two limits (each run time shortened
# by TIME_DIFF) scaled by 8/10.
1059 # MIN_MARGIN = 0.8 = 8 / 10
1060 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1061 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1062 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# A too-low speed is reported through error_ignore (LU-5624) on a non-ldiskfs
# MDS; the (9.2) message presumably belongs to an else branch whose
# surrounding lines are not visible here.
1063 [ $SPEED -gt $MIN_SPEED ] || {
1064 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1065 error_ignore LU-5624 \
1066 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1069 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
# Upper bound for the combined run: same weighted mean with each run time
# lengthened by TIME_DIFF, scaled by 12/10.
1073 # MAX_MARGIN = 1.2 = 12 / 10
1074 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1075 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1076 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1077 [ $SPEED -lt $MAX_SPEED ] ||
1078 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
# Set lfsck_speed_limit to 0 and wait for the scan to complete.
1080 do_facet $SINGLEMDS \
1081 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1082 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1083 mdd.${MDT_DEV}.lfsck_namespace |
1084 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1086 error "(11) unexpected status"
1089 run_test 9b "LFSCK speed control (2)"
# Body of the test registered below as `run_test 10`
# ("System is available during LFSCK scanning"): runs ordinary namespace
# operations from the client while a speed-limited namespace LFSCK is in
# scanning-phase1, then lets the scan finish.

# Only run when the MDS backend is ldiskfs.
1093 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1094 skip "lookup(..)/linkea on ZFS issue" && return
# Even indices (0, 2, ..., 998) are created with fail_loc 0x1603 set.
1098 echo "Preparing more files with error at $(date)."
1099 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1100 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1102 for ((i = 0; i < 1000; i = $((i+2)))); do
1103 mkdir -p $DIR/$tdir/d${i}
1104 touch $DIR/$tdir/f${i}
1105 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd indices (1, 3, ..., 999) are created with fail_loc 0x1604 set.
1108 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1109 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1111 for ((i = 1; i < 1000; i = $((i+2)))); do
1112 mkdir -p $DIR/$tdir/d${i}
1113 touch $DIR/$tdir/f${i}
1114 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1117 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1118 echo "Prepared at $(date)."
# Give f200 a second name, created after fail_loc was cleared.
1120 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1122 umount_client $MOUNT
1123 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the namespace LFSCK with -r and a speed limit of 100, and verify it
# is in scanning-phase1 before touching the namespace.
1125 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1128 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1129 [ "$STATUS" == "scanning-phase1" ] ||
1130 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Each of the following operations must succeed while the scan is running:
# recursive listing, create, mkdir, unlink, recursive remove, rename,
# hard link and symbolic link.
1132 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1134 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1136 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1138 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1140 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1142 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1144 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1146 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1147 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after the operations above.
1149 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1150 [ "$STATUS" == "scanning-phase1" ] ||
1151 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set the speed limit to 0 and wait for "completed".
1153 do_facet $SINGLEMDS \
1154 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1155 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1156 mdd.${MDT_DEV}.lfsck_namespace |
1157 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1159 error "(16) unexpected status"
1162 run_test 10 "System is available during LFSCK scanning"
# ost_remove_lastid: remove O/${idx}/LAST_ID and O/${idx}/d0/0 from the
# backend file system of ost${ost}, mounting it via mount_fstype first and
# unmounting it via unmount_fstype afterwards.
# Returns 1 if mount_fstype fails and 2 if unmount_fstype fails.
# NOTE(review): the assignments of `ost` and `idx` are not among the lines
# shown here; the visible caller passes "1 0", so they are presumably $1 and
# $2 - confirm.
1165 ost_remove_lastid() {
1168 local rcmd="do_facet ost${ost}"
1170 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1172 # step 1: local mount
1173 mount_fstype ost${ost} || return 1
1174 # step 2: remove the specified LAST_ID
# facet_mntpt and the {LAST_ID,d0/0} brace expansion are evaluated by the
# calling shell; the resulting rm command is run through do_facet.
1175 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1177 unmount_fstype ost${ost} || return 2
# Body of the test registered below as `run_test 11a`
# ("LFSCK can rebuild lost last_id"): removes LAST_ID on ost1, runs the layout
# LFSCK on that OST and expects the lfsck_layout flags to end up empty.

# Create 64 files under a directory set to stripe count 1, stripe index 0.
1181 check_mount_and_prep
1182 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1183 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# NOTE(review): ost1 is started at step (2) below; the step that stops it
# beforehand is not among the lines shown here.
1188 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1190 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1191 error "(2) Fail to start ost1"
# fail_loc 0x160e with fail_val=3 is set on ost1 while the LFSCK is started,
# and cleared again once the "crashed_lastid" flag has been observed.
1193 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1194 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1196 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1197 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1199 wait_update_facet ost1 "$LCTL get_param -n \
1200 obdfilter.${OST_DEV}.lfsck_layout |
1201 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1203 error "(5) unexpected status"
1206 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
# Wait for the layout LFSCK on ost1 to report "completed".
1208 wait_update_facet ost1 "$LCTL get_param -n \
1209 obdfilter.${OST_DEV}.lfsck_layout |
1210 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1212 error "(6) unexpected status"
# After completion the "flags" line must carry no value.
1215 echo "the LAST_ID(s) should have been rebuilt"
1216 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1217 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1219 run_test 11a "LFSCK can rebuild lost last_id"
# Body of the test registered below as `run_test 11b`
# ("LFSCK can rebuild crashed last_id").
# Flow, as far as the visible lines show: create files while fail_loc 0x160d
# is set on ost1, record the last_id value ost1 reports (lastid1), restart
# ost1 and check that the value it reports now (lastid2) is smaller, run the
# layout LFSCK on ost1, restart ost1 once more and expect it to report
# lastid1 again.
1222 check_mount_and_prep
1223 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1225 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1226 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1227 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than precreated_ost_obj_count reports.
# NOTE(review): presumably this forces object creation on ost1 beyond the
# already precreated ones - confirm against the helper.
1229 local count=$(precreated_ost_obj_count 0 0)
1231 createmany -o $DIR/$tdir/f $((count + 32))
# lastid1: the last_id entry on ost1 whose line matches the sequence read
# from osp.*.prealloc_last_seq on mds1; the value is the field after the ':'.
1233 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1234 local seq=$(do_facet mds1 $LCTL get_param -n \
1235 osp.${proc_path}.prealloc_last_seq)
1236 local lastid1=$(do_facet ost1 "lctl get_param -n \
1237 obdfilter.${ost1_svc}.last_id" | grep $seq |
1238 awk -F: '{ print $2 }')
1240 umount_client $MOUNT
1241 stop ost1 || error "(1) Fail to stop ost1"
# fail_loc 0x215 is set on ost1 before it is started again and stays set
# until it is cleared near the end of the block.
1243 #define OBD_FAIL_OST_ENOSPC 0x215
1244 do_facet ost1 $LCTL set_param fail_loc=0x215
1246 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1247 error "(2) Fail to start ost1"
# Poll (at most 60 iterations) until ost1 reports a last_id for $seq again.
1249 for ((i = 0; i < 60; i++)); do
1250 lastid2=$(do_facet ost1 "lctl get_param -n \
1251 obdfilter.${ost1_svc}.last_id" | grep $seq |
1252 awk -F: '{ print $2 }')
# The value is quoted and tested with -n: with an unquoted expansion a
# multi-word result (several last_id lines matching $seq) makes `[` fail with
# a syntax error, so the loop would never leave early.
1253 [ -n "$lastid2" ] && break
1257 echo "the on-disk LAST_ID should be smaller than the expected one"
1258 [ $lastid1 -gt $lastid2 ] ||
1259 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1261 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1262 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1264 wait_update_facet ost1 "$LCTL get_param -n \
1265 obdfilter.${OST_DEV}.lfsck_layout |
1266 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1268 error "(6) unexpected status"
# Restart ost1 so that the value it reports is read back after the LFSCK run.
1271 stop ost1 || error "(7) Fail to stop ost1"
1273 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1274 error "(8) Fail to start ost1"
# Expect last_id for $seq to return to lastid1; on failure the full last_id
# output is printed before the error is raised.
1276 echo "the on-disk LAST_ID should have been rebuilt"
1277 wait_update_facet ost1 "$LCTL get_param -n \
1278 obdfilter.${ost1_svc}.last_id | grep $seq |
1279 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1280 do_facet ost1 $LCTL get_param -n \
1281 obdfilter.${ost1_svc}.last_id
1282 error "(9) expect lastid1 $seq:$lastid1"
1285 do_facet ost1 $LCTL set_param fail_loc=0
1286 stopall || error "(10) Fail to stopall"
1288 run_test 11b "LFSCK can rebuild crashed last_id"
# Body of test_12, registered below via `run_test 12`
# ("single command to trigger LFSCK on all devices"): starts, stops and
# restarts first the namespace LFSCK and then the layout LFSCK with a single
# lctl command issued on mds1 using -A, checking the status reported by every
# MDT (and, for the layout stop, every OST) after each step.
1291 [ $MDSCOUNT -lt 2 ] &&
1292 skip "We need at least 2 MDSes for test_12" && return
# One directory per MDT (index k - 1), each holding 100 files.
1294 check_mount_and_prep
1295 for k in $(seq $MDSCOUNT); do
1296 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1297 createmany -o $DIR/$tdir/${k}/f 100 ||
1298 error "(0) Fail to create 100 files."
# --- namespace LFSCK: start with speed limit 1 ---
1301 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1302 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1303 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1305 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1306 for k in $(seq $MDSCOUNT); do
1307 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1308 mdd.$(facet_svc mds${k}).lfsck_namespace |
1309 awk '/^status/ { print $2 }')
1310 [ "$STATUS" == "scanning-phase1" ] ||
1311 error "(3) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
# --- namespace LFSCK: stop everywhere ---
1314 echo "Stop namespace LFSCK on all targets by single lctl command."
1315 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1316 error "(4) Fail to stop LFSCK on all devices!"
1318 echo "All the LFSCK targets should be in 'stopped' status."
1319 for k in $(seq $MDSCOUNT); do
1320 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1321 mdd.$(facet_svc mds${k}).lfsck_namespace |
1322 awk '/^status/ { print $2 }')
1323 [ "$STATUS" == "stopped" ] ||
1324 error "(5) MDS${k} Expect 'stopped', but got '$STATUS'"
# --- namespace LFSCK: restart with speed limit 0 and wait for completion ---
1327 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1328 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1329 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1331 echo "All the LFSCK targets should be in 'completed' status."
1332 for k in $(seq $MDSCOUNT); do
1333 wait_update_facet mds${k} "$LCTL get_param -n \
1334 mdd.$(facet_svc mds${k}).lfsck_namespace |
1335 awk '/^status/ { print \\\$2 }'" "completed" 8 ||
1336 error "(7) MDS${k} is not the expected 'completed'"
# The layout half of the test runs between start_full_debug_logging and
# stop_full_debug_logging.
1339 start_full_debug_logging
# --- layout LFSCK: start with speed limit 1 ---
1341 echo "Start layout LFSCK on all targets by single command (-s 1)."
1342 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1343 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1345 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1346 for k in $(seq $MDSCOUNT); do
1347 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1348 mdd.$(facet_svc mds${k}).lfsck_layout |
1349 awk '/^status/ { print $2 }')
1350 [ "$STATUS" == "scanning-phase1" ] ||
1351 error "(9) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
# --- layout LFSCK: stop everywhere; both MDTs and OSTs must report stopped ---
1354 echo "Stop layout LFSCK on all targets by single lctl command."
1355 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1356 error "(10) Fail to stop LFSCK on all devices!"
1358 echo "All the LFSCK targets should be in 'stopped' status."
1359 for k in $(seq $MDSCOUNT); do
1360 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1361 mdd.$(facet_svc mds${k}).lfsck_layout |
1362 awk '/^status/ { print $2 }')
1363 [ "$STATUS" == "stopped" ] ||
1364 error "(11) MDS${k} Expect 'stopped', but got '$STATUS'"
1367 for k in $(seq $OSTCOUNT); do
1368 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1369 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1370 awk '/^status/ { print $2 }')
1371 [ "$STATUS" == "stopped" ] ||
1372 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
# --- layout LFSCK: restart with speed limit 0 and wait for completion ---
1375 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1376 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1377 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1379 echo "All the LFSCK targets should be in 'completed' status."
1380 for k in $(seq $MDSCOUNT); do
1381 # The LFSCK status query interval is 30 seconds. In case some
1382 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
1383 # guarantee that the status has been synced.
1384 wait_update_facet mds${k} "$LCTL get_param -n \
1385 mdd.$(facet_svc mds${k}).lfsck_layout |
1386 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1387 error "(14) MDS${k} is not the expected 'completed'"
1390 stop_full_debug_logging
1392 run_test 12 "single command to trigger LFSCK on all devices"
# Body of the test registered below as `run_test 13`
# ("LFSCK can repair crashed lmm_oi"): creates 32 files while fail_loc 0x160f
# is set on the MDS, runs the layout LFSCK and expects repaired_others == 32.
1396 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1397 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1398 echo "MDT-object FID."
1401 check_mount_and_prep
# The fail_loc is only active while the 32 files are being created.
1403 echo "Inject failure stub to simulate bad lmm_oi"
1404 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1405 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1406 createmany -o $DIR/$tdir/f 32
1407 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1409 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1410 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
# Wait for the layout LFSCK on the MDT to report "completed".
1412 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1413 mdd.${MDT_DEV}.lfsck_layout |
1414 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1416 error "(2) unexpected status"
# One repair is expected per file created above.
1419 local repaired=$($SHOW_LAYOUT |
1420 awk '/^repaired_others/ { print $2 }')
1421 [ $repaired -eq 32 ] ||
1422 error "(3) Fail to repair crashed lmm_oi: $repaired"
1424 run_test 13 "LFSCK can repair crashed lmm_oi"
# Body of the test registered below as `run_test 14`
# ("LFSCK can repair MDT-object with dangling reference"): creates files while
# fail_loc 0x1610 is set on ost1, verifies that a plain layout LFSCK run
# counts at least 32 dangling references without making the files usable, and
# that a second run with -c makes `stat` on the guard file succeed.
1428 echo "The OST-object referenced by the MDT-object should be there;"
1429 echo "otherwise, the LFSCK should re-create the missing OST-object."
1432 check_mount_and_prep
1433 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# While the fail_loc is set: count + 31 "f" files plus the "guard" file.
# NOTE(review): count comes from precreated_ost_obj_count, presumably the
# number of objects already precreated on OST index 0 - confirm.
1435 echo "Inject failure stub to simulate dangling referenced MDT-object"
1436 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1437 do_facet ost1 $LCTL set_param fail_loc=0x1610
1438 local count=$(precreated_ost_obj_count 0 0)
1440 createmany -o $DIR/$tdir/f $((count + 31))
1441 touch $DIR/$tdir/guard
1442 do_facet ost1 $LCTL set_param fail_loc=0
1444 start_full_debug_logging
1446 # exhaust other pre-created dangling cases
1447 count=$(precreated_ost_obj_count 0 0)
1448 createmany -o $DIR/$tdir/a $count ||
1449 error "(0) Fail to create $count files."
# Negative check: a successful `ls` is the failure case here.
1451 echo "'ls' should fail because of dangling referenced MDT-object"
1452 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First run: no -c option.
1454 echo "Trigger layout LFSCK to find out dangling reference"
1455 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1457 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1458 mdd.${MDT_DEV}.lfsck_layout |
1459 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1461 error "(3) unexpected status"
1464 local repaired=$($SHOW_LAYOUT |
1465 awk '/^repaired_dangling/ { print $2 }')
1466 [ $repaired -ge 32 ] ||
1467 error "(4) Fail to repair dangling reference: $repaired"
# Negative check: after the first run `stat` must still fail.
1469 echo "'stat' should fail because of not repair dangling by default"
1470 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second run: same command with -c added.
1472 echo "Trigger layout LFSCK to repair dangling reference"
1473 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1475 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1476 mdd.${MDT_DEV}.lfsck_layout |
1477 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1479 error "(7) unexpected status"
1482 # Some asynchronous LFSCK updates may still be in progress; wait
1483 # until the repair of the target has been done. LU-4970.
# The guard file was created with `touch`, so its expected size is 0.
1485 echo "'stat' should success after layout LFSCK repairing"
1486 wait_update_facet client "stat $DIR/$tdir/guard |
1487 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1488 stat $DIR/$tdir/guard
1490 error "(8) unexpected size"
1493 repaired=$($SHOW_LAYOUT |
1494 awk '/^repaired_dangling/ { print $2 }')
1495 [ $repaired -ge 32 ] ||
1496 error "(9) Fail to repair dangling reference: $repaired"
1498 stop_full_debug_logging
1500 run_test 14 "LFSCK can repair MDT-object with dangling reference"
# Body of the test registered below as `run_test 15a`
# ("LFSCK can repair unmatched MDT-object/OST-object pairs (1)"): writes one
# file while fail_loc 0x1611 is set on ost1, runs the layout LFSCK and expects
# repaired_unmatched_pair == 1.
1504 echo "If the OST-object referenced by the MDT-object back points"
1505 echo "to some non-exist MDT-object, then the LFSCK should repair"
1506 echo "the OST-object to back point to the right MDT-object."
1509 check_mount_and_prep
1510 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1512 echo "Inject failure stub to make the OST-object to back point to"
1513 echo "non-exist MDT-object."
1514 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
# cancel_lru_locks osc is called before the fail_loc is cleared.
# NOTE(review): presumably this pushes the written data to ost1 while the
# fail_loc is still active - confirm.
1516 do_facet ost1 $LCTL set_param fail_loc=0x1611
1517 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1518 cancel_lru_locks osc
1519 do_facet ost1 $LCTL set_param fail_loc=0
1521 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1522 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1524 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1525 mdd.${MDT_DEV}.lfsck_layout |
1526 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1528 error "(2) unexpected status"
# Exactly one repair is expected, for the single file written above.
1531 local repaired=$($SHOW_LAYOUT |
1532 awk '/^repaired_unmatched_pair/ { print $2 }')
1533 [ $repaired -eq 1 ] ||
1534 error "(3) Fail to repair unmatched pair: $repaired"
1536 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Body of the test registered below as `run_test 15b`
# ("LFSCK can repair unmatched MDT-object/OST-object pairs (2)"): writes a
# "guard" file normally, then writes f0 while fail_loc 0x1612 is set on ost1,
# runs the layout LFSCK and expects repaired_unmatched_pair == 1.
1540 echo "If the OST-object referenced by the MDT-object back points"
1541 echo "to other MDT-object that doesn't recognize the OST-object,"
1542 echo "then the LFSCK should repair it to back point to the right"
1543 echo "MDT-object (the first one)."
# The guard file is written before any fail_loc is set.
1546 check_mount_and_prep
1547 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1548 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1549 cancel_lru_locks osc
1551 echo "Inject failure stub to make the OST-object to back point to"
1552 echo "other MDT-object"
1554 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1555 do_facet ost1 $LCTL set_param fail_loc=0x1612
1556 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1557 cancel_lru_locks osc
1558 do_facet ost1 $LCTL set_param fail_loc=0
1560 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1561 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1563 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1564 mdd.${MDT_DEV}.lfsck_layout |
1565 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1567 error "(2) unexpected status"
# Only f0 was written under the fail_loc, so one repair is expected.
1570 local repaired=$($SHOW_LAYOUT |
1571 awk '/^repaired_unmatched_pair/ { print $2 }')
1572 [ $repaired -eq 1 ] ||
1573 error "(3) Fail to repair unmatched pair: $repaired"
1575 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Body of the test registered below as `run_test 15c`
# ("LFSCK can repair unmatched MDT-object/OST-object pairs (3)"): starts a
# delayed directory migration in the background, runs the layout LFSCK on all
# MDTs at the same time, and expects one unmatched-pair repair and zero
# multiple-reference repairs.
1578 [ $MDSCOUNT -lt 2 ] &&
1579 skip "We need at least 2 MDSes for this test" && return
1582 echo "According to current metadata migration implementation,"
1583 echo "before the old MDT-object is removed, both the new MDT-object"
1584 echo "and old MDT-object will reference the same LOV layout. Then if"
1585 echo "the layout LFSCK finds the new MDT-object by race, it will"
1586 echo "regard related OST-object(s) as multiple referenced case, and"
1587 echo "will try to create new OST-object(s) for the new MDT-object."
1588 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1589 echo "MDT-object before confirm the multiple referenced case."
# a1 is created on MDT index 1 and holds one 1 MiB file.
1592 check_mount_and_prep
1593 $LFS mkdir -i 1 $DIR/$tdir/a1
1594 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1595 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1596 cancel_lru_locks osc
1598 echo "Inject failure stub on MDT1 to delay the migration"
1600 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1601 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# The migration runs as a background job ("&") so that the LFSCK started
# below can overlap with it.
1602 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1603 $LFS mv -M 0 $DIR/$tdir/a1 &
1606 echo "Trigger layout LFSCK to race with the migration"
1607 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1609 for k in $(seq $MDSCOUNT); do
1610 # The LFSCK status query interval is 30 seconds. In case some
1611 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
1612 # guarantee that the status has been synced.
# NOTE(review): $LTIME is not assigned in the lines shown here - confirm
# where it is defined.
1613 wait_update_facet mds${k} "$LCTL get_param -n \
1614 mdd.$(facet_svc mds${k}).lfsck_layout |
1615 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1616 error "(2) MDS${k} is not the expected 'completed'"
1619 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Both counters are read through $SHOW_LAYOUT.
1620 local repaired=$($SHOW_LAYOUT |
1621 awk '/^repaired_unmatched_pair/ { print $2 }')
1622 [ $repaired -eq 1 ] ||
1623 error "(3) Fail to repair unmatched pair: $repaired"
1625 repaired=$($SHOW_LAYOUT |
1626 awk '/^repaired_multiple_referenced/ { print $2 }')
1627 [ $repaired -eq 0 ] ||
1628 error "(4) Unexpectedly repaird multiple references: $repaired"
1630 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Body of the test registered below as `run_test 16`
# ("LFSCK can repair inconsistent MDT-object/OST-object owner"): changes the
# owner of a file while fail_loc 0x1613 is set on the MDS, runs the layout
# LFSCK and expects repaired_inconsistent_owner == 1.
1634 echo "If the OST-object's owner information does not match the owner"
1635 echo "information stored in the MDT-object, then the LFSCK trust the"
1636 echo "MDT-object and update the OST-object's owner information."
# The file is written before the fail_loc is set.
1639 check_mount_and_prep
1640 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1641 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1642 cancel_lru_locks osc
# "1.1" uses the owner.group form with uid 1 and gid 1.
1644 echo "Inject failure stub to skip OST-object owner changing"
1645 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1646 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1647 chown 1.1 $DIR/$tdir/f0
1648 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1650 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1653 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1655 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1656 mdd.${MDT_DEV}.lfsck_layout |
1657 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1659 error "(2) unexpected status"
# Exactly one repair is expected, for f0.
1662 local repaired=$($SHOW_LAYOUT |
1663 awk '/^repaired_inconsistent_owner/ { print $2 }')
1664 [ $repaired -eq 1 ] ||
1665 error "(3) Fail to repair inconsistent owner: $repaired"
1667 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Body of the test registered below as `run_test 17`
# ("LFSCK can repair multiple references"): with fail_loc 0x1614 set on the
# MDS, writes a 1 MiB "guard" file and creates f0; verifies that f0 shows the
# guard's size, runs the layout LFSCK, and then checks that rewriting f0 no
# longer changes the guard's size.
1671 echo "If more than one MDT-objects reference the same OST-object,"
1672 echo "and the OST-object only recognizes one MDT-object, then the"
1673 echo "LFSCK should create new OST-objects for such non-recognized"
1677 check_mount_and_prep
1678 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1680 echo "Inject failure stub to make two MDT-objects to refernce"
1681 echo "the OST-object"
1683 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1684 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1686 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1687 cancel_lru_locks osc
# createmany with prefix "f" and count 1 produces the file f0 checked below.
1689 createmany -o $DIR/$tdir/f 1
1691 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1693 cancel_lru_locks mdc
1694 cancel_lru_locks osc
# f0 was never written, yet it must report 1048576 bytes (the guard's size);
# field 5 of `ls -l` is the size.
1696 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1697 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1698 [ $size -eq 1048576 ] ||
1699 error "(1) f0 (wrong) size should be 1048576, but got $size"
1701 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1704 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1706 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1707 mdd.${MDT_DEV}.lfsck_layout |
1708 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1710 error "(3) unexpected status"
1713 local repaired=$($SHOW_LAYOUT |
1714 awk '/^repaired_multiple_referenced/ { print $2 }')
1715 [ $repaired -eq 1 ] ||
1716 error "(4) Fail to repair multiple references: $repaired"
# Writing 2 MiB to f0 must leave the guard at 1 MiB once the two files no
# longer share an object.
1718 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1719 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1720 error "(5) Fail to write f0."
1721 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1722 [ $size -eq 1048576 ] ||
1723 error "(6) guard size should be 1048576, but got $size"
1725 run_test 17 "LFSCK can repair multiple references"
# Top-level statement (outside the test bodies): sets debug=+cache through
# lctl on the node running this script, discarding the command's stdout.
# NOTE(review): the "+" prefix presumably adds "cache" to the existing debug
# mask rather than replacing it - confirm.
1727 $LCTL set_param debug=+cache > /dev/null
# Body of the test registered below as `run_test 18a`
# ("Find out orphan OST-object and repair it (1)"): creates one file per MDT
# (a second one only when MDSCOUNT >= 2), runs chown on them while fail_loc
# 0x1615 is set, checks that the reported sizes are wrong, runs the layout
# LFSCK with -o and checks the repaired_orphan counters and the file sizes.
1731 echo "The target MDT-object is there, but related stripe information"
1732 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1733 echo "layout EA entries."
# a1 lives on MDT index 0 with a single 1M stripe on OST index 0; f1 is 2 MiB.
1736 check_mount_and_prep
1737 $LFS mkdir -i 0 $DIR/$tdir/a1
1738 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1739 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With `ls -il` the inode number is field 1, so the size is field 6.
1741 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1743 $LFS path2fid $DIR/$tdir/a1/f1
1744 $LFS getstripe $DIR/$tdir/a1/f1
# a2 lives on MDT index 1 with two 1M stripes starting at OST index 1; f2 is
# written with the same dd parameters as f1.
1746 if [ $MDSCOUNT -ge 2 ]; then
1747 $LFS mkdir -i 1 $DIR/$tdir/a2
1748 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1749 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1750 $LFS path2fid $DIR/$tdir/a2/f2
1751 $LFS getstripe $DIR/$tdir/a2/f2
1754 cancel_lru_locks osc
# The chown calls are the operations performed while the fail_loc is set.
1756 echo "Inject failure, to make the MDT-object lost its layout EA"
1757 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1758 do_facet mds1 $LCTL set_param fail_loc=0x1615
1759 chown 1.1 $DIR/$tdir/a1/f1
1761 if [ $MDSCOUNT -ge 2 ]; then
1762 do_facet mds2 $LCTL set_param fail_loc=0x1615
1763 chown 1.1 $DIR/$tdir/a2/f2
1769 do_facet mds1 $LCTL set_param fail_loc=0
1770 if [ $MDSCOUNT -ge 2 ]; then
1771 do_facet mds2 $LCTL set_param fail_loc=0
1774 cancel_lru_locks mdc
1775 cancel_lru_locks osc
# Both files were written with bs=1M count=2, so f2 is compared against the
# size saved for f1 as well.
1777 echo "The file size should be incorrect since layout EA is lost"
1778 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1779 [ "$cur_size" != "$saved_size" ] ||
1780 error "(1) Expect incorrect file1 size"
1782 if [ $MDSCOUNT -ge 2 ]; then
1783 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1784 [ "$cur_size" != "$saved_size" ] ||
1785 error "(2) Expect incorrect file2 size"
1788 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1789 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
1791 for k in $(seq $MDSCOUNT); do
1792 # The LFSCK status query interval is 30 seconds. In case some
1793 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
1794 # guarantee that the status has been synced.
# NOTE(review): $LTIME is not assigned in the lines shown here - confirm
# where it is defined.
1795 wait_update_facet mds${k} "$LCTL get_param -n \
1796 mdd.$(facet_svc mds${k}).lfsck_layout |
1797 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1798 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling.
1801 for k in $(seq $OSTCOUNT); do
1802 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1803 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1804 awk '/^status/ { print $2 }')
1805 [ "$cur_status" == "completed" ] ||
1806 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan: 1 on mds1 (f1 has one stripe) and 2 on mds2
# (f2 has two stripes).
1809 local repaired=$(do_facet mds1 $LCTL get_param -n \
1810 mdd.$(facet_svc mds1).lfsck_layout |
1811 awk '/^repaired_orphan/ { print $2 }')
1812 [ $repaired -eq 1 ] ||
1813 error "(6.1) Expect 1 fixed on mds1, but got: $repaired"
1815 if [ $MDSCOUNT -ge 2 ]; then
1816 repaired=$(do_facet mds2 $LCTL get_param -n \
1817 mdd.$(facet_svc mds2).lfsck_layout |
1818 awk '/^repaired_orphan/ { print $2 }')
1819 [ $repaired -eq 2 ] ||
1820 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print FID and layout again after the LFSCK run.
1823 $LFS path2fid $DIR/$tdir/a1/f1
1824 $LFS getstripe $DIR/$tdir/a1/f1
1826 if [ $MDSCOUNT -ge 2 ]; then
1827 $LFS path2fid $DIR/$tdir/a2/f2
1828 $LFS getstripe $DIR/$tdir/a2/f2
1831 echo "The file size should be correct after layout LFSCK scanning"
1832 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1833 [ "$cur_size" == "$saved_size" ] ||
1834 error "(7) Expect file1 size $saved_size, but got $cur_size"
1836 if [ $MDSCOUNT -ge 2 ]; then
1837 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1838 [ "$cur_size" == "$saved_size" ] ||
1839 error "(8) Expect file2 size $saved_size, but got $cur_size"
1842 run_test 18a "Find out orphan OST-object and repair it (1)"
# Body of the test registered below as `run_test 18b`
# ("Find out orphan OST-object and repair it (2)"): creates one file per MDT
# (a second one only when MDSCOUNT >= 2), removes them while fail_loc 0x1616
# is set, runs the layout LFSCK with -o, then moves the entries named
# <fid>-R-0 from .lustre/lost+found/MDTxxxx back to the original paths and
# checks the file sizes.
1846 echo "The target MDT-object is lost. The LFSCK should re-create the"
1847 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
1848 echo "can move it back to normal namespace manually."
1851 check_mount_and_prep
1852 $LFS mkdir -i 0 $DIR/$tdir/a1
1853 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1854 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With `ls -il` the inode number is field 1, so the size is field 6.
# fid1 is saved because it is part of the lost+found entry name used later.
1855 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1856 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
1858 $LFS getstripe $DIR/$tdir/a1/f1
1860 if [ $MDSCOUNT -ge 2 ]; then
1861 $LFS mkdir -i 1 $DIR/$tdir/a2
1862 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1863 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1864 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
1866 $LFS getstripe $DIR/$tdir/a2/f2
1869 cancel_lru_locks osc
# The rm calls are the operations performed while the fail_loc is set.
1871 echo "Inject failure, to simulate the case of missing the MDT-object"
1872 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1873 do_facet mds1 $LCTL set_param fail_loc=0x1616
1874 rm -f $DIR/$tdir/a1/f1
1876 if [ $MDSCOUNT -ge 2 ]; then
1877 do_facet mds2 $LCTL set_param fail_loc=0x1616
1878 rm -f $DIR/$tdir/a2/f2
1884 do_facet mds1 $LCTL set_param fail_loc=0
1885 if [ $MDSCOUNT -ge 2 ]; then
1886 do_facet mds2 $LCTL set_param fail_loc=0
1889 cancel_lru_locks mdc
1890 cancel_lru_locks osc
1892 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1893 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1895 for k in $(seq $MDSCOUNT); do
1896 # The LFSCK status query interval is 30 seconds. In case some
1897 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
1898 # guarantee that the status has been synced.
# NOTE(review): $LTIME is not assigned in the lines shown here - confirm
# where it is defined.
1899 wait_update_facet mds${k} "$LCTL get_param -n \
1900 mdd.$(facet_svc mds${k}).lfsck_layout |
1901 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1902 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling.
1905 for k in $(seq $OSTCOUNT); do
1906 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1907 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1908 awk '/^status/ { print $2 }')
1909 [ "$cur_status" == "completed" ] ||
1910 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan: 1 on mds1 (f1 has one stripe) and 2 on mds2
# (f2 has two stripes).
1913 local repaired=$(do_facet mds1 $LCTL get_param -n \
1914 mdd.$(facet_svc mds1).lfsck_layout |
1915 awk '/^repaired_orphan/ { print $2 }')
1916 [ $repaired -eq 1 ] ||
1917 error "(4.1) Expect 1 fixed on mds1, but got: $repaired"
1919 if [ $MDSCOUNT -ge 2 ]; then
1920 repaired=$(do_facet mds2 $LCTL get_param -n \
1921 mdd.$(facet_svc mds2).lfsck_layout |
1922 awk '/^repaired_orphan/ { print $2 }')
1923 [ $repaired -eq 2 ] ||
1924 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# The lost+found entries are addressed as <saved FID>-R-0 under the
# directory of the MDT the file was created on.
1927 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
1928 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
1929 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
1931 if [ $MDSCOUNT -ge 2 ]; then
1932 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
1933 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Print FID and layout of the restored files.
1936 $LFS path2fid $DIR/$tdir/a1/f1
1937 $LFS getstripe $DIR/$tdir/a1/f1
1939 if [ $MDSCOUNT -ge 2 ]; then
1940 $LFS path2fid $DIR/$tdir/a2/f2
1941 $LFS getstripe $DIR/$tdir/a2/f2
# Both files were written with bs=1M count=2, so f2 is compared against the
# size saved for f1 as well.
1944 echo "The file size should be correct after layout LFSCK scanning"
1945 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1946 [ "$cur_size" == "$saved_size" ] ||
1947 error "(7) Expect file1 size $saved_size, but got $cur_size"
1949 if [ $MDSCOUNT -ge 2 ]; then
1950 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1951 [ "$cur_size" == "$saved_size" ] ||
1952 error "(8) Expect file2 size $saved_size, but got $cur_size"
1955 run_test 18b "Find out orphan OST-object and repair it (2)"
# test_18c body (excerpt): layout LFSCK must recover orphan OST-objects whose
# MDT-object is lost and whose parent FID is missing on the OST side.
#
# Flow, as far as the visible lines show:
#   1. create a1/f1 (and a2/f2 under an MDT1 directory when MDSCOUNT >= 2)
#      while ost1 runs with fail_loc 0x1617 (OBD_FAIL_LFSCK_NOPFID);
#   2. remove the files while the MDTs run with fail_loc 0x1616
#      (OBD_FAIL_LFSCK_LOST_MDTOBJ), then clear fail_loc on the MDTs;
#   3. start layout LFSCK with "-r -o", wait for status "completed" on every
#      MDT and check the status reported by every OST;
#   4. check repaired_orphan and the "*-N-*" stubs under .lustre/lost+found/.
#
# NOTE(review): this numbered listing omits several short lines (function
# header, fi/done/else, closing brace, the assignment of $expected, ...).
# They are not reconstructed here; only the visible lines are reproduced.
1959 echo "The target MDT-object is lost, and the OST-object FID is missing."
1960 echo "The LFSCK should re-create the MDT-object with new FID under the "
1961 echo "directory .lustre/lost+found/MDTxxxx."
1964 check_mount_and_prep
1965 $LFS mkdir -i 0 $DIR/$tdir/a1
1966 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1968 echo "Inject failure, to simulate the case of missing parent FID"
1969 #define OBD_FAIL_LFSCK_NOPFID 0x1617
1970 do_facet ost1 $LCTL set_param fail_loc=0x1617
1972 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1973 $LFS getstripe $DIR/$tdir/a1/f1
# With a second MDT, repeat the setup in a directory created with "-i 1".
1975 if [ $MDSCOUNT -ge 2 ]; then
1976 $LFS mkdir -i 1 $DIR/$tdir/a2
1977 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a2
1978 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1979 $LFS getstripe $DIR/$tdir/a2/f2
1982 cancel_lru_locks osc
1984 echo "Inject failure, to simulate the case of missing the MDT-object"
1985 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1986 do_facet mds1 $LCTL set_param fail_loc=0x1616
1987 rm -f $DIR/$tdir/a1/f1
1989 if [ $MDSCOUNT -ge 2 ]; then
1990 do_facet mds2 $LCTL set_param fail_loc=0x1616
1991 rm -f $DIR/$tdir/a2/f2
# Clear the injected failure on the MDTs before starting LFSCK.
1997 do_facet mds1 $LCTL set_param fail_loc=0
1998 if [ $MDSCOUNT -ge 2 ]; then
1999 do_facet mds2 $LCTL set_param fail_loc=0
2002 cancel_lru_locks mdc
2003 cancel_lru_locks osc
2005 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2006 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2008 for k in $(seq $MDSCOUNT); do
2009 # The LFSCK status query interval is 30 seconds. For the case
2010 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2011 # time to guarantee the status sync up.
2012 wait_update_facet mds${k} "$LCTL get_param -n \
2013 mdd.$(facet_svc mds${k}).lfsck_layout |
2014 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2015 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well (checked once, no waiting).
2018 for k in $(seq $OSTCOUNT); do
2019 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2020 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2021 awk '/^status/ { print $2 }')
2022 [ "$cur_status" == "completed" ] ||
2023 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): the branches that assign $expected are not visible here.
2026 if [ $MDSCOUNT -ge 2 ]; then
2032 local repaired=$(do_facet mds1 $LCTL get_param -n \
2033 mdd.$(facet_svc mds1).lfsck_layout |
2034 awk '/^repaired_orphan/ { print $2 }')
2035 [ $repaired -eq $expected ] ||
2036 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2038 if [ $MDSCOUNT -ge 2 ]; then
2039 repaired=$(do_facet mds2 $LCTL get_param -n \
2040 mdd.$(facet_svc mds2).lfsck_layout |
2041 awk '/^repaired_orphan/ { print $2 }')
2042 [ $repaired -eq 0 ] ||
2043 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2046 ls -ail $MOUNT/.lustre/lost+found/
2048 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2049 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
# The -name pattern is quoted so that find, not the shell, interprets the
# glob; unquoted it could be expanded against the current directory first.
# cname is declared local so it does not leak into the caller's scope.
2050 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-N-*")
2052 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2055 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2056 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2057 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2059 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-N-*")
2060 [ ! -z "$cname" ] ||
2061 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2063 run_test 18c "Find out orphan OST-object and repair it (3)"
# test_18d body (excerpt): an orphan OST-object competes for a layout EA slot
# with an OST-object that (per the messages below) was newly created while
# repairing a dangling reference and has never been modified.
#
# Flow, as far as the visible lines show:
#   1. create a1/f1 ("guard") and a1/f2 ("foo") and remember f2's size;
#   2. with fail_loc 0x1618 (OBD_FAIL_LFSCK_CHANGE_STRIPE) on $SINGLEMDS,
#      chown f2 and remove f1, then clear fail_loc and run setupall;
#   3. f2's size must now differ from the saved size;
#   4. run layout LFSCK ("-r -o -c") with fail_loc 0x1602
#      (OBD_FAIL_LFSCK_DELAY3, fail_val=5) set until mds1 reports
#      "scanning-phase2", then wait for "completed" everywhere;
#   5. exactly one orphan must be repaired and f2's size must be restored.
#
# NOTE(review): this numbered listing omits several short lines (function
# header, fi/done, closing brace, and whatever runs between "stopall to
# cleanup object cache" and setupall). They are not reconstructed here.
2067 echo "The target MDT-object layout EA slot is occpuied by some new"
2068 echo "created OST-object when repair dangling reference case. Such"
2069 echo "conflict OST-object has never been modified. Then when found"
2070 echo "the orphan OST-object, LFSCK will replace it with the orphan"
2074 check_mount_and_prep
2076 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2077 echo "guard" > $DIR/$tdir/a1/f1
2078 echo "foo" > $DIR/$tdir/a1/f2
# Field 6 of "ls -il" is the file size; it is the reference value below.
2079 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2080 $LFS path2fid $DIR/$tdir/a1/f1
2081 $LFS getstripe $DIR/$tdir/a1/f1
2082 $LFS path2fid $DIR/$tdir/a1/f2
2083 $LFS getstripe $DIR/$tdir/a1/f2
2084 cancel_lru_locks osc
2086 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2087 echo "to reference the same OST-object (which is f1's OST-obejct)."
2088 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2089 echo "dangling reference case, but f2's old OST-object is there."
2092 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2093 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
# Use the POSIX "owner:group" separator; the "owner.group" form is an
# obsolete spelling that newer scripts should avoid.
2094 chown 1:1 $DIR/$tdir/a1/f2
2095 rm -f $DIR/$tdir/a1/f1
2098 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2100 echo "stopall to cleanup object cache"
2103 setupall > /dev/null
2105 echo "The file size should be incorrect since dangling referenced"
2106 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2107 [ "$cur_size" != "$saved_size" ] ||
2108 error "(1) Expect incorrect file2 size"
2110 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2111 do_facet $SINGLEMDS $LCTL set_param fail_val=5 fail_loc=0x1602
2113 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2114 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# First make sure mds1 reaches the second scanning phase, then drop the delay.
2116 wait_update_facet mds1 "$LCTL get_param -n \
2117 mdd.$(facet_svc mds1).lfsck_layout |
2118 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2119 error "(3.0) MDS1 is not the expected 'scanning-phase2'"
2121 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2123 for k in $(seq $MDSCOUNT); do
2124 # The LFSCK status query interval is 30 seconds. For the case
2125 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2126 # time to guarantee the status sync up.
2127 wait_update_facet mds${k} "$LCTL get_param -n \
2128 mdd.$(facet_svc mds${k}).lfsck_layout |
2129 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2130 error "(3) MDS${k} is not the expected 'completed'"
2133 for k in $(seq $OSTCOUNT); do
2134 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2135 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2136 awk '/^status/ { print $2 }')
2137 [ "$cur_status" == "completed" ] ||
2138 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
2141 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2142 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2143 awk '/^repaired_orphan/ { print $2 }')
2144 [ $repaired -eq 1 ] ||
2145 error "(5) Expect 1 orphan has been fixed, but got: $repaired"
2147 echo "The file size should be correct after layout LFSCK scanning"
2148 cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2149 [ "$cur_size" == "$saved_size" ] ||
2150 error "(6) Expect file2 size $saved_size, but got $cur_size"
2152 echo "The LFSCK should find back the original data."
2153 cat $DIR/$tdir/a1/f2
2154 $LFS path2fid $DIR/$tdir/a1/f2
2155 $LFS getstripe $DIR/$tdir/a1/f2
2157 run_test 18d "Find out orphan OST-object and repair it (4)"
# test_18e body (excerpt): same conflict as the previous test, but the
# OST-object occupying the layout EA slot is modified while LFSCK runs, so
# (per the messages below) LFSCK is expected to keep the new data in f2 and
# create a separate stub file that references the old orphan OST-object.
#
# Flow, as far as the visible lines show:
#   1. create a1/f1 ("guard") and a1/f2 ("foo") and remember f2's size;
#   2. with fail_loc 0x1618 (OBD_FAIL_LFSCK_CHANGE_STRIPE) on $SINGLEMDS,
#      chown f2 and remove f1, then clear fail_loc and run setupall;
#   3. start layout LFSCK ("-r -o -c") with fail_loc 0x1602
#      (OBD_FAIL_LFSCK_DELAY3, fail_val=10) and wait for "scanning-phase2";
#   4. append "dummy" to f2, clear the delay, wait for "completed";
#   5. one orphan must be repaired and a "*-C-*" stub under
#      .lustre/lost+found/MDT0000/ must have the original f2 size.
#
# NOTE(review): this numbered listing omits several short lines (function
# header, fi/done, closing brace, the statements following the "to guarantee
# all updates are synced" comment, ...). They are not reconstructed here.
2161 echo "The target MDT-object layout EA slot is occpuied by some new"
2162 echo "created OST-object when repair dangling reference case. Such"
2163 echo "conflict OST-object has been modified by others. To keep the"
2164 echo "new data, the LFSCK will create a new file to refernece this"
2165 echo "old orphan OST-object."
2168 check_mount_and_prep
2170 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2171 echo "guard" > $DIR/$tdir/a1/f1
2172 echo "foo" > $DIR/$tdir/a1/f2
# Field 6 of "ls -il" is the file size; it is the reference value below.
2173 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2174 $LFS path2fid $DIR/$tdir/a1/f1
2175 $LFS getstripe $DIR/$tdir/a1/f1
2176 $LFS path2fid $DIR/$tdir/a1/f2
2177 $LFS getstripe $DIR/$tdir/a1/f2
2178 cancel_lru_locks osc
2180 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2181 echo "to reference the same OST-object (which is f1's OST-obejct)."
2182 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2183 echo "dangling reference case, but f2's old OST-object is there."
2186 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2187 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
# Use the POSIX "owner:group" separator; the "owner.group" form is an
# obsolete spelling that newer scripts should avoid.
2188 chown 1:1 $DIR/$tdir/a1/f2
2189 rm -f $DIR/$tdir/a1/f1
2192 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2194 echo "stopall to cleanup object cache"
2197 setupall > /dev/null
2199 echo "The file size should be incorrect since dangling referenced"
2200 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2201 [ "$cur_size" != "$saved_size" ] ||
2202 error "(1) Expect incorrect file2 size"
2204 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2205 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2207 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2208 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2210 wait_update_facet mds1 "$LCTL get_param -n \
2211 mdd.$(facet_svc mds1).lfsck_layout |
2212 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2213 error "(3) MDS1 is not the expected 'scanning-phase2'"
2215 # to guarantee all updates are synced.
# Modify f2 while LFSCK is still held in the second scanning phase.
2219 echo "Write new data to f2 to modify the new created OST-object."
2220 echo "dummy" >> $DIR/$tdir/a1/f2
2222 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2224 for k in $(seq $MDSCOUNT); do
2225 # The LFSCK status query interval is 30 seconds. For the case
2226 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2227 # time to guarantee the status sync up.
2228 wait_update_facet mds${k} "$LCTL get_param -n \
2229 mdd.$(facet_svc mds${k}).lfsck_layout |
2230 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2231 error "(4) MDS${k} is not the expected 'completed'"
2234 for k in $(seq $OSTCOUNT); do
2235 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2236 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2237 awk '/^status/ { print $2 }')
2238 [ "$cur_status" == "completed" ] ||
2239 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2242 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2243 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2244 awk '/^repaired_orphan/ { print $2 }')
2245 [ $repaired -eq 1 ] ||
2246 error "(6) Expect 1 orphan has been fixed, but got: $repaired"
2248 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2249 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2250 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The -name pattern is quoted so that find, not the shell, interprets the
# glob; cname is declared local so it does not leak into the caller's scope.
2252 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*")
2253 [ ! -z "$cname" ] ||
2254 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2256 echo "The stub file should keep the original f2 data"
2257 cur_size=$(ls -il $cname | awk '{ print $6 }')
2258 [ "$cur_size" == "$saved_size" ] ||
2259 error "(9) Expect file2 size $saved_size, but got $cur_size"
2262 $LFS path2fid $cname
2263 $LFS getstripe $cname
2265 echo "The f2 should contains new data."
2266 cat $DIR/$tdir/a1/f2
2267 $LFS path2fid $DIR/$tdir/a1/f2
2268 $LFS getstripe $DIR/$tdir/a1/f2
2270 run_test 18e "Find out orphan OST-object and repair it (5)"
# test_18f body (excerpt): when one OST fails to verify objects during the
# first-stage scanning, layout LFSCK should skip orphan handling for that OST
# only (per the messages below); a second, undisturbed run then finishes the
# repair.
#
# Flow, as far as the visible lines show:
#   1. skip unless at least 2 OSTs are configured;
#   2. create 1-stripe and 2-stripe files under a1/a2 (and a3/a4 created with
#      "-i 1" when MDSCOUNT >= 2);
#   3. remove f1/f2 (f3/f4) while the MDTs run with fail_loc 0x1616
#      (OBD_FAIL_LFSCK_LOST_MDTOBJ), then clear it;
#   4. run layout LFSCK with fail_loc 0x161c (OBD_FAIL_LFSCK_BAD_NETWORK,
#      fail_val=0) on mds1: MDTs and ost1 must end as "partial", ost2 as
#      "completed", with repaired_orphan == 1 per MDT;
#   5. clear the failure, run layout LFSCK again: everything must be
#      "completed" with repaired_orphan == 2 per MDT.
#
# NOTE(review): this numbered listing omits several short lines (function
# header, fi/done, the bodies' closing braces, ...). They are not
# reconstructed here; only the visible lines are reproduced.
2273 [ $OSTCOUNT -lt 2 ] &&
2274 skip "The test needs at least 2 OSTs" && return
2277 echo "The target MDT-object is lost. The LFSCK should re-create the"
2278 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2279 echo "to verify some OST-object(s) during the first stage-scanning,"
2280 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2281 echo "should not be affected."
2284 check_mount_and_prep
2285 $LFS mkdir -i 0 $DIR/$tdir/a1
2286 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2287 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2288 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2289 $LFS mkdir -i 0 $DIR/$tdir/a2
2290 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2291 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2292 $LFS getstripe $DIR/$tdir/a1/f1
2293 $LFS getstripe $DIR/$tdir/a2/f2
# With a second MDT, build the same layout in directories created with "-i 1".
2295 if [ $MDSCOUNT -ge 2 ]; then
2296 $LFS mkdir -i 1 $DIR/$tdir/a3
2297 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a3
2298 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2299 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2300 $LFS mkdir -i 1 $DIR/$tdir/a4
2301 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2302 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2303 $LFS getstripe $DIR/$tdir/a3/f3
2304 $LFS getstripe $DIR/$tdir/a4/f4
2307 cancel_lru_locks osc
2309 echo "Inject failure, to simulate the case of missing the MDT-object"
2310 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2311 do_facet mds1 $LCTL set_param fail_loc=0x1616
2312 rm -f $DIR/$tdir/a1/f1
2313 rm -f $DIR/$tdir/a2/f2
2315 if [ $MDSCOUNT -ge 2 ]; then
2316 do_facet mds2 $LCTL set_param fail_loc=0x1616
2317 rm -f $DIR/$tdir/a3/f3
2318 rm -f $DIR/$tdir/a4/f4
2324 do_facet mds1 $LCTL set_param fail_loc=0
2325 if [ $MDSCOUNT -ge 2 ]; then
2326 do_facet mds2 $LCTL set_param fail_loc=0
2329 cancel_lru_locks mdc
2330 cancel_lru_locks osc
2332 echo "Inject failure, to simulate the OST0 fail to handle"
2333 echo "MDT0 LFSCK request during the first-stage scanning."
2334 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2335 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2337 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2338 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2340 for k in $(seq $MDSCOUNT); do
2341 # The LFSCK status query interval is 30 seconds. For the case
2342 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2343 # time to guarantee the status sync up.
2344 wait_update_facet mds${k} "$LCTL get_param -n \
2345 mdd.$(facet_svc mds${k}).lfsck_layout |
2346 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2347 error "(2) MDS${k} is not the expected 'partial'"
# ost1 is the OST hit by the injected failure; ost2 must be unaffected.
2350 wait_update_facet ost1 "$LCTL get_param -n \
2351 obdfilter.$(facet_svc ost1).lfsck_layout |
2352 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2353 error "(3) OST1 is not the expected 'partial'"
2356 wait_update_facet ost2 "$LCTL get_param -n \
2357 obdfilter.$(facet_svc ost2).lfsck_layout |
2358 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2359 error "(4) OST2 is not the expected 'completed'"
2362 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2364 local repaired=$(do_facet mds1 $LCTL get_param -n \
2365 mdd.$(facet_svc mds1).lfsck_layout |
2366 awk '/^repaired_orphan/ { print $2 }')
2367 [ $repaired -eq 1 ] ||
2368 error "(5) Expect 1 fixed on mds1, but got: $repaired"
2370 if [ $MDSCOUNT -ge 2 ]; then
2371 repaired=$(do_facet mds2 $LCTL get_param -n \
2372 mdd.$(facet_svc mds2).lfsck_layout |
2373 awk '/^repaired_orphan/ { print $2 }')
2374 [ $repaired -eq 1 ] ||
2375 error "(6) Expect 1 fixed on mds2, but got: $repaired"
2378 echo "Trigger layout LFSCK on all devices again to cleanup"
2379 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2381 for k in $(seq $MDSCOUNT); do
2382 # The LFSCK status query interval is 30 seconds. For the case
2383 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2384 # time to guarantee the status sync up.
2385 wait_update_facet mds${k} "$LCTL get_param -n \
2386 mdd.$(facet_svc mds${k}).lfsck_layout |
2387 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2388 error "(8) MDS${k} is not the expected 'completed'"
2391 for k in $(seq $OSTCOUNT); do
# Declared local, as in the sibling tests, so the status does not leak into
# the caller's scope.
2392 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2393 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2394 awk '/^status/ { print $2 }')
2395 [ "$cur_status" == "completed" ] ||
2396 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
2400 local repaired=$(do_facet mds1 $LCTL get_param -n \
2401 mdd.$(facet_svc mds1).lfsck_layout |
2402 awk '/^repaired_orphan/ { print $2 }')
2403 [ $repaired -eq 2 ] ||
2404 error "(10) Expect 2 fixed on mds1, but got: $repaired"
2406 if [ $MDSCOUNT -ge 2 ]; then
2407 repaired=$(do_facet mds2 $LCTL get_param -n \
2408 mdd.$(facet_svc mds2).lfsck_layout |
2409 awk '/^repaired_orphan/ { print $2 }')
2410 [ $repaired -eq 2 ] ||
2411 error "(11) Expect 2 fixed on mds2, but got: $repaired"
2414 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Top-level statement between two tests: passes "debug=-cache" to
# "lctl set_param" on the local node and discards its standard output.
# NOTE(review): presumably this removes the "cache" flag from the debug
# mask for the following tests - confirm against the lctl documentation.
2416 $LCTL set_param debug=-cache > /dev/null
# test_19a body (excerpt): with lfsck_verify_pfid enabled on OST0000, a read
# issued while fail_loc 0x1619 (OBD_FAIL_LFSCK_INVALID_PFID) is set must be
# denied.
# NOTE(review): the function header and closing brace are not visible in
# this numbered listing; only the visible lines are reproduced.
2419 check_mount_and_prep
2420 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# a0 is the file read below; a1 is written as a "guard" file.
2422 echo "foo" > $DIR/$tdir/a0
2423 echo "guard" > $DIR/$tdir/a1
2424 cancel_lru_locks osc
2426 echo "Inject failure, then client will offer wrong parent FID when read"
# Enable lfsck_verify_pfid on ost1, then set fail_loc locally (no do_facet).
2427 do_facet ost1 $LCTL set_param -n \
2428 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2429 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2430 $LCTL set_param fail_loc=0x1619
2432 echo "Read RPC with wrong parent FID should be denied"
# The read is expected to fail; a successful cat is reported as an error.
2433 cat $DIR/$tdir/a0 && error "(3) Read should be denied!"
2434 $LCTL set_param fail_loc=0
2436 run_test 19a "OST-object inconsistency self detect"
# test_19b body (excerpt): an OST-object written while ost1 runs with
# fail_loc 0x1611 (OBD_FAIL_LFSCK_UNMATCHED_PAIR1) should be repaired by the
# OST itself once lfsck_verify_pfid is enabled and the file is read.
#
# The "repaired" counter is read from the lfsck_verify_pfid parameter of
# OST0000: it must be 0 before verification is enabled and 1 after the read.
# NOTE(review): the function header and closing brace are not visible in
# this numbered listing; only the visible lines are reproduced.
2439 check_mount_and_prep
2440 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2442 echo "Inject failure stub to make the OST-object to back point to"
2443 echo "non-exist MDT-object"
2445 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2446 do_facet ost1 $LCTL set_param fail_loc=0x1611
# f0 is created and flushed while the failure is injected on ost1.
2447 echo "foo" > $DIR/$tdir/f0
2448 cancel_lru_locks osc
2449 do_facet ost1 $LCTL set_param fail_loc=0
2451 echo "Nothing should be fixed since self detect and repair is disabled"
2452 local repaired=$(do_facet ost1 $LCTL get_param -n \
2453 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2454 awk '/^repaired/ { print $2 }')
2455 [ $repaired -eq 0 ] ||
2456 error "(1) Expected 0 repaired, but got $repaired"
2458 echo "Read RPC with right parent FID should be accepted,"
2459 echo "and cause parent FID on OST to be fixed"
# Enable verification, then read the file; the read itself must succeed.
2461 do_facet ost1 $LCTL set_param -n \
2462 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2463 cat $DIR/$tdir/f0 || error "(2) Read should not be denied!"
2465 repaired=$(do_facet ost1 $LCTL get_param -n \
2466 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2467 awk '/^repaired/ { print $2 }')
2468 [ $repaired -eq 1 ] ||
2469 error "(3) Expected 1 repaired, but got $repaired"
2471 run_test 19b "OST-object inconsistency self repair"
# test_20 body (excerpt): files whose MDT-object and some OST-objects are
# lost should be re-created by layout LFSCK under
# .lustre/lost+found/MDT0000/ as "<fid>-R-0", possibly with a hole in the
# LOV EA, and must then be usable with ordinary tools.
#
# Flow, as far as the visible lines show:
#   1. skip unless at least 2 OSTs are configured;
#   2. create f0, f1, f2 (and f3 when OSTCOUNT > 2) of $bcount 4096-byte
#      blocks in a striped directory and record their FIDs;
#   3. remove them under fail_loc 0x1616 (OBD_FAIL_LFSCK_LOST_MDTOBJ) for f0
#      and 0x161a (OBD_FAIL_LFSCK_LOST_SPEOBJ, fail_val selecting the lost
#      stripe per the messages) for f1..f3, then unmount the client;
#   4. run layout LFSCK ("-r -o") with fail_loc 0x161b
#      (OBD_FAIL_LFSCK_DELAY5) on ost1, wait for "completed", check
#      repaired_orphan (9 with more than 2 OSTs, otherwise 4);
#   5. remount and check each "<fid>-R-0" file: hole flag
#      (LOV_PATTERN_F_HOLE), stripe count, size, read/write/chown/touch/rm.
#
# NOTE(review): this numbered listing omits several short lines (function
# header, fi/done/else, closing brace, the assignment of $bcount, ...).
# They are not reconstructed here; only the visible lines are reproduced.
2474 [ $OSTCOUNT -lt 2 ] &&
2475 skip "The test needs at least 2 OSTs" && return
2478 echo "The target MDT-object and some of its OST-object are lost."
2479 echo "The LFSCK should find out the left OST-objects and re-create"
2480 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2481 echo "with the partial OST-objects (LOV EA hole)."
2483 echo "New client can access the file with LOV EA hole via normal"
2484 echo "system tools or commands without crash the system."
2486 echo "For old client, even though it cannot access the file with"
2487 echo "LOV EA hole, it should not cause the system crash."
2490 check_mount_and_prep
2491 $LFS mkdir -i 0 $DIR/$tdir/a1
# Three stripes when more than 2 OSTs are available, otherwise two.
2492 if [ $OSTCOUNT -gt 2 ]; then
2493 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
2496 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
2500 # 256 blocks on the stripe0.
2501 # 1 block on the stripe1 for 2 OSTs case.
2502 # 256 blocks on the stripe1 for other cases.
2503 # 1 block on the stripe2 if OSTs > 2
2504 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2505 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2506 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# The FIDs are needed later to locate the "<fid>-R-0" files in lost+found.
2508 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2509 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2510 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2513 $LFS getstripe $DIR/$tdir/a1/f0
2515 $LFS getstripe $DIR/$tdir/a1/f1
2517 $LFS getstripe $DIR/$tdir/a1/f2
2519 if [ $OSTCOUNT -gt 2 ]; then
2520 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
# Declared local like fid0..fid2 so it does not leak into the caller's scope.
2521 local fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2523 $LFS getstripe $DIR/$tdir/a1/f3
2526 cancel_lru_locks osc
2528 echo "Inject failure..."
2529 echo "To simulate f0 lost MDT-object"
2530 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2531 do_facet mds1 $LCTL set_param fail_loc=0x1616
2532 rm -f $DIR/$tdir/a1/f0
2534 echo "To simulate f1 lost MDT-object and OST-object0"
2535 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
2536 do_facet mds1 $LCTL set_param fail_loc=0x161a
2537 rm -f $DIR/$tdir/a1/f1
2539 echo "To simulate f2 lost MDT-object and OST-object1"
2540 do_facet mds1 $LCTL set_param fail_val=1
2541 rm -f $DIR/$tdir/a1/f2
2543 if [ $OSTCOUNT -gt 2 ]; then
2544 echo "To simulate f3 lost MDT-object and OST-object2"
2545 do_facet mds1 $LCTL set_param fail_val=2
2546 rm -f $DIR/$tdir/a1/f3
2549 umount_client $MOUNT
2552 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2554 echo "Inject failure to slow down the LFSCK on OST0"
2555 #define OBD_FAIL_LFSCK_DELAY5 0x161b
2556 do_facet ost1 $LCTL set_param fail_loc=0x161b
2558 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2559 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2562 do_facet ost1 $LCTL set_param fail_loc=0
2564 for k in $(seq $MDSCOUNT); do
2565 # The LFSCK status query interval is 30 seconds. For the case
2566 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2567 # time to guarantee the status sync up.
2568 wait_update_facet mds${k} "$LCTL get_param -n \
2569 mdd.$(facet_svc mds${k}).lfsck_layout |
2570 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2571 error "(2) MDS${k} is not the expected 'completed'"
2574 for k in $(seq $OSTCOUNT); do
2575 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2576 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2577 awk '/^status/ { print $2 }')
2578 [ "$cur_status" == "completed" ] ||
2579 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2582 local repaired=$(do_facet mds1 $LCTL get_param -n \
2583 mdd.$(facet_svc mds1).lfsck_layout |
2584 awk '/^repaired_orphan/ { print $2 }')
2585 if [ $OSTCOUNT -gt 2 ]; then
2586 [ $repaired -eq 9 ] ||
2587 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
2589 [ $repaired -eq 4 ] ||
2590 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
2593 mount_client $MOUNT || error "(5.0) Fail to start client!"
# Bit tested against the value printed by "lfs getstripe -L" below.
2595 LOV_PATTERN_F_HOLE=0x40000000
2598 # ${fid0}-R-0 is the old f0
2600 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
2601 echo "Check $name, which is the old f0"
2603 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
2605 local pattern=0x$($LFS getstripe -L $name)
2606 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2607 error "(5.2) NOT expect pattern flag hole, but got $pattern"
2609 local stripes=$($LFS getstripe -c $name)
2610 if [ $OSTCOUNT -gt 2 ]; then
2611 [ $stripes -eq 3 ] ||
2612 error "(5.3.1) expect the stripe count is 3, but got $stripes"
2614 [ $stripes -eq 2 ] ||
2615 error "(5.3.2) expect the stripe count is 2, but got $stripes"
2618 local size=$(stat $name | awk '/Size:/ { print $2 }')
2619 [ $size -eq $((4096 * $bcount)) ] ||
2620 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
2622 cat $name > /dev/null || error "(5.5) cannot read $name"
2624 echo "dummy" >> $name || error "(5.6) cannot write $name"
2626 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
2628 touch $name || error "(5.8) cannot touch $name"
2630 rm -f $name || error "(5.9) cannot unlink $name"
2633 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
2635 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2636 if [ $OSTCOUNT -gt 2 ]; then
2637 echo "Check $name, it contains the old f1's stripe1 and stripe2"
2639 echo "Check $name, it contains the old f1's stripe1"
2642 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
2644 pattern=0x$($LFS getstripe -L $name)
2645 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2646 error "(6.2) expect pattern flag hole, but got $pattern"
2648 stripes=$($LFS getstripe -c $name)
2649 if [ $OSTCOUNT -gt 2 ]; then
2650 [ $stripes -eq 3 ] ||
2651 error "(6.3.1) expect the stripe count is 3, but got $stripes"
2653 [ $stripes -eq 2 ] ||
2654 error "(6.3.2) expect the stripe count is 2, but got $stripes"
2657 size=$(stat $name | awk '/Size:/ { print $2 }')
2658 [ $size -eq $((4096 * $bcount)) ] ||
2659 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
2661 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# Count the I/O errors dd reports while copying past the hole; grep -c
# yields the same number as the former "grep ... | wc -l".
2663 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2664 bs=4096 2>&1 | grep -c "Input/output error")
2667 [ $failures -eq 256 ] ||
2668 error "(6.6) expect 256 IO failures, but get $failures"
2670 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2671 [ $size -eq $((4096 * $bcount)) ] ||
2672 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
2674 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
2675 error "(6.8) write to the LOV EA hole should fail"
2677 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
2678 error "(6.9) write to normal stripe should NOT fail"
2680 echo "foo" >> $name && error "(6.10) append write $name should fail"
2682 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
2684 touch $name || error "(6.12) cannot touch $name"
2686 rm -f $name || error "(6.13) cannot unlink $name"
2689 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
2691 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
2692 if [ $OSTCOUNT -gt 2 ]; then
2693 echo "Check $name, it contains the old f2's stripe0 and stripe2"
2695 echo "Check $name, it contains the old f2's stripe0"
2698 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
2700 pattern=0x$($LFS getstripe -L $name)
2701 stripes=$($LFS getstripe -c $name)
2702 size=$(stat $name | awk '/Size:/ { print $2 }')
2703 if [ $OSTCOUNT -gt 2 ]; then
2704 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2705 error "(7.2.1) expect pattern flag hole, but got $pattern"
2707 [ $stripes -eq 3 ] ||
2708 error "(7.3.1) expect the stripe count is 3, but got $stripes"
2710 [ $size -eq $((4096 * $bcount)) ] ||
2711 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
2713 cat $name > /dev/null &&
2714 error "(7.5.1) normal read $name should fail"
2716 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2717 bs=4096 2>&1 | grep -c "Input/output error")
2719 [ $failures -eq 256 ] ||
2720 error "(7.6) expect 256 IO failures, but get $failures"
2722 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2723 [ $size -eq $((4096 * $bcount)) ] ||
2724 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
2726 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
2727 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
2729 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
2730 error "(7.8.1) write to normal stripe should NOT fail"
2732 echo "foo" >> $name &&
2733 error "(7.8.3) append write $name should fail"
2735 chown $RUNAS_ID:$RUNAS_GID $name ||
2736 error "(7.9.1) cannot chown on $name"
2738 touch $name || error "(7.10.1) cannot touch $name"
2740 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2741 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
2743 [ $stripes -eq 1 ] ||
2744 error "(7.3.2) expect the stripe count is 1, but got $stripes"
2747 [ $size -eq $((4096 * (256 + 0))) ] ||
2748 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
2750 cat $name > /dev/null || error "(7.5.2) cannot read $name"
2752 echo "dummy" >> $name || error "(7.8.2) cannot write $name"
2754 chown $RUNAS_ID:$RUNAS_GID $name ||
2755 error "(7.9.2) cannot chown on $name"
2757 touch $name || error "(7.10.2) cannot touch $name"
2760 rm -f $name || error "(7.11) cannot unlink $name"
# The f3 checks below only apply when more than 2 OSTs are configured.
2762 [ $OSTCOUNT -le 2 ] && return
2765 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
2767 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2768 echo "Check $name, which contains the old f3's stripe0 and stripe1"
2770 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
2772 pattern=0x$($LFS getstripe -L $name)
2773 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2774 error "(8.2) NOT expect pattern flag hole, but got $pattern"
2776 stripes=$($LFS getstripe -c $name)
2777 # LFSCK does not know the old f3 had 3 stripes.
2778 # It only tries to find as much as possible.
2779 # The stripe count depends on the last stripe's offset.
2780 [ $stripes -eq 2 ] ||
2781 error "(8.3) expect the stripe count is 2, but got $stripes"
2783 size=$(stat $name | awk '/Size:/ { print $2 }')
2785 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
2786 error "(8.4) expect the size $((4096 * 512)), but got $size"
2788 cat $name > /dev/null || error "(8.5) cannot read $name"
2790 echo "dummy" >> $name || error "(8.6) cannot write $name"
2792 chown $RUNAS_ID:$RUNAS_GID $name ||
2793 error "(8.7) cannot chown on $name"
2795 touch $name || error "(8.8) cannot touch $name"
2797 rm -f $name || error "(8.9) cannot unlink $name"
2799 run_test 20 "Handle the orphan with dummy LOV EA slot properly"
# test_21 body (excerpt): starting LFSCK without a "-t" type should start
# both the namespace and the layout components.
#
# The test is skipped when the MDS version is below 2.5.59. It creates 100
# files, starts LFSCK on ${FSNAME}-MDT0000 with "-s 1 -r", checks that both
# components report "scanning-phase1", then stops LFSCK.
# NOTE(review): the function header and closing brace are not visible in
# this numbered listing; $SHOW_LAYOUT is defined outside this excerpt.
2802 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
2803 skip "ignore the test if MDS is older than 2.5.59" && return
2805 check_mount_and_prep
2806 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
2808 echo "Start all LFSCK components by default (-s 1)"
2809 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
2810 error "Fail to start LFSCK"
# Both checks read the status once, immediately after the start.
2812 echo "namespace LFSCK should be in 'scanning-phase1' status"
2813 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
2814 [ "$STATUS" == "scanning-phase1" ] ||
2815 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
2817 echo "layout LFSCK should be in 'scanning-phase1' status"
2818 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
2819 [ "$STATUS" == "scanning-phase1" ] ||
2820 error "Expect layout 'scanning-phase1', but got '$STATUS'"
2822 echo "Stop all LFSCK components by default"
2823 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
2824 error "Fail to stop LFSCK"
2826 run_test 21 "run all LFSCK components by default"
# test_22a body (excerpt): namespace LFSCK must repair a child directory
# whose ".." entry points at a parent that no longer exists.
#
# Flow, as far as the visible lines show:
#   1. skip unless at least 2 MDSes are configured;
#   2. create "guard" and "foo" with "-i 1", then create foo/dummy with
#      "-i 0" while $SINGLEMDS runs with fail_loc 0x161e
#      (OBD_FAIL_LFSCK_BAD_PARENT);
#   3. remove "guard", run namespace LFSCK ("-A -r") and wait for
#      "completed";
#   4. unmatched_pairs_repaired must be 1 and "ls" on foo/dummy must work.
#
# NOTE(review): this numbered listing omits several short lines (function
# header, the body and closing brace of the "|| {" group, the function's
# closing brace, ...). They are not reconstructed here.
2829 [ $MDSCOUNT -lt 2 ] &&
2830 skip "We need at least 2 MDSes for this test" && return
2833 echo "The parent_A references the child directory via some name entry,"
2834 echo "but the child directory back references another parent_B via its"
# The inner quotes are escaped so that ".." is printed with its quotes;
# unescaped they merely closed and reopened the string.
2835 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
2836 echo "LFSCK will repair the child directory's \"..\" name entry."
2839 check_mount_and_prep
2841 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2842 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2844 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2845 echo "The dummy's dotdot name entry references the guard."
2846 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2847 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2848 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2849 error "(3) Fail to mkdir on MDT0"
2850 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing "guard" leaves dummy's ".." entry without an existing target.
2852 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
2854 echo "Trigger namespace LFSCK to repair unmatched pairs"
2855 $START_NAMESPACE -A -r ||
2856 error "(5) Fail to start LFSCK for namespace"
2858 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2859 mdd.${MDT_DEV}.lfsck_namespace |
2860 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2862 error "(6) unexpected status"
2865 local repaired=$($SHOW_NAMESPACE |
2866 awk '/^unmatched_pairs_repaired/ { print $2 }')
2867 [ $repaired -eq 1 ] ||
2868 error "(7) Fail to repair unmatched pairs: $repaired"
2870 echo "'ls' should success after namespace LFSCK repairing"
2871 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
2872 error "(8) ls should success."
2874 run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b body (excerpt): namespace LFSCK must repair a child directory
# whose ".." entry points at an existing parent that has no matching name
# entry, and must repair the child's linkEA as well (per the messages below).
#
# Flow, as far as the visible lines show:
#   1. skip unless at least 2 MDSes are configured;
#   2. create "guard" and "foo" with "-i 1", then create foo/dummy with
#      "-i 0" while $SINGLEMDS runs with fail_loc 0x161e
#      (OBD_FAIL_LFSCK_BAD_PARENT);
#   3. "lfs fid2path" on dummy's FID must NOT return the real path yet;
#   4. run namespace LFSCK ("-A -r") and wait for "completed";
#   5. unmatched_pairs_repaired must be 1 and fid2path must now return
#      $DIR/$tdir/foo/dummy.
#
# NOTE(review): this numbered listing omits several short lines (function
# header, the body and closing brace of the "|| {" group, the function's
# closing brace, ...). They are not reconstructed here.
2877 [ $MDSCOUNT -lt 2 ] &&
2878 skip "We need at least 2 MDSes for this test" && return
2881 echo "The parent_A references the child directory via the name entry_B,"
2882 echo "but the child directory back references another parent_C via its"
# The inner quotes are escaped so that ".." is printed with its quotes;
# unescaped they merely closed and reopened the string.
2883 echo "\"..\" name entry. The parent_C exists, but there is no the name"
2884 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
2885 echo "the child directory's \"..\" name entry and its linkEA."
2888 check_mount_and_prep
2890 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2891 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2893 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2894 echo "and bad linkEA. The dummy's dotdot name entry references the"
2895 echo "guard. The dummy's linkEA references n non-exist name entry."
2896 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2897 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2898 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2899 error "(3) Fail to mkdir on MDT0"
2900 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2902 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
2903 echo "fid2path should NOT work on the dummy's FID $dummyfid"
2904 local dummyname=$($LFS fid2path $DIR $dummyfid)
2905 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
2906 error "(4) fid2path works unexpectedly."
2908 echo "Trigger namespace LFSCK to repair unmatched pairs"
2909 $START_NAMESPACE -A -r ||
2910 error "(5) Fail to start LFSCK for namespace"
2912 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2913 mdd.${MDT_DEV}.lfsck_namespace |
2914 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2916 error "(6) unexpected status"
2919 local repaired=$($SHOW_NAMESPACE |
2920 awk '/^unmatched_pairs_repaired/ { print $2 }')
2921 [ $repaired -eq 1 ] ||
2922 error "(7) Fail to repair unmatched pairs: $repaired"
2924 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
# dummyname is already local (declared above); plain re-assignment suffices.
2925 dummyname=$($LFS fid2path $DIR $dummyfid)
2926 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
2927 error "(8) fid2path does not work"
2929 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a: a name entry whose MDT-object no longer exists (dangling entry)
# must be found by the namespace LFSCK; a second run started with -C must
# make the entry usable again.
# The test is skipped unless at least two MDSes are configured.
2932 [ $MDSCOUNT -lt 2 ] &&
2933 skip "We need at least 2 MDSes for this test" && return
2936 echo "The name entry is there, but the MDT-object for such name "
2937 echo "entry does not exist. The namespace LFSCK should find out "
2938 echo "and repair the inconsistency as required."
2941 check_mount_and_prep
# d0 is requested on MDT index 0, its child d1 on MDT index 1.
2943 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
2944 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
# fail_loc 0x1620 is armed on mds2 only around the rmdir of d0/d1.
2946 echo "Inject failure stub on MDT1 to simulate dangling name entry"
2947 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
2948 do_facet mds2 $LCTL set_param fail_loc=0x1620
2949 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
2950 do_facet mds2 $LCTL set_param fail_loc=0
2952 echo "'ls' should fail because of dangling name entry"
2953 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
# First pass (-A -r): dangling_repaired must read 1, yet ls on d0/d1 is
# still expected to fail afterwards.
2955 echo "Trigger namespace LFSCK to find out dangling name entry"
2956 $START_NAMESPACE -A -r ||
2957 error "(5) Fail to start LFSCK for namespace"
2959 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2960 mdd.${MDT_DEV}.lfsck_namespace |
2961 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2963 error "(6) unexpected status"
2966 local repaired=$($SHOW_NAMESPACE |
2967 awk '/^dangling_repaired/ { print $2 }')
2968 [ $repaired -eq 1 ] ||
2969 error "(7) Fail to repair dangling name entry: $repaired"
2971 echo "'ls' should fail because not re-create MDT-object by default"
2972 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
# Second pass adds -C (presumably "re-create the missing MDT-object";
# confirm against lctl lfsck_start). dangling_repaired must again read 1
# and ls must now succeed.
2974 echo "Trigger namespace LFSCK again to repair dangling name entry"
2975 $START_NAMESPACE -A -r -C ||
2976 error "(9) Fail to start LFSCK for namespace"
2978 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2979 mdd.${MDT_DEV}.lfsck_namespace |
2980 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2982 error "(10) unexpected status"
2985 repaired=$($SHOW_NAMESPACE |
2986 awk '/^dangling_repaired/ { print $2 }')
2987 [ $repaired -eq 1 ] ||
2988 error "(11) Fail to repair dangling name entry: $repaired"
2990 echo "'ls' should success after namespace LFSCK repairing"
2991 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
2993 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b: a hard-link name entry is made to reference a non-existent
# object; one LFSCK run with -r -C must leave the entry pointing at the
# real file again.
2997 echo "The objectA has multiple hard links, one of them corresponding"
2998 echo "to the name entry_B. But there is something wrong for the name"
2999 echo "entry_B and cause entry_B to references non-exist object_C."
3000 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3001 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3002 echo "comes to the second-stage scanning, it will find that the"
3003 echo "former re-creating object_C is not proper, and will try to"
3004 echo "replace the object_C with the real object_A."
3007 check_mount_and_prep
# f0 holds "dummy" and f1 holds "dead"; f1 is unlinked after the injection.
3009 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3010 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3011 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
# fail_loc 0x1621 is armed on $SINGLEMDS only while the hard link foo -> f0
# is created.
3013 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3014 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3015 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
3016 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3017 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3019 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3021 echo "'ls' should fail because of dangling name entry"
3022 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3023 error "(6) ls should fail."
# Run LFSCK and poll until the status field reads "completed".
3025 echo "Trigger namespace LFSCK to find out dangling name entry"
3026 $START_NAMESPACE -r -C ||
3027 error "(7) Fail to start LFSCK for namespace"
3029 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3030 mdd.${MDT_DEV}.lfsck_namespace |
3031 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3033 error "(8) unexpected status"
# Both counters must read 1: the dangling entry was repaired and the
# multiple-linked counter was bumped as well.
3036 local repaired=$($SHOW_NAMESPACE |
3037 awk '/^dangling_repaired/ { print $2 }')
3038 [ $repaired -eq 1 ] ||
3039 error "(9) Fail to repair dangling name entry: $repaired"
3041 repaired=$($SHOW_NAMESPACE |
3042 awk '/^multiple_linked_repaired/ { print $2 }')
3043 [ $repaired -eq 1 ] ||
3044 error "(10) Fail to drop the former created object: $repaired"
# foo must read back the content that was written to f0.
3046 local data=$(cat $DIR/$tdir/d0/foo)
3047 [ "$data" == "dummy" ] ||
3048 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3050 run_test 23b "LFSCK can repair dangling name entry (2)"
# Test 23c: same injection as the previous dangling-entry test, but foo is
# modified while LFSCK is delayed; the modified file must then NOT be
# replaced with the original content.
3054 echo "The objectA has multiple hard links, one of them corresponding"
3055 echo "to the name entry_B. But there is something wrong for the name"
3056 echo "entry_B and cause entry_B to references non-exist object_C."
3057 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3058 echo "as dangling, and re-create the lost object_C. And then others"
3059 echo "modified the re-created object_C. When the LFSCK comes to the"
3060 echo "second-stage scanning, it will find that the former re-creating"
3061 echo "object_C maybe wrong and try to replace the object_C with the"
3062 echo "real object_A. But because object_C has been modified, so the"
3063 echo "LFSCK cannot replace it."
3066 check_mount_and_prep
3068 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3069 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3070 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
# fail_loc 0x1621 is armed on $SINGLEMDS only while the hard link foo -> f0
# is created.
3072 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3073 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3074 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
3075 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3076 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3078 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3080 echo "'ls' should fail because of dangling name entry"
3081 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3082 error "(6) ls should fail."
# fail_loc 0x1602 with fail_val=10 is set before LFSCK starts and stays
# active until foo has been appended to below.
3084 #define OBD_FAIL_LFSCK_DELAY3 0x1602
3085 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3087 echo "Trigger namespace LFSCK to find out dangling name entry"
3088 $START_NAMESPACE -r -C ||
3089 error "(7) Fail to start LFSCK for namespace"
# Poll until stat on d0/foo reports Size 0 (presumably the freshly
# re-created, still empty object). On timeout, dump stat of that same file;
# the former diagnostic stat'ed a "guard" path that this test never creates.
3091 wait_update_facet client "stat $DIR/$tdir/d0/foo |
3092 awk '/Size/ { print \\\$2 }'" "0" 32 || {
3093 stat $DIR/$tdir/d0/foo
3095 error "(8) unexpected size"
# Modify the file while LFSCK is still delayed, then drop the client's OSC
# locks and release the delay.
3098 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
3099 cancel_lru_locks osc
3101 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3102 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3103 mdd.${MDT_DEV}.lfsck_namespace |
3104 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3106 error "(10) unexpected status"
3109 local repaired=$($SHOW_NAMESPACE |
3110 awk '/^dangling_repaired/ { print $2 }')
3111 [ $repaired -eq 1 ] ||
3112 error "(11) Fail to repair dangling name entry: $repaired"
# The content must differ from f0's "dummy": the modified file was kept.
3114 local data=$(cat $DIR/$tdir/d0/foo)
3115 [ "$data" != "dummy" ] ||
3116 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
3118 run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24: two MDT-objects carry a linkEA entry for the same name, but the
# name entry references only one of them. After LFSCK the unreferenced
# object is expected under .lustre/lost+found/MDT0000/.
# The test is skipped unless at least two MDSes are configured.
3121 [ $MDSCOUNT -lt 2 ] &&
3122 skip "We need at least 2 MDSes for this test" && return
3125 echo "Two MDT-objects back reference the same name entry via their"
3126 echo "each own linkEA entry, but the name entry only references one"
3127 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
3128 echo "for the MDT-object that is not recognized. If such MDT-object"
3129 echo "has no other linkEA entry after the removing, then the LFSCK"
3130 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
3133 check_mount_and_prep
# NOTE(review): the first two error labels below are both "(1)".
3135 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
# The FIDs of guard and dummy are printed to the test output.
3137 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
3138 $LFS path2fid $DIR/$tdir/d0/guard
3140 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
3141 $LFS path2fid $DIR/$tdir/d0/dummy
# NOTE(review): two pfid assignments follow the backend-type check; the
# line separating the branches is not visible in this listing - confirm
# which FID (guard or dummy) is used for ldiskfs.
3144 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
3145 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
3147 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
3150 touch $DIR/$tdir/d0/guard/foo ||
3151 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
3153 echo "Inject failure stub on MDT0 to simulate the case that"
3154 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
3155 echo "that references $DIR/$tdir/d0/guard/foo."
3156 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
3157 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
3158 echo "there with the same linkEA entry as another MDT-object"
3159 echo "$DIR/$tdir/d0/guard/foo has"
# fail_loc 0x1622 stays armed across both the mkdir and the rmdir of
# dummy/foo; cfid is captured in between, before the name entry is removed.
3161 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
3162 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
3163 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
3164 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
3165 $LFS path2fid $DIR/$tdir/d0/dummy/foo
3166 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
3167 rmdir $DIR/$tdir/d0/dummy/foo ||
3168 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
3169 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3171 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
3172 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
3173 error "(6) stat successfully unexpectedly"
# Run LFSCK, poll for "completed", then require exactly one
# multiple-referenced repair.
3175 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
3176 $START_NAMESPACE -A -r ||
3177 error "(7) Fail to start LFSCK for namespace"
3179 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3180 mdd.${MDT_DEV}.lfsck_namespace |
3181 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3183 error "(8) unexpected status"
3186 local repaired=$($SHOW_NAMESPACE |
3187 awk '/^multiple_referenced_repaired/ { print $2 }')
3188 [ $repaired -eq 1 ] ||
3189 error "(9) Fail to repair multiple referenced name entry: $repaired"
3191 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3192 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3193 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The expected orphan name is built as <cfid>-<pfid>-D-0.
3195 local cname="$cfid-$pfid-D-0"
3196 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
3197 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
3199 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25: a name entry is created with a wrong file type and the namespace
# LFSCK must correct it.
# The test is skipped when the $SINGLEMDS backend is not ldiskfs.
3202 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3203 skip "Only support to inject failure on ldiskfs" && return
3206 echo "The file type in the name entry does not match the file type"
3207 echo "claimed by the referenced object. Then the LFSCK will update"
3208 echo "the file type in the name entry."
3211 check_mount_and_prep
3213 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3215 echo "Inject failure stub on MDT0 to simulate the case that"
3216 echo "the file type stored in the name entry is wrong."
# fail_loc 0x1623 is armed on $SINGLEMDS only while d0/foo is created.
3218 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
3219 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
3220 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
3221 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# This run uses only -r; it is polled until the status reads "completed".
3223 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
3224 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
3226 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3227 mdd.${MDT_DEV}.lfsck_namespace |
3228 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3230 error "(4) unexpected status"
# Exactly one bad file type must be reported as repaired, and listing the
# directory must work afterwards.
3233 local repaired=$($SHOW_NAMESPACE |
3234 awk '/^bad_file_type_repaired/ { print $2 }')
3235 [ $repaired -eq 1 ] ||
3236 error "(5) Fail to repair bad file type in name entry: $repaired"
3238 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
3240 run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a: foo's name entry is removed while its object and linkEA are
# kept; the namespace LFSCK must add the name entry back and foo must keep
# its FID.
3244 echo "The local name entry back referenced by the MDT-object is lost."
3245 echo "The namespace LFSCK will add the missing local name entry back"
3246 echo "to the normal namespace."
3249 check_mount_and_prep
# foo's FID is recorded up front so it can be compared after the repair.
3251 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3252 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3253 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3255 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3256 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3258 echo "Inject failure stub on MDT0 to simulate the case that"
3259 echo "foo's name entry will be removed, but the foo's object"
3260 echo "and its linkEA are kept in the system."
# fail_loc 0x1624 is armed on $SINGLEMDS only around the unlink of foo.
3262 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3263 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3264 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The message is passed to error; previously the bare string was executed
# as a command, so an unexpected ls success could never fail the test.
3267 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 && error "(5) 'ls' should fail"
3269 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3270 $START_NAMESPACE -r -A ||
3271 error "(6) Fail to start LFSCK for namespace"
3273 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3274 mdd.${MDT_DEV}.lfsck_namespace |
3275 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3277 error "(7) unexpected status"
# Exactly one lost dirent must be reported as repaired.
3280 local repaired=$($SHOW_NAMESPACE |
3281 awk '/^lost_dirent_repaired/ { print $2 }')
3282 [ $repaired -eq 1 ] ||
3283 error "(8) Fail to repair lost dirent: $repaired"
3285 ls -ail $DIR/$tdir/d0/foo ||
3286 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored entry must resolve to the same FID as before.
3288 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3289 [ "$foofid" == "$foofid2" ] ||
3290 error "(10) foo's FID changed: $foofid, $foofid2"
3292 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b: remote variant - d0 is requested on MDT index 1 and its child
# directory foo on MDT index 0. foo's name entry is removed while its
# object and linkEA are kept; LFSCK must add the entry back with the same
# FID. The test is skipped unless at least two MDSes are configured.
3295 [ $MDSCOUNT -lt 2 ] &&
3296 skip "We need at least 2 MDSes for this test" && return
3299 echo "The remote name entry back referenced by the MDT-object is lost."
3300 echo "The namespace LFSCK will add the missing remote name entry back"
3301 echo "to the normal namespace."
3304 check_mount_and_prep
3306 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3307 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
3308 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3310 echo "Inject failure stub on MDT0 to simulate the case that"
3311 echo "foo's name entry will be removed, but the foo's object"
3312 echo "and its linkEA are kept in the system."
# fail_loc 0x1624 is armed on $SINGLEMDS only around the rmdir of foo.
3314 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3315 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3316 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3317 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The message is passed to error; previously the bare string was executed
# as a command, so an unexpected ls success could never fail the test.
3319 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 && error "(4) 'ls' should fail"
3321 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3322 $START_NAMESPACE -r -A ||
3323 error "(5) Fail to start LFSCK for namespace"
3325 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3326 mdd.${MDT_DEV}.lfsck_namespace |
3327 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3329 error "(6) unexpected status"
# Exactly one lost dirent must be reported as repaired.
3332 local repaired=$($SHOW_NAMESPACE |
3333 awk '/^lost_dirent_repaired/ { print $2 }')
3334 [ $repaired -eq 1 ] ||
3335 error "(7) Fail to repair lost dirent: $repaired"
3337 ls -ail $DIR/$tdir/d0/foo ||
3338 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored entry must resolve to the same FID as before.
3340 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3341 [ "$foofid" == "$foofid2" ] ||
3342 error "(9) foo's FID changed: $foofid, $foofid2"
3344 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a: both hard links of an object are removed under fault injection
# and the parent directory is deleted; after LFSCK an entry matching
# *-P-* must show up under .lustre/lost+found/MDT0000/.
3348 echo "The local parent referenced by the MDT-object linkEA is lost."
3349 echo "The namespace LFSCK will re-create the lost parent as orphan."
3352 check_mount_and_prep
3354 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3355 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3356 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3357 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3359 echo "Inject failure stub on MDT0 to simulate the case that"
3360 echo "foo's name entry will be removed, but the foo's object"
3361 echo "and its linkEA are kept in the system. And then remove"
3362 echo "another hard link and the parent directory."
# fail_loc 0x1624 stays armed on $SINGLEMDS while both foo and dummy are
# unlinked; d0 itself is removed after the fault has been cleared.
3364 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3365 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3366 rm -f $DIR/$tdir/d0/foo ||
3367 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3368 rm -f $DIR/$tdir/d0/dummy ||
3369 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
3370 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3372 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
# The message is passed to error; previously the bare string was executed
# as a command, so an unexpected ls success could never fail the test.
3373 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
3375 echo "Trigger namespace LFSCK to repair the lost parent"
3376 $START_NAMESPACE -r -A ||
3377 error "(6) Fail to start LFSCK for namespace"
3379 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3380 mdd.${MDT_DEV}.lfsck_namespace |
3381 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3383 error "(7) unexpected status"
# Exactly one lost dirent must be reported as repaired.
3386 local repaired=$($SHOW_NAMESPACE |
3387 awk '/^lost_dirent_repaired/ { print $2 }')
3388 [ $repaired -eq 1 ] ||
3389 error "(8) Fail to repair lost dirent: $repaired"
3391 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3392 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3393 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3395 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find receives it verbatim instead of the
# shell expanding it against the current working directory first.
3397 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
3398 [ ! -z "$cname" ] ||
3399 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
3401 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b: remote variant - d0 is requested on MDT index 1 and its child
# foo on MDT index 0. foo's name entry is removed under fault injection and
# d0 is deleted; after LFSCK an entry matching *-P-* must show up under
# .lustre/lost+found/MDT0001/.
# The test is skipped unless at least two MDSes are configured.
3404 [ $MDSCOUNT -lt 2 ] &&
3405 skip "We need at least 2 MDSes for this test" && return
3408 echo "The remote parent referenced by the MDT-object linkEA is lost."
3409 echo "The namespace LFSCK will re-create the lost parent as orphan."
3412 check_mount_and_prep
3414 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3415 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# d0's FID is printed to the test output.
3417 $LFS path2fid $DIR/$tdir/d0
3419 echo "Inject failure stub on MDT0 to simulate the case that"
3420 echo "foo's name entry will be removed, but the foo's object"
3421 echo "and its linkEA are kept in the system. And then remove"
3422 echo "the parent directory."
# fail_loc 0x1624 is armed on $SINGLEMDS only around the rmdir of foo; d0
# is removed after the fault has been cleared.
3424 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3425 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3426 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3427 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3429 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
# The message is passed to error; previously the bare string was executed
# as a command, so an unexpected ls success could never fail the test.
3430 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
3432 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3433 $START_NAMESPACE -r -A ||
3434 error "(6) Fail to start LFSCK for namespace"
3436 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3437 mdd.${MDT_DEV}.lfsck_namespace |
3438 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3440 error "(7) unexpected status"
# Exactly one lost dirent must be reported as repaired.
3443 local repaired=$($SHOW_NAMESPACE |
3444 awk '/^lost_dirent_repaired/ { print $2 }')
3445 [ $repaired -eq 1 ] ||
3446 error "(8) Fail to repair lost dirent: $repaired"
3448 ls -ail $MOUNT/.lustre/lost+found/
3450 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
3451 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
3452 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
3454 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The pattern is quoted so that find receives it verbatim instead of the
# shell expanding it against the current working directory first.
3456 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
3457 [ ! -z "$cname" ] ||
3458 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
3460 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28: name entries are lost on two MDTs and one MDS is additionally
# made to fail LFSCK requests. The first LFSCK run must repair only on mds2
# and leave mds1 "partial"; a second run without the fault must repair the
# remaining entry on mds1.
# The test is skipped unless at least two MDSes are configured.
3463 [ $MDSCOUNT -lt 2 ] &&
3464 skip "The test needs at least 2 MDTs" && return
3467 echo "The target name entry is lost. The LFSCK should insert the"
3468 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
3469 echo "the MDT (on which the orphan MDT-object resides) has ever"
3470 echo "failed to respond some name entry verification during the"
3471 echo "first stage-scanning, then the LFSCK should skip to handle"
3472 echo "orphan MDT-object on this MDT. But other MDTs should not"
# Layout: d1 is requested on MDT index 0 with children a1/a2 on index 1;
# d2 is requested on index 1 with children a1/a2 on index 0.
3476 check_mount_and_prep
3477 $LFS mkdir -i 0 $DIR/$tdir/d1
3478 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
3479 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
3481 $LFS mkdir -i 1 $DIR/$tdir/d2
3482 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
3483 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
3485 echo "Inject failure stub on MDT0 to simulate the case that"
3486 echo "d1/a1's name entry will be removed, but the d1/a1's object"
3487 echo "and its linkEA are kept in the system. And the case that"
3488 echo "d2/a2's name entry will be removed, but the d2/a2's object"
3489 echo "and its linkEA are kept in the system."
# fail_loc 0x1624 is armed on both mds1 and mds2 while d1/a1 and d2/a2 are
# removed, then cleared on both.
3491 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3492 do_facet mds1 $LCTL set_param fail_loc=0x1624
3493 do_facet mds2 $LCTL set_param fail_loc=0x1624
3494 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
3495 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
3496 do_facet mds1 $LCTL set_param fail_loc=0
3497 do_facet mds2 $LCTL set_param fail_loc=0
3499 cancel_lru_locks mdc
3500 cancel_lru_locks osc
3502 echo "Inject failure, to simulate the MDT0 fail to handle"
3503 echo "MDT1 LFSCK request during the first-stage scanning."
# fail_loc 0x161c with fail_val=0 is set on mds2 and stays active for the
# whole first LFSCK run.
3504 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
3505 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
3507 echo "Trigger namespace LFSCK on all devices to find out orphan object"
3508 $START_NAMESPACE -r -A ||
3509 error "(3) Fail to start LFSCK for namespace"
# First run: mds1 is expected to end as "partial", mds2 as "completed".
3511 wait_update_facet mds1 "$LCTL get_param -n \
3512 mdd.$(facet_svc mds1).lfsck_namespace |
3513 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
3514 error "(4) mds1 is not the expected 'partial'"
3517 wait_update_facet mds2 "$LCTL get_param -n \
3518 mdd.$(facet_svc mds2).lfsck_namespace |
3519 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3520 error "(5) mds2 is not the expected 'completed'"
3523 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Counters after the first run: lost_dirent_repaired is 0 on mds1 and 1 on
# mds2.
3525 local repaired=$(do_facet mds1 $LCTL get_param -n \
3526 mdd.$(facet_svc mds1).lfsck_namespace |
3527 awk '/^lost_dirent_repaired/ { print $2 }')
3528 [ $repaired -eq 0 ] ||
3529 error "(6) Expect 0 fixed on mds1, but got: $repaired"
3531 repaired=$(do_facet mds2 $LCTL get_param -n \
3532 mdd.$(facet_svc mds2).lfsck_namespace |
3533 awk '/^lost_dirent_repaired/ { print $2 }')
3534 [ $repaired -eq 1 ] ||
3535 error "(7) Expect 1 fixed on mds2, but got: $repaired"
# Second run with the fault cleared: every MDS must reach "completed".
3537 echo "Trigger namespace LFSCK on all devices again to cleanup"
3538 $START_NAMESPACE -r -A ||
3539 error "(8) Fail to start LFSCK for namespace"
3541 for k in $(seq $MDSCOUNT); do
3542 # The LFSCK status query internal is 30 seconds. For the case
3543 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3544 # time to guarantee the status sync up.
3545 wait_update_facet mds${k} "$LCTL get_param -n \
3546 mdd.$(facet_svc mds${k}).lfsck_namespace |
3547 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3548 error "(9) MDS${k} is not the expected 'completed'"
# Counters after the second run: lost_dirent_repaired is 1 on mds1 and 0
# on mds2.
3551 local repaired=$(do_facet mds1 $LCTL get_param -n \
3552 mdd.$(facet_svc mds1).lfsck_namespace |
3553 awk '/^lost_dirent_repaired/ { print $2 }')
3554 [ $repaired -eq 1 ] ||
3555 error "(10) Expect 1 fixed on mds1, but got: $repaired"
3557 repaired=$(do_facet mds2 $LCTL get_param -n \
3558 mdd.$(facet_svc mds2).lfsck_namespace |
3559 awk '/^lost_dirent_repaired/ { print $2 }')
3560 [ $repaired -eq 0 ] ||
3561 error "(11) Expect 0 fixed on mds2, but got: $repaired"
3563 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a: a hard link is created under fault injection so that foo's
# nlink reads 3 with only two names; LFSCK must bring it back to 2.
3567 echo "The object's nlink attribute is larger than the object's known"
3568 echo "name entries count. The LFSCK will repair the object's nlink"
3569 echo "attribute to match the known name entries count"
3572 check_mount_and_prep
3574 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3575 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3577 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3578 echo "nlink attribute is larger than its name entries count."
# fail_loc 0x1625 is armed on $SINGLEMDS only while the hard link h1 is
# created.
3580 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
3581 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
3582 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3583 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3584 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Verify the injection took effect: the link count must read 3 after the
# client's MDC locks are cancelled.
3586 cancel_lru_locks mdc
3587 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3588 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
3590 echo "Trigger namespace LFSCK to repair the nlink count"
3591 $START_NAMESPACE -r -A ||
3592 error "(5) Fail to start LFSCK for namespace"
3594 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3595 mdd.${MDT_DEV}.lfsck_namespace |
3596 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3598 error "(6) unexpected status"
# Exactly one nlink repair must be reported, and stat must then show 2.
3601 local repaired=$($SHOW_NAMESPACE |
3602 awk '/^nlinks_repaired/ { print $2 }')
3603 [ $repaired -eq 1 ] ||
3604 error "(7) Fail to repair nlink count: $repaired"
3606 cancel_lru_locks mdc
3607 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3608 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3610 run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b: a hard link is created under fault injection so that foo's
# nlink stays at 1 although it has two names; LFSCK must raise it to 2.
3614 echo "The object's nlink attribute is smaller than the object's known"
3615 echo "name entries count. The LFSCK will repair the object's nlink"
3616 echo "attribute to match the known name entries count"
3619 check_mount_and_prep
3621 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3622 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3624 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3625 echo "nlink attribute is smaller than its name entries count."
# fail_loc 0x1626 is armed on $SINGLEMDS only while the hard link h1 is
# created.
3627 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
3628 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
3629 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3630 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3631 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Verify the injection took effect: the link count must read 1 after the
# client's MDC locks are cancelled.
3633 cancel_lru_locks mdc
3634 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3635 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
3637 echo "Trigger namespace LFSCK to repair the nlink count"
3638 $START_NAMESPACE -r -A ||
3639 error "(5) Fail to start LFSCK for namespace"
3641 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3642 mdd.${MDT_DEV}.lfsck_namespace |
3643 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3645 error "(6) unexpected status"
# Exactly one nlink repair must be reported, and stat must then show 2.
3648 local repaired=$($SHOW_NAMESPACE |
3649 awk '/^nlinks_repaired/ { print $2 }')
3650 [ $repaired -eq 1 ] ||
3651 error "(7) Fail to repair nlink count: $repaired"
3653 cancel_lru_locks mdc
3654 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3655 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3657 run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c: a second hard link is added under fault injection so that stat
# shows 3 links while fid2path lists only 2 paths. LFSCK must leave both
# numbers untouched and report nlinks_repaired == 0.
3661 echo "There are too many hard links to the object, and exceeds the"
3662 echo "object's linkEA limitation, as to NOT all the known name entries"
3663 echo "will be recorded in the linkEA. Under such case, LFSCK should"
3664 echo "skip the nlink verification for this object."
3667 check_mount_and_prep
3669 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3670 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3671 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3672 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3674 echo "Inject failure stub on MDT0 to simulate the case that"
3675 echo "foo's hard links exceed the object's linkEA limitation."
# fail_loc 0x1627 is set before h2 is linked and stays active until LFSCK
# has completed; it is cleared further below.
3677 #define OBD_FAIL_LFSCK_LINKEA_OVERFLOW 0x1627
3678 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1627
3679 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h2 ||
3680 error "(4) Fail to hard link to $DIR/$tdir/d0/foo"
3682 cancel_lru_locks mdc
3684 local count1=$(stat --format=%h $DIR/$tdir/d0/foo)
3685 [ $count1 -eq 3 ] || error "(5) Stat failure: $count1"
# fid2path is run once for the log and once to count the returned paths.
3687 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3688 $LFS fid2path $DIR $foofid
3689 local count2=$($LFS fid2path $DIR $foofid | wc -l)
# The message is passed to error; previously the bare string was executed
# as a command, so a failed injection could never fail the test.
3690 [ $count2 -eq 2 ] || error "(6) Fail to inject error: $count2"
3692 echo "Trigger namespace LFSCK to repair the nlink count"
3693 $START_NAMESPACE -r -A ||
3694 error "(7) Fail to start LFSCK for namespace"
3696 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3697 mdd.${MDT_DEV}.lfsck_namespace |
3698 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3700 error "(8) unexpected status"
# No nlink repair may be reported for this object.
3703 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3704 local repaired=$($SHOW_NAMESPACE |
3705 awk '/^nlinks_repaired/ { print $2 }')
3706 [ $repaired -eq 0 ] ||
3707 error "(9) Repair nlink count unexpcetedly: $repaired"
# Link count and fid2path path count must be unchanged after LFSCK.
3709 cancel_lru_locks mdc
3711 count1=$(stat --format=%h $DIR/$tdir/d0/foo)
3712 [ $count1 -eq 3 ] || error "(10) Stat failure: $count1"
3714 count2=$($LFS fid2path $DIR $foofid | wc -l)
3715 [ $count2 -eq 2 ] ||
3716 error "(11) Repaired something unexpectedly: $count2"
3718 run_test 29c "Not verify nlink attr if hark links exceed linkEA limitation"
# Test 30: objects lose their name entries under fault injection, e2fsck is
# run on the stopped MDT, and the namespace LFSCK must then move the
# orphans back: f0 to its original path and d0 (with d1 and f1) under
# .lustre/lost+found/MDT0000/.
# The test is skipped when the $SINGLEMDS backend is not ldiskfs.
3721 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3722 skip "Only support backend /lost+found for ldiskfs" && return
3725 echo "The namespace LFSCK will move the orphans from backend"
3726 echo "/lost+found directory to normal client visible namespace"
3727 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
3730 check_mount_and_prep
3732 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
# The error text now names f0, the file this command actually touches.
3733 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
3735 echo "Inject failure stub on MDT0 to simulate the case that"
3736 echo "directory d0 has no linkEA entry, then the LFSCK will"
3737 echo "move it into .lustre/lost+found/MDTxxxx/ later."
# fail_loc 0x161d is armed on $SINGLEMDS only while foo/d0 is created.
3739 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
3740 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
3741 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
3742 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3744 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
3745 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
3747 echo "Inject failure stub on MDT0 to simulate the case that the"
3748 echo "object's name entry will be removed, but not destroy the"
3749 echo "object. Then backend e2fsck will handle it as orphan and"
3750 echo "add them into the backend /lost+found directory."
# fail_loc 0x1624 stays armed while d1, f1, d0 and f0 are removed.
3752 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3753 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3754 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
3755 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
3756 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
3757 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
3758 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Unmount the client, stop the MDT, run e2fsck -y on its device, then start
# the MDT again with $MOUNT_OPTS_NOSCRUB.
3760 umount_client $MOUNT || error "(10) Fail to stop client!"
3762 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
3765 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
3766 error "(12) Fail to run e2fsck"
3768 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
3769 error "(13) Fail to start MDT0"
3771 echo "Trigger namespace LFSCK to recover backend orphans"
3772 $START_NAMESPACE -r -A ||
3773 error "(14) Fail to start LFSCK for namespace"
3775 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3776 mdd.${MDT_DEV}.lfsck_namespace |
3777 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3779 error "(15) unexpected status"
# At least four objects must be reported as moved out of the backend
# lost+found.
3782 local repaired=$($SHOW_NAMESPACE |
3783 awk '/^local_lost_found_moved/ { print $2 }')
3784 [ $repaired -ge 4 ] ||
3785 error "(16) Fail to recover backend orphans: $repaired"
3787 mount_client $MOUNT || error "(17) Fail to start client!"
# The message is passed to error; previously the bare string was executed
# as a command, so a missing f0 could never fail the test.
3789 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
3791 ls -ail $MOUNT/.lustre/lost+found/
3793 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
3794 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3795 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3797 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find receives it verbatim instead of the
# shell expanding it against the current working directory first.
3799 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-*-D-*')
3800 [ ! -z "$cname" ] || error "(20) d0 is not recovered"
# d1 and f1 must be reachable beneath the recovered orphan directory.
3802 stat ${cname}/d1 || error "(21) d0 is not recovered"
3803 stat ${cname}/f1 || error "(22) f1 is not recovered"
3805 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test_31a: name entries of a striped directory that were inserted into the
# wrong shard (forced into the first shard, fail_val=0) must be found and
# repaired by the namespace LFSCK.
#
# Globals (read): MDSCOUNT DIR tdir MOUNT LFS LCTL SINGLEMDS MDT_DEV
#                 START_NAMESPACE SHOW_NAMESPACE
# Returns: 0 when the test is skipped (fewer than 2 MDTs); every failure is
#          reported through error().
test_31a() {
	# Explicit if/return instead of "cond && skip && return": with the
	# chained form a non-zero status from skip let the test body run on
	# an unsupported configuration.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	local i
	local repaired

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-first"
	echo "shard, but inserted into the first shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
	# Set locally (no do_facet): the fault is injected on the client.
	$LCTL set_param fail_loc=0x1628 fail_val=0
	createmany -d "$DIR/$tdir/striped_dir/d" "$MDSCOUNT" ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	# START_NAMESPACE holds a whole command line; it must stay unquoted.
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Print the LFSCK namespace state before failing.
		$SHOW_NAMESPACE
		error "(4) unexpected status"
	}

	# Assign separately from "local" so the declaration does not mask the
	# pipeline status; keep "[ ]" with a quoted operand so that an empty
	# counter is a test failure instead of being evaluated as 0.
	repaired=$($SHOW_NAMESPACE |
		   awk '/^name_hash_repaired/ { print $2 }')
	[ "$repaired" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	# Remount the client before checking the repaired entries.
	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	for ((i = 0; i < $MDSCOUNT; i++)); do
		stat "$DIR/$tdir/striped_dir/d$i" ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir "$DIR/$tdir/striped_dir/d$i" ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test_31b: same scenario as the first bad-name-hash test, but the entries
# are forced into the second shard (fail_val=1), so the LFSCK status and the
# repair counter are read from mds2.
#
# Globals (read): MDSCOUNT DIR tdir MOUNT LFS LCTL START_NAMESPACE
# Returns: 0 when the test is skipped (fewer than 2 MDTs); every failure is
#          reported through error().
test_31b() {
	# Explicit if/return instead of "cond && skip && return": with the
	# chained form a non-zero status from skip let the test body run on
	# an unsupported configuration.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	local i
	local repaired

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-second"
	echo "shard, but inserted into the secod shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
	# Set locally (no do_facet): the fault is injected on the client.
	$LCTL set_param fail_loc=0x1628 fail_val=1
	createmany -d "$DIR/$tdir/striped_dir/d" "$MDSCOUNT" ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	# START_NAMESPACE holds a whole command line; it must stay unquoted.
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(4) unexpected status"

	# Assign separately from "local" so the declaration does not mask the
	# pipeline status; keep "[ ]" with a quoted operand so that an empty
	# counter is a test failure instead of being evaluated as 0.
	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^name_hash_repaired/ { print $2 }')
	[ "$repaired" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	# Remount the client before checking the repaired entries.
	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	for ((i = 0; i < $MDSCOUNT; i++)); do
		stat "$DIR/$tdir/striped_dir/d$i" ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir "$DIR/$tdir/striped_dir/d$i" ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test_31c: the striped directory is created while fail_loc 0x1629 is set on
# the MDS, nothing is created under it afterwards, and the namespace LFSCK is
# expected to report exactly one repaired striped directory.
#
# Globals (read): MDSCOUNT DIR tdir MOUNT LFS LCTL SINGLEMDS MDT_DEV
#                 START_NAMESPACE SHOW_NAMESPACE
# Returns: 0 when the test is skipped (fewer than 2 MDTs); every failure is
#          reported through error().
test_31c() {
	# Explicit if/return instead of "cond && skip && return": with the
	# chained form a non-zero status from skip let the test body run on
	# an unsupported configuration.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	local repaired
	local empty

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If nobody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should re-generate the master LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to re-generate master LMV EA"
	# START_NAMESPACE holds a whole command line; it must stay unquoted.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Print the LFSCK namespace state before failing.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	# Assign separately from "local" so the declaration does not mask the
	# pipeline status; keep "[ ]" with a quoted operand so that an empty
	# counter is a test failure instead of being evaluated as 0.
	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_dirs_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate master LMV EA: $repaired"

	# Remount the client before looking at the directory again.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	# Nothing was created under the directory, so the listing must be
	# empty.
	empty=$(ls "$DIR/$tdir/striped_dir/")
	[ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(8) Fail to remove the striped directory after LFSCK"
}
run_test 31c "Re-generate the lost master LMV EA for striped directory"
# test_31d: the striped directory is created while fail_loc 0x1629 is set on
# the MDS and a file is then created under it. The namespace LFSCK must NOT
# count a repaired striped directory; instead further creation under the
# directory must fail until "chattr -i" is applied.
#
# Globals (read): MDSCOUNT DIR tdir MOUNT LFS LCTL SINGLEMDS MDT_DEV
#                 START_NAMESPACE SHOW_NAMESPACE
# Returns: 0 when the test is skipped (fewer than 2 MDTs); every failure is
#          reported through error().
test_31d() {
	# Explicit if/return instead of "cond && skip && return": with the
	# chained form a non-zero status from skip let the test body run on
	# an unsupported configuration.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	local repaired

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If somebody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should NOT re-generate the master LMV EA, instead, it should"
	echo "change the broken striped dirctory as read-only to prevent"
	echo "further damage"

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0

	# Remount the client before modifying the broken directory.
	umount_client $MOUNT || error "(2) umount failed"
	mount_client $MOUNT || error "(3) mount failed"

	touch "$DIR/$tdir/striped_dir/dummy" ||
		error "(4) Fail to touch under broken striped directory"

	echo "Trigger namespace LFSCK to find out the inconsistency"
	# START_NAMESPACE holds a whole command line; it must stay unquoted.
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Print the LFSCK namespace state before failing.
		$SHOW_NAMESPACE
		error "(6) unexpected status"
	}

	# Assign separately from "local" so the declaration does not mask the
	# pipeline status. Keep "[ ]" with a quoted operand: an empty counter
	# must fail here, whereas "[[ ]]" would evaluate it as 0 and pass.
	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_dirs_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(7) Re-generate master LMV EA unexpected: $repaired"

	stat "$DIR/$tdir/striped_dir/dummy" ||
		error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"

	# Creation must be refused now; success is the failure case.
	touch "$DIR/$tdir/striped_dir/foo" &&
		error "(9) The broken striped directory should be read-only"

	# Clear the immutable flag so that the directory can be removed.
	chattr -i "$DIR/$tdir/striped_dir" ||
		error "(10) Fail to chattr on the broken striped directory"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(11) Fail to remove the striped directory after LFSCK"
}
run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test_31e: the striped directory is created while fail_loc 0x162a with
# fail_val=0 is set on the MDS; the namespace LFSCK is expected to report
# exactly one repaired shard on that MDT.
#
# Globals (read): MDSCOUNT DIR tdir LFS LCTL SINGLEMDS MDT_DEV
#                 START_NAMESPACE SHOW_NAMESPACE
# Returns: 0 when the test is skipped (fewer than 2 MDTs); every failure is
#          reported through error().
test_31e() {
	# Explicit if/return instead of "cond && skip && return": with the
	# chained form a non-zero status from skip let the test body run on
	# an unsupported configuration.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	local repaired

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on the same MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	# START_NAMESPACE holds a whole command line; it must stay unquoted.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Print the LFSCK namespace state before failing.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	# Assign separately from "local" so the declaration does not mask the
	# pipeline status; keep "[ ]" with a quoted operand so that an empty
	# counter is a test failure instead of being evaluated as 0.
	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test_31f: the striped directory is created while fail_loc 0x162a with
# fail_val=1 is set on the MDS; the LFSCK status and the repaired-shard
# counter (expected to be exactly 1) are read from mds2.
#
# Globals (read): MDSCOUNT DIR tdir LFS LCTL SINGLEMDS START_NAMESPACE
# Returns: 0 when the test is skipped (fewer than 2 MDTs); every failure is
#          reported through error().
test_31f() {
	# Explicit if/return instead of "cond && skip && return": with the
	# chained form a non-zero status from skip let the test body run on
	# an unsupported configuration.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	local repaired

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on differnt MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	# START_NAMESPACE holds a whole command line; it must stay unquoted.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(3) unexpected status"

	# Assign separately from "local" so the declaration does not mask the
	# pipeline status; keep "[ ]" with a quoted operand so that an empty
	# counter is a test failure instead of being evaluated as 0.
	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test_31g: the striped directory is created while fail_loc 0x162b with
# fail_val=0 is set on the MDS; the namespace LFSCK is expected to report
# exactly one repaired shard, after which files can be created and removed
# under the directory again.
#
# Globals (read): MDSCOUNT DIR tdir MOUNT LFS LCTL SINGLEMDS MDT_DEV
#                 START_NAMESPACE SHOW_NAMESPACE
# Returns: 0 when the test is skipped (fewer than 2 MDTs); every failure is
#          reported through error().
test_31g() {
	# Explicit if/return instead of "cond && skip && return": with the
	# chained form a non-zero status from skip let the test body run on
	# an unsupported configuration.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	local repaired

	echo "For some reason, the stripe index in the slave LMV EA is"
	echo "corrupted. The LFSCK should repair the slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave LMV EA on the first shard of the striped directory"
	echo "claims the same index as the second shard claims"

	#define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the slave LMV EA"
	# START_NAMESPACE holds a whole command line; it must stay unquoted.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Print the LFSCK namespace state before failing.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	# Assign separately from "local" so the declaration does not mask the
	# pipeline status; keep "[ ]" with a quoted operand so that an empty
	# counter is a test failure instead of being evaluated as 0.
	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair slave LMV EA: $repaired"

	# Remount the client before using the repaired directory.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch "$DIR/$tdir/striped_dir/foo" ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f "$DIR/$tdir/striped_dir/foo" ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31g "Repair the corrupted slave LMV EA"
# test_31h: the striped directory is created while fail_loc 0x162c with
# fail_val=0 is set on the MDS; the namespace LFSCK is expected to report
# exactly one repaired dirent, after which files can be created and removed
# under the directory again.
#
# Globals (read): MDSCOUNT DIR tdir MOUNT LFS LCTL SINGLEMDS MDT_DEV
#                 START_NAMESPACE SHOW_NAMESPACE
# Returns: 0 when the test is skipped (fewer than 2 MDTs); every failure is
#          reported through error().
test_31h() {
	# Explicit if/return instead of "cond && skip && return": with the
	# chained form a non-zero status from skip let the test body run on
	# an unsupported configuration.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	local repaired

	echo "For some reason, the shard's name entry in the striped"
	echo "directory may be corrupted. The LFSCK should repair the"
	echo "bad shard's name entry."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "first shard's name entry in the striped directory claims"
	echo "the same index as the second shard's name entry claims."

	#define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the shard's name entry"
	# START_NAMESPACE holds a whole command line; it must stay unquoted.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Print the LFSCK namespace state before failing.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	# Assign separately from "local" so the declaration does not mask the
	# pipeline status; keep "[ ]" with a quoted operand so that an empty
	# counter is a test failure instead of being evaluated as 0.
	repaired=$($SHOW_NAMESPACE |
		   awk '/^dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair shard's name entry: $repaired"

	# Remount the client before using the repaired directory.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch "$DIR/$tdir/striped_dir/foo" ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f "$DIR/$tdir/striped_dir/foo" ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31h "Repair the corrupted shard's name entry"
# restore MDS/OST size
# Put back the values captured in SAVED_MDSSIZE / SAVED_OSTSIZE /
# SAVED_OSTCOUNT at the top of the script, before the suite changed them.
MDSSIZE=${SAVED_MDSSIZE}
OSTSIZE=${SAVED_OSTSIZE}
OSTCOUNT=${SAVED_OSTCOUNT}
4220 # cleanup the system at last