3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
# Suite bootstrap: seed the exclusion list, load the test framework and the
# configuration file, save the current sizing variables, then extend the
# exclusion list according to server versions and the MDS backend type.

# The always-skipped list starts from the caller-supplied SANITY_LFSCK_EXCEPT.
10 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
# When SLOW is "no", EXCEPT_SLOW is cleared.
11 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
12 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# LUSTRE defaults to the parent of the directory that contains this script.
14 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
15 . $LUSTRE/tests/test-framework.sh
# CONFIG defaults to $LUSTRE/tests/cfg/$NAME.sh (assigned if unset or empty)
# and is then sourced.
17 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# If require_dsh_mds fails, leave with exit status 0 instead of failing.
20 require_dsh_mds || exit 0
# Keep copies of the current MDSSIZE / OSTSIZE / OSTCOUNT values.
24 SAVED_MDSSIZE=${MDSSIZE}
25 SAVED_OSTSIZE=${OSTSIZE}
26 SAVED_OSTCOUNT=${OSTCOUNT}
27 # use small MDS + OST size to speed formatting time
28 # do not use too small MDSSIZE/OSTSIZE, which affect the default journal size
31 # no need too many OSTs, to reduce the format/start/stop overhead
# Cap the OST count at 4.
32 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
34 # build up a clean test environment.
# Version / backend gating of the exclusion list:
#   - MDS older than 2.3.60: emit the skip message and clean up
#   - MDS at or below 2.4.90: exclude test 2c
#   - ost1 older than 2.5.55: exclude tests 11-21
#   - MDS older than 2.6.50: exclude tests 2d 2e 3 and 22-31
#   - MDS backend other than ldiskfs: exclude test 31
38 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
39 skip "Need MDS version at least 2.3.60" && check_and_cleanup_lustre &&
42 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
43 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
45 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
46 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
48 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
49 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
51 # DNE does not support striped directory on zfs-based backend yet.
52 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
53 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Device names and pre-built command strings shared by the tests in this file.
# The START_* / STOP_* / SHOW_* values are later expanded unquoted so that the
# string is word-split into a command line (e.g. "$START_NAMESPACE -r").

# Names of the first MDT and first OST of the filesystem under test.
57 MDT_DEV="${FSNAME}-MDT0000"
58 OST_DEV="${FSNAME}-OST0000"
# mdsdevname is called with SINGLEMDS after every "mds" substring is removed.
59 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Start namespace / layout LFSCK on the MDT (run on the SINGLEMDS facet).
60 START_NAMESPACE="do_facet $SINGLEMDS \
61 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
62 START_LAYOUT="do_facet $SINGLEMDS \
63 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
# Start layout LFSCK on the OST (run on the ost1 facet).
64 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# Stop LFSCK on the MDT.
65 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Print the lfsck_namespace / lfsck_layout parameter values (get_param -n).
66 SHOW_NAMESPACE="do_facet $SINGLEMDS \
67 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
68 SHOW_LAYOUT="do_facet $SINGLEMDS \
69 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
70 SHOW_LAYOUT_ON_OST="do_facet ost1 \
71 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount option strings passed to "start" when (re)starting the MDS; the second
# one adds "noscrub".
72 MOUNT_OPTS_SCRUB="-o user_xattr"
73 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
# Populate $DIR/$tdir with test data. Reads three variables:
#   ndirs  - number of d<N> directories, f<N> files and e<N> directories
#   nfiles - number of files created inside each d<N> directory
#   igif   - when non-empty, the data is created with fail_loc 0x1504 set
#            on the MDS, and the fault is cleared afterwards
82 echo "preparing... $nfiles * $ndirs files will be created $(date)."
# NOTE(review): $igif is unquoted; with an empty value the test collapses to
# "[ ! -z ]", which evaluates false, so the branch is skipped as intended.
83 if [ ! -z $igif ]; then
84 #define OBD_FAIL_FID_IGIF 0x1504
85 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
# Copy the test scripts into the test directory.
88 cp $LUSTRE/tests/*.sh $DIR/$tdir/
89 if [ $ndirs -gt 0 ]; then
90 createmany -d $DIR/$tdir/d $ndirs
91 createmany -m $DIR/$tdir/f $ndirs
92 if [ $nfiles -gt 0 ]; then
# One createmany call per directory d0 .. d<ndirs-1>; output is discarded.
93 for ((i = 0; i < $ndirs; i++)); do
94 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
95 /dev/null || error "createmany $nfiles"
98 createmany -d $DIR/$tdir/e $ndirs
# With igif set: create one more file, then clear the injected fault.
101 if [ ! -z $igif ]; then
102 touch $DIR/$tdir/dummy
103 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
106 echo "prepared $(date)."
# Test 0 (registered by the run_test line below): control the namespace LFSCK
# by hand - start, inspect, stop, resume, run to completion, reset and rerun.

# Start a fresh (-r) scan with OBD_FAIL_LFSCK_DELAY1 and fail_val=3 set
# (presumably slows the scan so it can be observed while running - confirm).
112 #define OBD_FAIL_LFSCK_DELAY1 0x1600
113 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
114 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
116 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
# The second field of the "status" line must be scanning-phase1.
118 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
119 [ "$STATUS" == "scanning-phase1" ] ||
120 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
# Stop the scan and confirm the status changes to stopped.
122 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
124 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
125 [ "$STATUS" == "stopped" ] ||
126 error "(6) Expect 'stopped', but got '$STATUS'"
# Start again without -r and confirm the scan is back in phase 1.
128 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
130 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
131 [ "$STATUS" == "scanning-phase1" ] ||
132 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the injected fault and wait until the status reads completed.
# The awk field is written \\\$2 inside the double-quoted string so that the
# string handed to wait_update_facet still contains an escaped $2.
134 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
135 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
136 mdd.${MDT_DEV}.lfsck_namespace |
137 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
139 error "(9) unexpected status"
# Nothing should have been repaired: updated_phase1 must be 0.
142 local repaired=$($SHOW_NAMESPACE |
143 awk '/^updated_phase1/ { print $2 }')
144 [ $repaired -eq 0 ] ||
145 error "(10) Expect nothing to be repaired, but got: $repaired"
# Record success_count, reset and rerun the scan, then require the counter to
# have grown by exactly one.
147 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
148 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
149 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
150 mdd.${MDT_DEV}.lfsck_namespace |
151 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
153 error "(12) unexpected status"
156 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
157 [ $((scanned1 + 1)) -eq $scanned2 ] ||
158 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
# Finally shut everything down; the echoed message references LU-3649.
160 echo "stopall, should NOT crash LU-3649"
161 stopall || error "(14) Fail to stopall"
163 run_test 0 "Control LFSCK manually"
# Test 1a (registered by the run_test line below): namespace LFSCK must find
# and repair a crashed FID-in-dirent. Skipped when the MDS backend is not
# ldiskfs.
166 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
167 skip "OI Scrub not implemented for ZFS" && return
# Create one file while OBD_FAIL_FID_INDIR is active, then clear the fault.
171 #define OBD_FAIL_FID_INDIR 0x1501
172 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
173 touch $DIR/$tdir/dummy
175 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a fresh (-r) namespace scan and wait for status completed.
177 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
178 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
179 mdd.${MDT_DEV}.lfsck_namespace |
180 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
182 error "(4) unexpected status"
# Read dirent_repaired; if that line is absent, fall back to updated_phase1.
# Exactly one repair is expected.
185 local repaired=$($SHOW_NAMESPACE |
186 awk '/^dirent_repaired/ { print $2 }')
187 # for interop with old server
188 [ -z "$repaired" ] &&
189 repaired=$($SHOW_NAMESPACE |
190 awk '/^updated_phase1/ { print $2 }')
192 [ $repaired -eq 1 ] ||
193 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
# Mount the client and list the directory with OBD_FAIL_FID_LOOKUP set; the
# listing must succeed. The fault is cleared afterwards.
195 mount_client $MOUNT || error "(6) Fail to start client!"
197 #define OBD_FAIL_FID_LOOKUP 0x1505
198 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
199 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
201 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
203 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Test 1b (registered by the run_test line below): namespace LFSCK must find
# and repair a missing FID-in-LMA. Skipped when the MDS backend is not ldiskfs.
207 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
208 skip "OI Scrub not implemented for ZFS" && return
# Create one file while OBD_FAIL_FID_INLMA is active, then clear the fault.
212 #define OBD_FAIL_FID_INLMA 0x1502
213 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
214 touch $DIR/$tdir/dummy
216 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run the scan with OBD_FAIL_FID_NOLMA set and wait for status completed.
218 #define OBD_FAIL_FID_NOLMA 0x1506
219 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
220 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
221 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
222 mdd.${MDT_DEV}.lfsck_namespace |
223 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
225 error "(4) unexpected status"
# Read dirent_repaired; if that line is absent, fall back to updated_phase1.
# Exactly one repair is expected.
228 local repaired=$($SHOW_NAMESPACE |
229 awk '/^dirent_repaired/ { print $2 }')
230 # for interop with old server
231 [ -z "$repaired" ] &&
232 repaired=$($SHOW_NAMESPACE |
233 awk '/^updated_phase1/ { print $2 }')
235 [ $repaired -eq 1 ] ||
236 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
# Clear the fault, mount the client, then stat the file with
# OBD_FAIL_FID_LOOKUP set; the stat must succeed.
238 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
239 mount_client $MOUNT || error "(6) Fail to start client!"
241 #define OBD_FAIL_FID_LOOKUP 0x1505
242 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
243 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
245 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
247 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Test 2a (registered by the run_test line below): namespace LFSCK must find
# and repair a crashed linkEA entry.

# Create one file while OBD_FAIL_LFSCK_LINKEA_CRASH is active, then clear it.
252 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
254 touch $DIR/$tdir/dummy
256 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a fresh (-r) namespace scan and wait for status completed.
258 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
259 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
260 mdd.${MDT_DEV}.lfsck_namespace |
261 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
263 error "(4) unexpected status"
# Read linkea_repaired; if that line is absent, fall back to updated_phase2.
# Exactly one repair is expected.
266 local repaired=$($SHOW_NAMESPACE |
267 awk '/^linkea_repaired/ { print $2 }')
268 # for interop with old server
269 [ -z "$repaired" ] &&
270 repaired=$($SHOW_NAMESPACE |
271 awk '/^updated_phase2/ { print $2 }')
273 [ $repaired -eq 1 ] ||
274 error "(5) Fail to repair crashed linkEA: $repaired"
# Mount the client; stat output must contain "Links: 1".
276 mount_client $MOUNT || error "(6) Fail to start client!"
278 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
279 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the original path.
281 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
282 local dummyname=$($LFS fid2path $DIR $dummyfid)
283 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
284 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
286 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Test 2b (registered by the run_test line below): namespace LFSCK must find
# and remove an invalid linkEA entry.

# Create one file while OBD_FAIL_LFSCK_LINKEA_MORE is active, then clear it.
292 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
293 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
294 touch $DIR/$tdir/dummy
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a fresh (-r) namespace scan and wait for status completed.
298 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
299 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
300 mdd.${MDT_DEV}.lfsck_namespace |
301 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
303 error "(4) unexpected status"
# updated_phase2 must report exactly one repair.
306 local repaired=$($SHOW_NAMESPACE |
307 awk '/^updated_phase2/ { print $2 }')
308 [ $repaired -eq 1 ] ||
309 error "(5) Fail to repair crashed linkEA: $repaired"
# Mount the client; stat output must contain "Links: 1".
311 mount_client $MOUNT || error "(6) Fail to start client!"
313 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
314 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the original path.
316 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
317 local dummyname=$($LFS fid2path $DIR $dummyfid)
318 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
319 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
321 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Test 2c (registered by the run_test line below): namespace LFSCK must find
# and remove a repeated linkEA entry.

# Create one file while OBD_FAIL_LFSCK_LINKEA_MORE2 is active, then clear it.
327 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
328 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
329 touch $DIR/$tdir/dummy
331 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a fresh (-r) namespace scan and wait for status completed.
333 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
334 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
335 mdd.${MDT_DEV}.lfsck_namespace |
336 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
338 error "(4) unexpected status"
# updated_phase2 must report exactly one repair.
341 local repaired=$($SHOW_NAMESPACE |
342 awk '/^updated_phase2/ { print $2 }')
343 [ $repaired -eq 1 ] ||
344 error "(5) Fail to repair crashed linkEA: $repaired"
# Mount the client; stat output must contain "Links: 1".
346 mount_client $MOUNT || error "(6) Fail to start client!"
348 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
349 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the original path.
351 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
352 local dummyname=$($LFS fid2path $DIR $dummyfid)
353 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
354 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
356 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Test 2d (registered by the run_test line below): namespace LFSCK must
# recover a missing linkEA entry.

# Create one file while OBD_FAIL_LFSCK_NO_LINKEA is active, then clear it.
362 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
363 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
364 touch $DIR/$tdir/dummy
366 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a fresh (-r) namespace scan and wait for status completed.
368 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
369 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
370 mdd.${MDT_DEV}.lfsck_namespace |
371 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
373 error "(4) unexpected status"
# linkea_repaired must report exactly one repair (no fallback counter here).
376 local repaired=$($SHOW_NAMESPACE |
377 awk '/^linkea_repaired/ { print $2 }')
378 [ $repaired -eq 1 ] ||
379 error "(5) Fail to repair crashed linkEA: $repaired"
# Mount the client; stat output must contain "Links: 1".
381 mount_client $MOUNT || error "(6) Fail to start client!"
383 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
384 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the original path.
386 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
387 local dummyname=$($LFS fid2path $DIR $dummyfid)
388 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
389 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
391 run_test 2d "LFSCK can recover the missing linkEA entry"
# Test 2e (registered by the run_test line below): namespace LFSCK must verify
# the linkEA of a remote object. Skipped when MDSCOUNT is below 2.
395 [ $MDSCOUNT -lt 2 ] &&
396 skip "We need at least 2 MDSes for this test" && return
# d0 is created with MDT index 1; d1 is created under it with MDT index 0
# while OBD_FAIL_LFSCK_LINKEA_CRASH is active on the SINGLEMDS facet.
400 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
402 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
403 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
404 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
405 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Start the scan with -r -A and wait for status completed.
407 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
408 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
409 mdd.${MDT_DEV}.lfsck_namespace |
410 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
412 error "(4) unexpected status"
# linkea_repaired must report exactly one repair.
415 local repaired=$($SHOW_NAMESPACE |
416 awk '/^linkea_repaired/ { print $2 }')
417 [ $repaired -eq 1 ] ||
418 error "(5) Fail to repair crashed linkEA: $repaired"
# Round trip: path -> FID -> path must give back the original path.
420 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
421 local name=$($LFS fid2path $DIR $fid)
422 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
423 error "(6) Fail to repair linkEA: $fid $name"
425 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Test 3 (registered by the run_test line below): namespace LFSCK must verify
# objects that have more than one hard link.

# Two healthy hard links: d0/f0 and d0/f1 are linked into a new directory.
431 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
432 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
433 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
# Two more files in a directory created with MDT index 0.
435 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
436 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
437 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
# Hard-link each of them under a different injected fault
# (LINKEA_CRASH for w0, LINKEA_MORE for w1), then clear the fault.
439 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
440 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
441 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
443 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
444 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
445 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
447 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a fresh (-r) namespace scan and wait for status completed.
449 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
450 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
451 mdd.${MDT_DEV}.lfsck_namespace |
452 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
454 error "(10) unexpected status"
# checked_phase2 must be at least 4.
457 local checked=$($SHOW_NAMESPACE |
458 awk '/^checked_phase2/ { print $2 }')
459 [ $checked -ge 4 ] ||
460 error "(11) Fail to check multiple-linked object: $checked"
# multiple_linked_repaired must be at least 2.
462 local repaired=$($SHOW_NAMESPACE |
463 awk '/^multiple_linked_repaired/ { print $2 }')
464 [ $repaired -ge 2 ] ||
465 error "(12) Fail to repair multiple-linked object: $repaired"
467 run_test 3 "LFSCK can verify multiple-linked objects"
# Test 4 (registered by the run_test line below): after an MDT backup/restore,
# namespace LFSCK must rebuild FID-in-dirent. Skipped when the MDS backend is
# not ldiskfs.
471 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
472 skip "OI Scrub not implemented for ZFS" && return
# Unmount the client, stop the MDS, run mds_backup_restore, then start the
# MDS again using the noscrub mount options.
475 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
476 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
478 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
479 echo "start $SINGLEMDS with disabling OI scrub"
480 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
481 error "(2) Fail to start MDS!"
# Start a fresh scan with OBD_FAIL_LFSCK_DELAY2 (fail_val=1) set and wait
# until the flags line reads "inconsistent".
483 #define OBD_FAIL_LFSCK_DELAY2 0x1601
484 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
485 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
486 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
487 mdd.${MDT_DEV}.lfsck_namespace |
488 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
490 error "(5) unexpected status"
# At that point the scan must still be in phase 1.
493 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
494 [ "$STATUS" == "scanning-phase1" ] ||
495 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fault and wait for status completed.
497 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
498 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
499 mdd.${MDT_DEV}.lfsck_namespace |
500 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
502 error "(7) unexpected status"
# After completion the flags field must be empty.
# NOTE(review): FLAGS is assigned without "local" in the lines shown here.
505 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
506 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Read dirent_repaired; if that line is absent, fall back to updated_phase1.
# At least 9 repairs are expected.
508 local repaired=$($SHOW_NAMESPACE |
509 awk '/^dirent_repaired/ { print $2 }')
510 # for interop with old server
511 [ -z "$repaired" ] &&
512 repaired=$($SHOW_NAMESPACE |
513 awk '/^updated_phase1/ { print $2 }')
515 [ $repaired -ge 9 ] ||
516 error "(9) Fail to re-generate FID-in-dirent: $repaired"
# Mount the client and list the directory with OBD_FAIL_FID_LOOKUP set.
518 mount_client $MOUNT || error "(10) Fail to start client!"
520 #define OBD_FAIL_FID_LOOKUP 0x1505
521 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
522 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
523 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
525 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Test 5 (registered by the run_test line below): namespace LFSCK must handle
# IGIF object upgrading after an MDT backup/restore. Skipped when the MDS
# backend is not ldiskfs.
529 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
530 skip "OI Scrub not implemented for ZFS" && return
# Unmount the client, stop the MDS, run mds_backup_restore with an extra
# argument of 1, then start the MDS again using the noscrub mount options.
533 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
534 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
536 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
537 echo "start $SINGLEMDS with disabling OI scrub"
538 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
539 error "(2) Fail to start MDS!"
# Start a fresh scan with OBD_FAIL_LFSCK_DELAY2 (fail_val=1) set and wait
# until the flags line reads "inconsistent,upgrade".
541 #define OBD_FAIL_LFSCK_DELAY2 0x1601
542 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
543 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
544 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
545 mdd.${MDT_DEV}.lfsck_namespace |
546 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
548 error "(5) unexpected status"
# At that point the scan must still be in phase 1.
551 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
552 [ "$STATUS" == "scanning-phase1" ] ||
553 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fault and wait for status completed.
555 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
556 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
557 mdd.${MDT_DEV}.lfsck_namespace |
558 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
560 error "(7) unexpected status"
# After completion the flags field must be empty.
# NOTE(review): FLAGS is assigned without "local" in the lines shown here.
563 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
564 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Read dirent_repaired; if that line is absent, fall back to updated_phase1.
# At least 2 repairs are expected.
566 local repaired=$($SHOW_NAMESPACE |
567 awk '/^dirent_repaired/ { print $2 }')
568 # for interop with old server
569 [ -z "$repaired" ] &&
570 repaired=$($SHOW_NAMESPACE |
571 awk '/^updated_phase1/ { print $2 }')
573 [ $repaired -ge 2 ] ||
574 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
# Mount the client; with OBD_FAIL_FID_LOOKUP set both stat and ls must work.
576 mount_client $MOUNT || error "(10) Fail to start client!"
578 #define OBD_FAIL_FID_LOOKUP 0x1505
579 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
580 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
582 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
# Clear the fault, then check the path -> FID -> path round trip.
584 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
585 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
586 local dummyname=$($LFS fid2path $DIR $dummyfid)
587 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
588 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
590 run_test 5 "LFSCK can handle IGIF object upgrading"
# Test 6a (registered by the run_test line below): a failed namespace LFSCK
# must resume from a position beyond its last checkpoint.

# Start a fresh scan with OBD_FAIL_LFSCK_DELAY1 (fail_val=1) set and confirm
# it is in phase 1.
595 #define OBD_FAIL_LFSCK_DELAY1 0x1600
596 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
597 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
599 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
600 [ "$STATUS" == "scanning-phase1" ] ||
601 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
603 # Sleep 3 sec to guarantee at least one object processed by LFSCK
605 # Fail the LFSCK to guarantee there is at least one checkpoint
# fail_loc is written as 0x80001608: OBD_FAIL_LFSCK_FATAL1 (0x1608) with the
# 0x80000000 bit also set. Then wait for status failed.
606 #define OBD_FAIL_LFSCK_FATAL1 0x1608
607 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
608 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
609 mdd.${MDT_DEV}.lfsck_namespace |
610 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
612 error "(4) unexpected status"
# POS0: second field of the last_checkpoint_position line.
615 local POS0=$($SHOW_NAMESPACE |
616 awk '/^last_checkpoint_position/ { print $2 }' |
619 #define OBD_FAIL_LFSCK_DELAY1 0x1600
620 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
621 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
# Restarted without -r: the scan must be in phase 1 again.
623 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
624 [ "$STATUS" == "scanning-phase1" ] ||
625 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# POS1: second field of the latest_start_position line; it must be greater
# than POS0.
627 local POS1=$($SHOW_NAMESPACE |
628 awk '/^latest_start_position/ { print $2 }' |
630 [[ $POS0 -lt $POS1 ]] ||
631 error "(7) Expect larger than: $POS0, but got $POS1"
# Clear the fault and wait for status completed.
633 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
634 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
635 mdd.${MDT_DEV}.lfsck_namespace |
636 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
638 error "(8) unexpected status"
641 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Test 6b (registered by the run_test line below): like 6a, but the delay and
# fatal faults are DELAY2 / FATAL2 and two position fields are compared.

# Start a fresh scan with OBD_FAIL_LFSCK_DELAY2 (fail_val=1) set and confirm
# it is in phase 1.
646 #define OBD_FAIL_LFSCK_DELAY2 0x1601
647 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
648 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
650 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
651 [ "$STATUS" == "scanning-phase1" ] ||
652 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
654 # Sleep 5 sec to guarantee that we are in the directory scanning
656 # Fail the LFSCK to guarantee there is at least one checkpoint
# fail_loc is written as 0x80001609: OBD_FAIL_LFSCK_FATAL2 (0x1609) with the
# 0x80000000 bit also set. Then wait for status failed.
657 #define OBD_FAIL_LFSCK_FATAL2 0x1609
658 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
659 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
660 mdd.${MDT_DEV}.lfsck_namespace |
661 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
663 error "(4) unexpected status"
# O_POS0 / D_POS0: fields 2 and 4 of the last_checkpoint_position line.
666 local O_POS0=$($SHOW_NAMESPACE |
667 awk '/^last_checkpoint_position/ { print $2 }' |
670 local D_POS0=$($SHOW_NAMESPACE |
671 awk '/^last_checkpoint_position/ { print $4 }')
# Restart without -r under the same delay fault; must be in phase 1 again.
673 #define OBD_FAIL_LFSCK_DELAY2 0x1601
674 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
675 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
677 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
678 [ "$STATUS" == "scanning-phase1" ] ||
679 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# O_POS1 / D_POS1: fields 2 and 4 of the latest_start_position line.
681 local O_POS1=$($SHOW_NAMESPACE |
682 awk '/^latest_start_position/ { print $2 }' |
684 local D_POS1=$($SHOW_NAMESPACE |
685 awk '/^latest_start_position/ { print $4 }')
# The new start position must be beyond the last checkpoint. The "N/A" test
# on D_POS0 / D_POS1 guards the numeric comparisons that follow.
687 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
688 [[ $O_POS0 -lt $O_POS1 ]] ||
689 error "(7.1) $O_POS1 is not larger than $O_POS0"
691 [[ $D_POS0 -lt $D_POS1 ]] ||
692 error "(7.2) $D_POS1 is not larger than $D_POS0"
# Clear the fault and wait for status completed.
695 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
696 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
697 mdd.${MDT_DEV}.lfsck_namespace |
698 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
700 error "(8) unexpected status"
703 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Test 7a (registered by the run_test line below): an LFSCK that was still
# scanning when the MDS was stopped must run to completion after the MDS is
# started again.

# Start a fresh scan with OBD_FAIL_LFSCK_DELAY2 (fail_val=1) set and confirm
# it is in phase 1.
710 #define OBD_FAIL_LFSCK_DELAY2 0x1601
711 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
712 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
714 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
715 [ "$STATUS" == "scanning-phase1" ] ||
716 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
718 # Sleep 3 sec to guarantee at least one object processed by LFSCK
# Stop the MDS, clear the fault, and start the MDS with the scrub-enabled
# mount options.
720 echo "stop $SINGLEMDS"
721 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
723 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
724 echo "start $SINGLEMDS"
725 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
726 error "(5) Fail to start MDS!"
# No explicit lfsck_start here: the status is expected to reach completed on
# its own (timeout argument 30).
728 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
729 mdd.${MDT_DEV}.lfsck_namespace |
730 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
732 error "(6) unexpected status"
735 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Test 7b (registered by the run_test line below): same idea as 7a, but the
# MDS is stopped while the scan is in phase 2.

# Create dummy0 .. dummy19 while OBD_FAIL_LFSCK_LINKEA_MORE is active.
741 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
742 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
743 for ((i = 0; i < 20; i++)); do
744 touch $DIR/$tdir/dummy${i}
# Start a fresh scan with OBD_FAIL_LFSCK_DELAY3 (fail_val=1) set and wait
# until the status reads scanning-phase2.
747 #define OBD_FAIL_LFSCK_DELAY3 0x1602
748 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
749 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
750 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
751 mdd.${MDT_DEV}.lfsck_namespace |
752 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
754 error "(4) unexpected status"
# Stop the MDS, clear the fault, and start the MDS with the scrub-enabled
# mount options.
758 echo "stop $SINGLEMDS"
759 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
761 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
762 echo "start $SINGLEMDS"
763 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
764 error "(6) Fail to start MDS!"
# No explicit lfsck_start here: the status is expected to reach completed on
# its own (timeout argument 30).
766 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
767 mdd.${MDT_DEV}.lfsck_namespace |
768 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
770 error "(7) unexpected status"
773 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Test 8 (registered by the run_test line below): walk the namespace LFSCK
# through its states and check the reported status after each transition:
# init -> scanning-phase1 -> stopped -> scanning-phase1 -> failed ->
# scanning-phase1 -> crashed -> scanning-phase1 -> paused ->
# scanning-phase2 -> completed.

# Reformat, then expect the initial status to be init.
778 formatall > /dev/null
784 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
785 [ "$STATUS" == "init" ] ||
786 error "(2) Expect 'init', but got '$STATUS'"
# Create a directory under LINKEA_CRASH and dummy0 .. dummy4 under
# LINKEA_MORE, then unmount the client.
788 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
789 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
790 mkdir $DIR/$tdir/crashed
792 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
793 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
794 for ((i = 0; i < 5; i++)); do
795 touch $DIR/$tdir/dummy${i}
798 umount_client $MOUNT || error "(3) Fail to stop client!"
# init -> scanning-phase1 (started under DELAY2 with fail_val=2).
800 #define OBD_FAIL_LFSCK_DELAY2 0x1601
801 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
802 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
804 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
805 [ "$STATUS" == "scanning-phase1" ] ||
806 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
# scanning-phase1 -> stopped (explicit lfsck_stop).
808 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
810 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
811 [ "$STATUS" == "stopped" ] ||
812 error "(7) Expect 'stopped', but got '$STATUS'"
# stopped -> scanning-phase1 (started again).
814 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
816 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
817 [ "$STATUS" == "scanning-phase1" ] ||
818 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
# scanning-phase1 -> failed (FATAL2 with the 0x80000000 bit also set).
820 #define OBD_FAIL_LFSCK_FATAL2 0x1609
821 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
822 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
823 mdd.${MDT_DEV}.lfsck_namespace |
824 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
826 error "(10) unexpected status"
# failed -> scanning-phase1 (started under DELAY1).
829 #define OBD_FAIL_LFSCK_DELAY1 0x1600
830 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
831 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
833 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
834 [ "$STATUS" == "scanning-phase1" ] ||
835 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
# Set OBD_FAIL_LFSCK_CRASH, stop the MDS, set OBD_FAIL_LFSCK_NO_AUTO, and
# start the MDS again.
837 #define OBD_FAIL_LFSCK_CRASH 0x160a
838 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
841 echo "stop $SINGLEMDS"
842 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
844 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
845 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
847 echo "start $SINGLEMDS"
848 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
849 error "(14) Fail to start MDS!"
# Poll the MDT recovery_status until it is no longer RECOVERING, bounded by
# max_recovery_time.
# NOTE(review): "timer" is not initialised or advanced in the lines shown
# here - confirm it is set before this loop and incremented inside it.
851 local timeout=$(max_recovery_time)
854 while [ $timer -lt $timeout ]; do
855 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
856 mdt.${MDT_DEV}.recovery_status |
857 awk '/^status/ { print \\\$2 }'")
858 [ "$STATUS" != "RECOVERING" ] && break;
863 [ $timer != $timeout ] ||
864 error "(14.1) recovery timeout"
# After the restart the LFSCK status must read crashed.
866 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
867 [ "$STATUS" == "crashed" ] ||
868 error "(15) Expect 'crashed', but got '$STATUS'"
# crashed -> scanning-phase1 (started under DELAY2).
870 #define OBD_FAIL_LFSCK_DELAY2 0x1601
871 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
872 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
874 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
875 [ "$STATUS" == "scanning-phase1" ] ||
876 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
# Stop the MDS while scanning, set OBD_FAIL_LFSCK_NO_AUTO, start it again.
878 echo "stop $SINGLEMDS"
879 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
881 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
882 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
884 echo "start $SINGLEMDS"
885 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
886 error "(19) Fail to start MDS!"
# Same recovery polling loop as above.
889 while [ $timer -lt $timeout ]; do
890 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
891 mdt.${MDT_DEV}.recovery_status |
892 awk '/^status/ { print \\\$2 }'")
893 [ "$STATUS" != "RECOVERING" ] && break;
898 [ $timer != $timeout ] ||
899 error "(19.1) recovery timeout"
# This time the LFSCK status must read paused.
901 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
902 [ "$STATUS" == "paused" ] ||
903 error "(20) Expect 'paused', but got '$STATUS'"
# paused -> scanning-phase2 (started under DELAY3 with fail_val=2).
905 #define OBD_FAIL_LFSCK_DELAY3 0x1602
906 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
908 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
909 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
910 mdd.${MDT_DEV}.lfsck_namespace |
911 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
913 error "(22) unexpected status"
# During phase 2 the flags field must read scanned-once,inconsistent.
916 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
917 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
918 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
# scanning-phase2 -> completed once the fault is cleared; flags must then be
# empty.
920 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
921 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
922 mdd.${MDT_DEV}.lfsck_namespace |
923 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
925 error "(24) unexpected status"
928 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
929 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
931 run_test 8 "LFSCK state machine"
# Test 9a (registered by the run_test line below): check that the layout
# LFSCK honours its speed limit, first as given with -s on start and then
# after the limit is changed through lfsck_speed_limit.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are used in the arithmetic
# below but are not assigned in the lines shown here - confirm they are set.

# Skip when /proc/cpuinfo has no line matching "processor.*: 1".
934 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
935 skip "Testing on UP system, the speed may be inaccurate."
# Create a directory with MDT index 0, set its stripe options, and create
# 5000 files in it.
940 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
941 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
942 createmany -o $DIR/$tdir/lfsck/f 5000
# Start a fresh layout scan with "-s 100".
944 local BASE_SPEED1=100
946 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# NOTE(review): STATUS is assigned without "local" in the lines shown here.
949 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
950 [ "$STATUS" == "scanning-phase1" ] ||
951 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# The reported average_speed_phase1 must stay below the computed upper bound.
953 local SPEED=$($SHOW_LAYOUT |
954 awk '/^average_speed_phase1/ { print $2 }')
956 # There may be time error, normally it should be less than 2 seconds.
957 # We allow another 20% schedule error.
959 # MAX_MARGIN = 1.2 = 12 / 10
960 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
961 RUN_TIME1 * 12 / 10))
962 [ $SPEED -lt $MAX_SPEED ] ||
963 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
# Raise the limit to 300 through the lfsck_speed_limit parameter.
966 local BASE_SPEED2=300
968 do_facet $SINGLEMDS \
969 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# The average over both periods must lie between the computed lower and upper
# bounds. On a non-ldiskfs backend a too-low speed is reported through
# error_ignore with LU-5624.
972 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
973 # MIN_MARGIN = 0.8 = 8 / 10
974 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
975 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
976 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
977 [ $SPEED -gt $MIN_SPEED ] || {
978 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
979 error_ignore LU-5624 \
980 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
983 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
987 # MAX_MARGIN = 1.2 = 12 / 10
988 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
989 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
990 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
991 [ $SPEED -lt $MAX_SPEED ] ||
992 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
# Set the limit to 0 and wait for the layout scan to report completed.
994 do_facet $SINGLEMDS \
995 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
997 wait_update_facet $SINGLEMDS \
998 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
999 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1000 error "(7) Failed to get expected 'completed'"
1002 run_test 9a "LFSCK speed control (1)"
# Test 9b body: speed control for the namespace LFSCK, judged by the
# average_speed_phase2 value reported in lfsck_namespace.
# RUN_TIME1, RUN_TIME2 and TIME_DIFF used in the formulas below are not
# assigned on any line of this block as shown here.
#
# Skip unless /proc/cpuinfo has a line matching "processor.*: 1".
1005 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1006 skip "Testing on UP system, the speed may be inaccurate."
1012 echo "Preparing another 50 * 50 files (with error) at $(date)."
# Create 50 directories, 50 files, and 50 files in each directory while
# fail_loc 0x1604 is set on the MDS.
1013 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1014 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1015 createmany -d $DIR/$tdir/d 50
1016 createmany -m $DIR/$tdir/f 50
1017 for ((i = 0; i < 50; i++)); do
1018 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# Switch to fail_loc 0x160c and start a namespace scan with -r; the scan
# is then expected to report "stopped" rather than go on to phase 2.
1021 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1022 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1023 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
# Poll the status field until it reads "stopped" (the trailing 10 is
# presumably the wait limit in seconds -- TODO confirm).
1024 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1025 mdd.${MDT_DEV}.lfsck_namespace |
1026 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1028 error "(5) unexpected status"
# Clear the fault injection before the measured runs.
1031 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1032 echo "Prepared at $(date)."
# First speed limit used for the measurement.
1034 local BASE_SPEED1=50
# Start the scan again with the speed limit BASE_SPEED1 (-s).
1036 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
# The scan must now be in its second phase.
1039 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1040 [ "$STATUS" == "scanning-phase2" ] ||
1041 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1043 local SPEED=$($SHOW_NAMESPACE |
1044 awk '/^average_speed_phase2/ { print $2 }')
1045 # There may be a timing error; normally it should be less than 2 seconds.
1046 # We allow another 20% scheduling error.
# Upper bound: BASE_SPEED1 scaled by (RUN_TIME1 + TIME_DIFF) / RUN_TIME1,
# then by 12/10, all in integer arithmetic.
1048 # MAX_MARGIN = 1.2 = 12 / 10
1049 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1050 RUN_TIME1 * 12 / 10))
1051 [ $SPEED -lt $MAX_SPEED ] ||
1052 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1054 # adjust speed limit
1055 local BASE_SPEED2=150
# Change the limit of the running scan through lfsck_speed_limit.
1057 do_facet $SINGLEMDS \
1058 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1061 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
# Lower bound: time-weighted mean of both limits with each run time
# shortened by TIME_DIFF, scaled by 8/10.
1062 # MIN_MARGIN = 0.8 = 8 / 10
1063 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1064 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1065 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# When the speed is too low on a non-ldiskfs MDS backend the failure is
# reported through error_ignore under LU-5624; the command that emits the
# (9.2) message is not visible on the lines shown here.
1066 [ $SPEED -gt $MIN_SPEED ] || {
1067 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1068 error_ignore LU-5624 \
1069 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1072 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
# Upper bound for the combined run: same weighting with each run time
# lengthened by TIME_DIFF, scaled by 12/10.
1076 # MAX_MARGIN = 1.2 = 12 / 10
1077 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1078 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1079 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1080 [ $SPEED -lt $MAX_SPEED ] ||
1081 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
# Set the speed limit to 0 and wait for the scan to report "completed".
1083 do_facet $SINGLEMDS \
1084 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1085 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1086 mdd.${MDT_DEV}.lfsck_namespace |
1087 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1089 error "(11) unexpected status"
# Hand test 9b and its description to run_test.
1092 run_test 9b "LFSCK speed control (2)"
# Test 10 body: ordinary namespace operations must keep working while a
# namespace LFSCK scan is in phase 1.
# Skipped when the MDS backend is not ldiskfs.
1096 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1097 skip "lookup(..)/linkea on ZFS issue" && return
1101 echo "Preparing more files with error at $(date)."
# Even-numbered d<i>/f<i> entries (0, 2, ..., 998) are created while
# fail_loc 0x1603 is set on the MDS.
1102 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1103 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1105 for ((i = 0; i < 1000; i = $((i+2)))); do
1106 mkdir -p $DIR/$tdir/d${i}
1107 touch $DIR/$tdir/f${i}
1108 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd-numbered entries (1, 3, ..., 999) are created under fail_loc 0x1604.
1111 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1112 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1114 for ((i = 1; i < 1000; i = $((i+2)))); do
1115 mkdir -p $DIR/$tdir/d${i}
1116 touch $DIR/$tdir/f${i}
1117 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Clear the fault injection.
1120 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1121 echo "Prepared at $(date)."
# Give f200 a second name; f200 itself is unlinked further down.
1123 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1125 umount_client $MOUNT
1126 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the namespace scan with -r and a speed limit of 100 (-s) so that
# it is still in phase 1 while the operations below run.
1128 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1131 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1132 [ "$STATUS" == "scanning-phase1" ] ||
1133 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Exercise the namespace during the scan: recursive listing, create,
# mkdir, unlink, recursive remove, rename, hard link and symbolic link.
1135 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1137 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1139 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1141 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1143 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1145 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1147 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1149 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1150 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after all of the operations above.
1152 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1153 [ "$STATUS" == "scanning-phase1" ] ||
1154 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set the speed limit to 0 and wait for the scan to report "completed".
1156 do_facet $SINGLEMDS \
1157 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1158 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1159 mdd.${MDT_DEV}.lfsck_namespace |
1160 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1162 error "(16) unexpected status"
# Hand test 10 and its description to run_test.
1165 run_test 10 "System is available during LFSCK scanning"
# Remove the LAST_ID file (and the d0/0 entry) of one object sequence
# directly from an OST's backing file system.
# Uses ${ost} and ${idx}; their assignments are not on the lines shown
# here (the visible caller invokes `ost_remove_lastid 1 0`, so presumably
# they come from $1 and $2 -- TODO confirm).
# Returns 1 if the local mount fails and 2 if the unmount fails.
1168 ost_remove_lastid() {
# Prefix that runs a command on the ost${ost} facet.
1171 local rcmd="do_facet ost${ost}"
1173 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1175 # step 1: local mount
1176 mount_fstype ost${ost} || return 1
1177 # step 2: remove the specified LAST_ID
# The brace list expands to two paths under O/${idx}: LAST_ID and d0/0.
1178 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1180 unmount_fstype ost${ost} || return 2
# Test 11a body: after LAST_ID is removed from ost1, a layout LFSCK run on
# the OST should first flag "crashed_lastid" and then finish with empty
# flags.
1184 check_mount_and_prep
# One stripe on OST index 0, so all 64 files get their objects on ost1.
1185 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1186 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Delete LAST_ID through the helper (arguments 1 0), then start ost1
# again with $MOUNT_OPTS_NOSCRUB.
1191 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1193 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1194 error "(2) Fail to start ost1"
# Set fail_loc 0x160e with fail_val=3 on ost1 before starting the scan
# (presumably a delay so the flag below can be observed -- TODO confirm).
1196 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1197 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1199 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1200 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# Wait for the flags field of lfsck_layout on ost1 to read
# "crashed_lastid".
1202 wait_update_facet ost1 "$LCTL get_param -n \
1203 obdfilter.${OST_DEV}.lfsck_layout |
1204 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1206 error "(5) unexpected status"
# Clear the fault injection and wait for the scan to complete.
1209 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1211 wait_update_facet ost1 "$LCTL get_param -n \
1212 obdfilter.${OST_DEV}.lfsck_layout |
1213 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1215 error "(6) unexpected status"
# After completion the flags field must be empty.
1218 echo "the LAST_ID(s) should have been rebuilt"
1219 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1220 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
# Hand test 11a and its description to run_test.
1222 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b body: make the on-disk LAST_ID of ost1 lag behind the in-use
# value, then check that a layout LFSCK run on the OST brings it back to
# the expected value.
1225 check_mount_and_prep
1226 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1228 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1229 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1230 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than the value reported by
# precreated_ost_obj_count 0 0.
1232 local count=$(precreated_ost_obj_count 0 0)
1234 createmany -o $DIR/$tdir/f $((count + 32))
# Read the sequence from the MDT-side osp device, then take the last_id
# entry for that sequence on ost1 (second ':'-separated field).
1236 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1237 local seq=$(do_facet mds1 $LCTL get_param -n \
1238 osp.${proc_path}.prealloc_last_seq)
1239 local lastid1=$(do_facet ost1 "lctl get_param -n \
1240 obdfilter.${ost1_svc}.last_id" | grep $seq |
1241 awk -F: '{ print $2 }')
# Unmount the client and stop ost1.
1243 umount_client $MOUNT
1244 stop ost1 || error "(1) Fail to stop ost1"
# Start ost1 again with fail_loc 0x215 set.
1246 #define OBD_FAIL_OST_ENOSPC 0x215
1247 do_facet ost1 $LCTL set_param fail_loc=0x215
1249 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1250 error "(2) Fail to start ost1"
# Retry up to 60 times until last_id for the sequence is readable.
1252 for ((i = 0; i < 60; i++)); do
1253 lastid2=$(do_facet ost1 "lctl get_param -n \
1254 obdfilter.${ost1_svc}.last_id" | grep $seq |
1255 awk -F: '{ print $2 }')
1256 [ ! -z $lastid2 ] && break;
# The value read after the restart must be below the one recorded before.
1260 echo "the on-disk LAST_ID should be smaller than the expected one"
1261 [ $lastid1 -gt $lastid2 ] ||
1262 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1264 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1265 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1267 wait_update_facet ost1 "$LCTL get_param -n \
1268 obdfilter.${OST_DEV}.lfsck_layout |
1269 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1271 error "(6) unexpected status"
# Restart ost1 so that last_id is read again after the scan.
1274 stop ost1 || error "(7) Fail to stop ost1"
1276 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1277 error "(8) Fail to start ost1"
# last_id for the sequence must come back to lastid1; on failure the
# full last_id output is printed before the error is raised.
1279 echo "the on-disk LAST_ID should have been rebuilt"
1280 wait_update_facet ost1 "$LCTL get_param -n \
1281 obdfilter.${ost1_svc}.last_id | grep $seq |
1282 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1283 do_facet ost1 $LCTL get_param -n \
1284 obdfilter.${ost1_svc}.last_id
1285 error "(9) expect lastid1 $seq:$lastid1"
# Clear the fault injection and stop everything.
1288 do_facet ost1 $LCTL set_param fail_loc=0
1289 stopall || error "(10) Fail to stopall"
# Hand test 11b and its description to run_test.
1291 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12 body: a single lfsck_start / lfsck_stop issued on mds1 with -A
# must start and stop LFSCK on every target, first for the namespace
# component and then for the layout component.
1294 [ $MDSCOUNT -lt 2 ] &&
1295 skip "We need at least 2 MDSes for test_12" && return
1297 check_mount_and_prep
# One directory per MDT (index k - 1), each holding 100 files.
1298 for k in $(seq $MDSCOUNT); do
1299 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1300 createmany -o $DIR/$tdir/${k}/f 100 ||
1301 error "(0) Fail to create 100 files."
# --- namespace LFSCK ---
1304 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1305 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1306 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1308 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1309 for k in $(seq $MDSCOUNT); do
1310 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1311 mdd.$(facet_svc mds${k}).lfsck_namespace |
1312 awk '/^status/ { print $2 }')
1313 [ "$STATUS" == "scanning-phase1" ] ||
1314 error "(3) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1317 echo "Stop namespace LFSCK on all targets by single lctl command."
1318 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1319 error "(4) Fail to stop LFSCK on all devices!"
1321 echo "All the LFSCK targets should be in 'stopped' status."
1322 for k in $(seq $MDSCOUNT); do
1323 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1324 mdd.$(facet_svc mds${k}).lfsck_namespace |
1325 awk '/^status/ { print $2 }')
1326 [ "$STATUS" == "stopped" ] ||
1327 error "(5) MDS${k} Expect 'stopped', but got '$STATUS'"
1330 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1331 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1332 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1334 echo "All the LFSCK targets should be in 'completed' status."
1335 for k in $(seq $MDSCOUNT); do
1336 wait_update_facet mds${k} "$LCTL get_param -n \
1337 mdd.$(facet_svc mds${k}).lfsck_namespace |
1338 awk '/^status/ { print \\\$2 }'" "completed" 8 ||
1339 error "(7) MDS${k} is not the expected 'completed'"
# --- layout LFSCK, run between start/stop_full_debug_logging ---
1342 start_full_debug_logging
1344 echo "Start layout LFSCK on all targets by single command (-s 1)."
1345 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1346 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1348 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1349 for k in $(seq $MDSCOUNT); do
1350 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1351 mdd.$(facet_svc mds${k}).lfsck_layout |
1352 awk '/^status/ { print $2 }')
1353 [ "$STATUS" == "scanning-phase1" ] ||
1354 error "(9) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1357 echo "Stop layout LFSCK on all targets by single lctl command."
1358 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1359 error "(10) Fail to stop LFSCK on all devices!"
1361 echo "All the LFSCK targets should be in 'stopped' status."
1362 for k in $(seq $MDSCOUNT); do
1363 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1364 mdd.$(facet_svc mds${k}).lfsck_layout |
1365 awk '/^status/ { print $2 }')
1366 [ "$STATUS" == "stopped" ] ||
1367 error "(11) MDS${k} Expect 'stopped', but got '$STATUS'"
# For the layout component the OSTs are checked as well (obdfilter.*).
1370 for k in $(seq $OSTCOUNT); do
1371 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1372 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1373 awk '/^status/ { print $2 }')
1374 [ "$STATUS" == "stopped" ] ||
1375 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1378 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1379 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1380 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1382 echo "All the LFSCK targets should be in 'completed' status."
1383 for k in $(seq $MDSCOUNT); do
1384 # The LFSCK status query interval is 30 seconds. For the case
1385 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1386 # time to guarantee the status sync up.
1387 wait_update_facet mds${k} "$LCTL get_param -n \
1388 mdd.$(facet_svc mds${k}).lfsck_layout |
1389 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1390 error "(14) MDS${k} is not the expected 'completed'"
1393 stop_full_debug_logging
# Hand test 12 and its description to run_test.
1395 run_test 12 "single command to trigger LFSCK on all devices"
# Test 13 body: files created under fail_loc 0x160f must be counted as
# "repaired_others" by a layout LFSCK run.
1399 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1400 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1401 echo "MDT-object FID."
1404 check_mount_and_prep
# Create 32 files while the fault is set on the MDS, then clear it.
1406 echo "Inject failure stub to simulate bad lmm_oi"
1407 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1408 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1409 createmany -o $DIR/$tdir/f 32
1410 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1412 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1413 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1415 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1416 mdd.${MDT_DEV}.lfsck_layout |
1417 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1419 error "(2) unexpected status"
# Exactly one repair per file created above is expected.
1422 local repaired=$($SHOW_LAYOUT |
1423 awk '/^repaired_others/ { print $2 }')
1424 [ $repaired -eq 32 ] ||
1425 error "(3) Fail to repair crashed lmm_oi: $repaired"
# Hand test 13 and its description to run_test.
1427 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14 body: files created under fail_loc 0x1610 on ost1 must be found
# as dangling references by a first layout LFSCK run, and be usable again
# after a second run started with -c.
1431 echo "The OST-object referenced by the MDT-object should be there;"
1432 echo "otherwise, the LFSCK should re-create the missing OST-object."
1435 check_mount_and_prep
1436 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1438 echo "Inject failure stub to simulate dangling referenced MDT-object"
1439 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1440 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create count + 31 files plus "guard" while the fault is set.
1441 local count=$(precreated_ost_obj_count 0 0)
1443 createmany -o $DIR/$tdir/f $((count + 31))
1444 touch $DIR/$tdir/guard
1445 do_facet ost1 $LCTL set_param fail_loc=0
1447 start_full_debug_logging
1449 # exhaust other pre-created dangling cases
1450 count=$(precreated_ost_obj_count 0 0)
1451 createmany -o $DIR/$tdir/a $count ||
1452 error "(0) Fail to create $count files."
# A long listing of the directory is expected to fail at this point.
1454 echo "'ls' should fail because of dangling referenced MDT-object"
1455 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First run: started with -r only.
1457 echo "Trigger layout LFSCK to find out dangling reference"
1458 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1460 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1461 mdd.${MDT_DEV}.lfsck_layout |
1462 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1464 error "(3) unexpected status"
# At least 32 dangling references must be reported.
1467 local repaired=$($SHOW_LAYOUT |
1468 awk '/^repaired_dangling/ { print $2 }')
1469 [ $repaired -ge 32 ] ||
1470 error "(4) Fail to repair dangling reference: $repaired"
# After the first run the guard file must still fail to stat.
1472 echo "'stat' should fail because of not repair dangling by default"
1473 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second run: started with -r -c.
1475 echo "Trigger layout LFSCK to repair dangling reference"
1476 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1478 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1479 mdd.${MDT_DEV}.lfsck_layout |
1480 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1482 error "(7) unexpected status"
1485 # There may be some async LFSCK updates in processing, wait for
1486 # a while until the target repair has been done. LU-4970.
# Poll stat on the client until the reported Size of guard is 0.
1488 echo "'stat' should success after layout LFSCK repairing"
1489 wait_update_facet client "stat $DIR/$tdir/guard |
1490 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1491 stat $DIR/$tdir/guard
1493 error "(8) unexpected size"
1496 repaired=$($SHOW_LAYOUT |
1497 awk '/^repaired_dangling/ { print $2 }')
1498 [ $repaired -ge 32 ] ||
1499 error "(9) Fail to repair dangling reference: $repaired"
1501 stop_full_debug_logging
# Hand test 14 and its description to run_test.
1503 run_test 14 "LFSCK can repair MDT-object with dangling reference"
# Test 15a body: one file written under fail_loc 0x1611 on ost1 must be
# counted as exactly one "repaired_unmatched_pair" by a layout LFSCK run.
1507 echo "If the OST-object referenced by the MDT-object back points"
1508 echo "to some non-exist MDT-object, then the LFSCK should repair"
1509 echo "the OST-object to back point to the right MDT-object."
1512 check_mount_and_prep
1513 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1515 echo "Inject failure stub to make the OST-object to back point to"
1516 echo "non-exist MDT-object."
1517 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
# Write 1 MiB to f0 and drop the client's osc locks while the fault is
# set, then clear the fault.
1519 do_facet ost1 $LCTL set_param fail_loc=0x1611
1520 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1521 cancel_lru_locks osc
1522 do_facet ost1 $LCTL set_param fail_loc=0
1524 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1525 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1527 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1528 mdd.${MDT_DEV}.lfsck_layout |
1529 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1531 error "(2) unexpected status"
1534 local repaired=$($SHOW_LAYOUT |
1535 awk '/^repaired_unmatched_pair/ { print $2 }')
1536 [ $repaired -eq 1 ] ||
1537 error "(3) Fail to repair unmatched pair: $repaired"
# Hand test 15a and its description to run_test.
1539 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b body: like 15a, but a "guard" file is written first without
# any fault, and f0 is then written under fail_loc 0x1612. Exactly one
# "repaired_unmatched_pair" is expected.
1543 echo "If the OST-object referenced by the MDT-object back points"
1544 echo "to other MDT-object that doesn't recognize the OST-object,"
1545 echo "then the LFSCK should repair it to back point to the right"
1546 echo "MDT-object (the first one)."
1549 check_mount_and_prep
1550 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# Reference file written before the fault is set.
1551 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1552 cancel_lru_locks osc
1554 echo "Inject failure stub to make the OST-object to back point to"
1555 echo "other MDT-object"
1557 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1558 do_facet ost1 $LCTL set_param fail_loc=0x1612
1559 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1560 cancel_lru_locks osc
1561 do_facet ost1 $LCTL set_param fail_loc=0
1563 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1564 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1566 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1567 mdd.${MDT_DEV}.lfsck_layout |
1568 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1570 error "(2) unexpected status"
1573 local repaired=$($SHOW_LAYOUT |
1574 awk '/^repaired_unmatched_pair/ { print $2 }')
1575 [ $repaired -eq 1 ] ||
1576 error "(3) Fail to repair unmatched pair: $repaired"
# Hand test 15b and its description to run_test.
1578 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c body: run a layout LFSCK on all targets while a directory
# migration from MDT1 to MDT0 is being delayed by fail_loc 0x1803, and
# check the repair counters afterwards.
# $LTIME is not assigned on any line of this block as shown here.
1581 [ $MDSCOUNT -lt 2 ] &&
1582 skip "We need at least 2 MDSes for this test" && return
1585 echo "According to current metadata migration implementation,"
1586 echo "before the old MDT-object is removed, both the new MDT-object"
1587 echo "and old MDT-object will reference the same LOV layout. Then if"
1588 echo "the layout LFSCK finds the new MDT-object by race, it will"
1589 echo "regard related OST-object(s) as multiple referenced case, and"
1590 echo "will try to create new OST-object(s) for the new MDT-object."
1591 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1592 echo "MDT-object before confirm the multiple referenced case."
1595 check_mount_and_prep
# Directory a1 on MDT index 1 with a single-stripe 1 MiB file on OST 0.
1596 $LFS mkdir -i 1 $DIR/$tdir/a1
1597 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1598 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1599 cancel_lru_locks osc
1601 echo "Inject failure stub on MDT1 to delay the migration"
1603 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1604 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# The migration is started in the background so that the scan below can
# run concurrently with it.
1605 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1606 $LFS mv -M 0 $DIR/$tdir/a1 &
1609 echo "Trigger layout LFSCK to race with the migration"
1610 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1612 for k in $(seq $MDSCOUNT); do
1613 # The LFSCK status query interval is 30 seconds. For the case
1614 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1615 # time to guarantee the status sync up.
1616 wait_update_facet mds${k} "$LCTL get_param -n \
1617 mdd.$(facet_svc mds${k}).lfsck_layout |
1618 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1619 error "(2) MDS${k} is not the expected 'completed'"
# Clear the fault, then expect one unmatched-pair repair and no
# multiple-reference repair.
1622 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1623 local repaired=$($SHOW_LAYOUT |
1624 awk '/^repaired_unmatched_pair/ { print $2 }')
1625 [ $repaired -eq 1 ] ||
1626 error "(3) Fail to repair unmatched pair: $repaired"
1628 repaired=$($SHOW_LAYOUT |
1629 awk '/^repaired_multiple_referenced/ { print $2 }')
1630 [ $repaired -eq 0 ] ||
1631 error "(4) Unexpectedly repaird multiple references: $repaired"
# Hand test 15c and its description to run_test.
1633 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16 body: a chown performed under fail_loc 0x1613 on the MDS must
# be counted as exactly one "repaired_inconsistent_owner" by a layout
# LFSCK run.
1637 echo "If the OST-object's owner information does not match the owner"
1638 echo "information stored in the MDT-object, then the LFSCK trust the"
1639 echo "MDT-object and update the OST-object's owner information."
1642 check_mount_and_prep
1643 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1644 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1645 cancel_lru_locks osc
1647 echo "Inject failure stub to skip OST-object owner changing"
1648 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
# Change owner and group of f0 to 1 while the fault is set, then clear it.
1649 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1650 chown 1.1 $DIR/$tdir/f0
1651 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1653 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1656 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1658 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1659 mdd.${MDT_DEV}.lfsck_layout |
1660 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1662 error "(2) unexpected status"
1665 local repaired=$($SHOW_LAYOUT |
1666 awk '/^repaired_inconsistent_owner/ { print $2 }')
1667 [ $repaired -eq 1 ] ||
1668 error "(3) Fail to repair inconsistent owner: $repaired"
# Hand test 16 and its description to run_test.
1670 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17 body: with fail_loc 0x1614 set, f0 ends up showing the size of
# "guard"; a layout LFSCK run must report one
# "repaired_multiple_referenced", after which writing f0 must leave the
# size of guard unchanged.
1674 echo "If more than one MDT-objects reference the same OST-object,"
1675 echo "and the OST-object only recognizes one MDT-object, then the"
1676 echo "LFSCK should create new OST-objects for such non-recognized"
1680 check_mount_and_prep
1681 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1683 echo "Inject failure stub to make two MDT-objects to refernce"
1684 echo "the OST-object"
1686 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1687 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# guard (1 MiB) and f0 are both created while the fault is set.
1689 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1690 cancel_lru_locks osc
1692 createmany -o $DIR/$tdir/f 1
1694 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Drop the client's cached mdc and osc locks before checking the size.
1696 cancel_lru_locks mdc
1697 cancel_lru_locks osc
# Field 5 of `ls -l` is the size; f0 is expected to show 1048576 here
# although nothing was written to it.
1699 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1700 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1701 [ $size -eq 1048576 ] ||
1702 error "(1) f0 (wrong) size should be 1048576, but got $size"
1704 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1707 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1709 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1710 mdd.${MDT_DEV}.lfsck_layout |
1711 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1713 error "(3) unexpected status"
1716 local repaired=$($SHOW_LAYOUT |
1717 awk '/^repaired_multiple_referenced/ { print $2 }')
1718 [ $repaired -eq 1 ] ||
1719 error "(4) Fail to repair multiple references: $repaired"
# Write 2 MiB to f0; guard must still be 1048576 bytes afterwards.
1721 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1722 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1723 error "(5) Fail to write f0."
1724 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1725 [ $size -eq 1048576 ] ||
1726 error "(6) guard size should be 1048576, but got $size"
# Hand test 17 and its description to run_test.
1728 run_test 17 "LFSCK can repair multiple references"
# Top-level statement: add "cache" to the debug setting via $LCTL on the
# node running this script, discarding the command's output.
1730 $LCTL set_param debug=+cache > /dev/null
# Test 18a body: files are chown'ed under fail_loc 0x1615 so that their
# size reads wrong; a layout LFSCK run started with -o must report the
# expected repaired_orphan counts and the sizes must be correct again.
1734 echo "The target MDT-object is there, but related stripe information"
1735 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1736 echo "layout EA entries."
1739 check_mount_and_prep
# a1/f1: MDT index 0, one stripe on OST index 0, 1M stripe size, 2 MiB.
1740 $LFS mkdir -i 0 $DIR/$tdir/a1
1741 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1742 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# Field 6 of `ls -il` is the size (the inode number comes first).
1744 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
# Print FID and stripe information (output is not captured).
1746 $LFS path2fid $DIR/$tdir/a1/f1
1747 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDSes: a2/f2 on MDT index 1, two stripes starting at
# OST index 1, also 2 MiB.
1749 if [ $MDSCOUNT -ge 2 ]; then
1750 $LFS mkdir -i 1 $DIR/$tdir/a2
1751 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1752 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1753 $LFS path2fid $DIR/$tdir/a2/f2
1754 $LFS getstripe $DIR/$tdir/a2/f2
1757 cancel_lru_locks osc
1759 echo "Inject failure, to make the MDT-object lost its layout EA"
1760 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
# The chown calls below run while fail_loc 0x1615 is set on the MDS that
# holds each file.
1761 do_facet mds1 $LCTL set_param fail_loc=0x1615
1762 chown 1.1 $DIR/$tdir/a1/f1
1764 if [ $MDSCOUNT -ge 2 ]; then
1765 do_facet mds2 $LCTL set_param fail_loc=0x1615
1766 chown 1.1 $DIR/$tdir/a2/f2
# Clear the fault on every MDS used above.
1772 do_facet mds1 $LCTL set_param fail_loc=0
1773 if [ $MDSCOUNT -ge 2 ]; then
1774 do_facet mds2 $LCTL set_param fail_loc=0
1777 cancel_lru_locks mdc
1778 cancel_lru_locks osc
# Both files must now report a size different from saved_size.
1780 echo "The file size should be incorrect since layout EA is lost"
1781 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1782 [ "$cur_size" != "$saved_size" ] ||
1783 error "(1) Expect incorrect file1 size"
1785 if [ $MDSCOUNT -ge 2 ]; then
1786 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1787 [ "$cur_size" != "$saved_size" ] ||
1788 error "(2) Expect incorrect file2 size"
1791 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1792 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
1794 for k in $(seq $MDSCOUNT); do
1795 # The LFSCK status query interval is 30 seconds. For the case
1796 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1797 # time to guarantee the status sync up.
1798 wait_update_facet mds${k} "$LCTL get_param -n \
1799 mdd.$(facet_svc mds${k}).lfsck_layout |
1800 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1801 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well (checked once, no waiting).
1804 for k in $(seq $OSTCOUNT); do
1805 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1806 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1807 awk '/^status/ { print $2 }')
1808 [ "$cur_status" == "completed" ] ||
1809 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counts: 1 on mds1 (f1 has one stripe) and 2 on
# mds2 (f2 has two stripes).
1812 local repaired=$(do_facet mds1 $LCTL get_param -n \
1813 mdd.$(facet_svc mds1).lfsck_layout |
1814 awk '/^repaired_orphan/ { print $2 }')
1815 [ $repaired -eq 1 ] ||
1816 error "(6.1) Expect 1 fixed on mds1, but got: $repaired"
1818 if [ $MDSCOUNT -ge 2 ]; then
1819 repaired=$(do_facet mds2 $LCTL get_param -n \
1820 mdd.$(facet_svc mds2).lfsck_layout |
1821 awk '/^repaired_orphan/ { print $2 }')
1822 [ $repaired -eq 2 ] ||
1823 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print FID and stripe information again after the scan.
1826 $LFS path2fid $DIR/$tdir/a1/f1
1827 $LFS getstripe $DIR/$tdir/a1/f1
1829 if [ $MDSCOUNT -ge 2 ]; then
1830 $LFS path2fid $DIR/$tdir/a2/f2
1831 $LFS getstripe $DIR/$tdir/a2/f2
# Both files must report saved_size again.
1834 echo "The file size should be correct after layout LFSCK scanning"
1835 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1836 [ "$cur_size" == "$saved_size" ] ||
1837 error "(7) Expect file1 size $saved_size, but got $cur_size"
1839 if [ $MDSCOUNT -ge 2 ]; then
1840 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1841 [ "$cur_size" == "$saved_size" ] ||
1842 error "(8) Expect file2 size $saved_size, but got $cur_size"
# Hand test 18a and its description to run_test.
1845 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test 18b body: files are removed under fail_loc 0x1616; a layout LFSCK
# run started with -o must report the expected repaired_orphan counts,
# and the entries named <fid>-R-0 under .lustre/lost+found/MDTxxxx must
# be movable back to their old paths with the original size.
1849 echo "The target MDT-object is lost. The LFSCK should re-create the"
1850 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
1851 echo "can move it back to normal namespace manually."
1854 check_mount_and_prep
# a1/f1: MDT index 0, one stripe on OST index 0, 2 MiB. Size and FID are
# recorded for the checks at the end.
1855 $LFS mkdir -i 0 $DIR/$tdir/a1
1856 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1857 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1858 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1859 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
1861 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDSes: a2/f2 on MDT index 1, two stripes starting at
# OST index 1.
1863 if [ $MDSCOUNT -ge 2 ]; then
1864 $LFS mkdir -i 1 $DIR/$tdir/a2
1865 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1866 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1867 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
1869 $LFS getstripe $DIR/$tdir/a2/f2
1872 cancel_lru_locks osc
1874 echo "Inject failure, to simulate the case of missing the MDT-object"
1875 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
# The rm calls below run while fail_loc 0x1616 is set on the MDS that
# holds each file.
1876 do_facet mds1 $LCTL set_param fail_loc=0x1616
1877 rm -f $DIR/$tdir/a1/f1
1879 if [ $MDSCOUNT -ge 2 ]; then
1880 do_facet mds2 $LCTL set_param fail_loc=0x1616
1881 rm -f $DIR/$tdir/a2/f2
# Clear the fault on every MDS used above.
1887 do_facet mds1 $LCTL set_param fail_loc=0
1888 if [ $MDSCOUNT -ge 2 ]; then
1889 do_facet mds2 $LCTL set_param fail_loc=0
1892 cancel_lru_locks mdc
1893 cancel_lru_locks osc
1895 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1896 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1898 for k in $(seq $MDSCOUNT); do
1899 # The LFSCK status query interval is 30 seconds. For the case
1900 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1901 # time to guarantee the status sync up.
1902 wait_update_facet mds${k} "$LCTL get_param -n \
1903 mdd.$(facet_svc mds${k}).lfsck_layout |
1904 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1905 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well (checked once, no waiting).
1908 for k in $(seq $OSTCOUNT); do
1909 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1910 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1911 awk '/^status/ { print $2 }')
1912 [ "$cur_status" == "completed" ] ||
1913 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counts: 1 on mds1 and 2 on mds2.
1916 local repaired=$(do_facet mds1 $LCTL get_param -n \
1917 mdd.$(facet_svc mds1).lfsck_layout |
1918 awk '/^repaired_orphan/ { print $2 }')
1919 [ $repaired -eq 1 ] ||
1920 error "(4.1) Expect 1 fixed on mds1, but got: $repaired"
1922 if [ $MDSCOUNT -ge 2 ]; then
1923 repaired=$(do_facet mds2 $LCTL get_param -n \
1924 mdd.$(facet_svc mds2).lfsck_layout |
1925 awk '/^repaired_orphan/ { print $2 }')
1926 [ $repaired -eq 2 ] ||
1927 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# Move the entries named <fid>-R-0 back to the original file names.
1930 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
1931 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
1932 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
1934 if [ $MDSCOUNT -ge 2 ]; then
1935 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
1936 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Print FID and stripe information of the restored files.
1939 $LFS path2fid $DIR/$tdir/a1/f1
1940 $LFS getstripe $DIR/$tdir/a1/f1
1942 if [ $MDSCOUNT -ge 2 ]; then
1943 $LFS path2fid $DIR/$tdir/a2/f2
1944 $LFS getstripe $DIR/$tdir/a2/f2
# Both restored files must report saved_size.
1947 echo "The file size should be correct after layout LFSCK scanning"
1948 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1949 [ "$cur_size" == "$saved_size" ] ||
1950 error "(7) Expect file1 size $saved_size, but got $cur_size"
1952 if [ $MDSCOUNT -ge 2 ]; then
1953 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1954 [ "$cur_size" == "$saved_size" ] ||
1955 error "(8) Expect file2 size $saved_size, but got $cur_size"
# Hand test 18b and its description to run_test.
1958 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test 18c: orphan OST-object repair, case 3. Per the banner echoed below,
# the MDT-object is lost and the OST-object carries no parent FID, so the
# layout LFSCK is expected to re-create the MDT-object with a new FID under
# .lustre/lost+found/MDTxxxx. The checks at the end look for "*-N-*" stubs.
1962 echo "The target MDT-object is lost, and the OST-object FID is missing."
1963 echo "The LFSCK should re-create the MDT-object with new FID under the "
1964 echo "directory .lustre/lost+found/MDTxxxx."
1967 check_mount_and_prep
1968 $LFS mkdir -i 0 $DIR/$tdir/a1
1969 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
# fail_loc 0x1617 is armed on ost1 before the files are written, so the
# objects created by the dd commands below are the ones affected.
1971 echo "Inject failure, to simulate the case of missing parent FID"
1972 #define OBD_FAIL_LFSCK_NOPFID 0x1617
1973 do_facet ost1 $LCTL set_param fail_loc=0x1617
1975 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1976 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDTs a second file is created under a directory on MDT
# index 1, still striped on OST index 0.
1978 if [ $MDSCOUNT -ge 2 ]; then
1979 $LFS mkdir -i 1 $DIR/$tdir/a2
1980 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a2
1981 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1982 $LFS getstripe $DIR/$tdir/a2/f2
1985 cancel_lru_locks osc
# The files are removed while fail_loc 0x1616 is set on the owning MDS.
1987 echo "Inject failure, to simulate the case of missing the MDT-object"
1988 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1989 do_facet mds1 $LCTL set_param fail_loc=0x1616
1990 rm -f $DIR/$tdir/a1/f1
1992 if [ $MDSCOUNT -ge 2 ]; then
1993 do_facet mds2 $LCTL set_param fail_loc=0x1616
1994 rm -f $DIR/$tdir/a2/f2
# Disarm the fault injection on every MDS that was touched.
2000 do_facet mds1 $LCTL set_param fail_loc=0
2001 if [ $MDSCOUNT -ge 2 ]; then
2002 do_facet mds2 $LCTL set_param fail_loc=0
2005 cancel_lru_locks mdc
2006 cancel_lru_locks osc
# NOTE(review): "-r -o" are presumably lfsck_start's reset and
# orphan-handling switches -- confirm against the lctl documentation.
2008 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2009 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2011 for k in $(seq $MDSCOUNT); do
2012 # The LFSCK status query interval is 30 seconds. For the case
2013 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2014 # time to guarantee the status sync up.
2015 wait_update_facet mds${k} "$LCTL get_param -n \
2016 mdd.$(facet_svc mds${k}).lfsck_layout |
2017 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2018 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report status "completed" as well; this is a single read,
# not a polling wait.
2021 for k in $(seq $OSTCOUNT); do
2022 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2023 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2024 awk '/^status/ { print $2 }')
2025 [ "$cur_status" == "completed" ] ||
2026 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# $expected is assigned on lines that are not part of this listing; it is
# compared against the repaired_orphan counter reported by mds1.
2029 if [ $MDSCOUNT -ge 2 ]; then
2035 local repaired=$(do_facet mds1 $LCTL get_param -n \
2036 mdd.$(facet_svc mds1).lfsck_layout |
2037 awk '/^repaired_orphan/ { print $2 }')
2038 [ $repaired -eq $expected ] ||
2039 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# mds2 is expected to report zero repaired orphans.
2041 if [ $MDSCOUNT -ge 2 ]; then
2042 repaired=$(do_facet mds2 $LCTL get_param -n \
2043 mdd.$(facet_svc mds2).lfsck_layout |
2044 awk '/^repaired_orphan/ { print $2 }')
2045 [ $repaired -eq 0 ] ||
2046 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2049 ls -ail $MOUNT/.lustre/lost+found/
# The -name pattern is quoted so that the shell hands it to find verbatim;
# unquoted, it would be expanded against the current directory first
# whenever a file matching *-N-* happens to exist there.
2051 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2052 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2053 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
2055 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2058 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2059 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2060 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Same quoting rule as above for the MDT0000 lookup.
2062 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
2063 [ ! -z "$cname" ] ||
2064 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2066 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d: orphan OST-object repair, case 4. Per the banner echoed below,
# the layout EA slot of the MDT-object is taken by an OST-object that was
# newly created while repairing a dangling reference and has never been
# modified; the orphan is expected to take that slot back. The test checks
# that f2 regains its original size after the scan.
2070 echo "The target MDT-object layout EA slot is occpuied by some new"
2071 echo "created OST-object when repair dangling reference case. Such"
2072 echo "conflict OST-object has never been modified. Then when found"
2073 echo "the orphan OST-object, LFSCK will replace it with the orphan"
2077 check_mount_and_prep
2079 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2080 echo "guard" > $DIR/$tdir/a1/f1
2081 echo "foo" > $DIR/$tdir/a1/f2
# Field 6 of "ls -il" output is the file size; it is remembered so the
# post-repair size can be compared against it.
2082 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2083 $LFS path2fid $DIR/$tdir/a1/f1
2084 $LFS getstripe $DIR/$tdir/a1/f1
2085 $LFS path2fid $DIR/$tdir/a1/f2
2086 $LFS getstripe $DIR/$tdir/a1/f2
2087 cancel_lru_locks osc
2089 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2090 echo "to reference the same OST-object (which is f1's OST-obejct)."
2091 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2092 echo "dangling reference case, but f2's old OST-object is there."
# The chown on f2 and the removal of f1 both run while fail_loc 0x1618 is
# set on $SINGLEMDS.
2095 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2096 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2097 chown 1.1 $DIR/$tdir/a1/f2
2098 rm -f $DIR/$tdir/a1/f1
2101 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2103 echo "stopall to cleanup object cache"
2106 setupall > /dev/null
# Sanity check that the injection took effect: f2's size must now differ
# from the saved one.
2108 echo "The file size should be incorrect since dangling referenced"
2109 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2110 [ "$cur_size" != "$saved_size" ] ||
2111 error "(1) Expect incorrect file2 size"
# fail_loc 0x1602 with fail_val=5 is set before the scan starts, and the
# test then waits for "scanning-phase2" before clearing it.
2113 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2114 do_facet $SINGLEMDS $LCTL set_param fail_val=5 fail_loc=0x1602
# NOTE(review): "-c" presumably asks LFSCK to create the missing OST-object
# for a dangling reference, matching the banner text -- confirm.
2116 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2117 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2119 wait_update_facet mds1 "$LCTL get_param -n \
2120 mdd.$(facet_svc mds1).lfsck_layout |
2121 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2122 error "(3.0) MDS1 is not the expected 'scanning-phase2'"
2124 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2126 for k in $(seq $MDSCOUNT); do
2127 # The LFSCK status query interval is 30 seconds. For the case
2128 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2129 # time to guarantee the status sync up.
2130 wait_update_facet mds${k} "$LCTL get_param -n \
2131 mdd.$(facet_svc mds${k}).lfsck_layout |
2132 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2133 error "(3) MDS${k} is not the expected 'completed'"
# Every OST must report status "completed" too (single read, no polling).
2136 for k in $(seq $OSTCOUNT); do
2137 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2138 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2139 awk '/^status/ { print $2 }')
2140 [ "$cur_status" == "completed" ] ||
2141 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Exactly one repaired orphan is expected on $SINGLEMDS.
2144 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2145 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2146 awk '/^repaired_orphan/ { print $2 }')
2147 [ $repaired -eq 1 ] ||
2148 error "(5) Expect 1 orphan has been fixed, but got: $repaired"
2150 echo "The file size should be correct after layout LFSCK scanning"
2151 cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2152 [ "$cur_size" == "$saved_size" ] ||
2153 error "(6) Expect file2 size $saved_size, but got $cur_size"
# The remaining commands only print f2's content, FID and layout to the
# log; their results are not asserted.
2155 echo "The LFSCK should find back the original data."
2156 cat $DIR/$tdir/a1/f2
2157 $LFS path2fid $DIR/$tdir/a1/f2
2158 $LFS getstripe $DIR/$tdir/a1/f2
2160 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e: orphan OST-object repair, case 5. Same setup as the previous
# case, but f2 is appended to while the scan sits in "scanning-phase2", so
# the conflicting OST-object has been modified. Per the banner echoed below,
# LFSCK is then expected to keep the new data in f2 and create a separate
# file for the old orphan, which the test looks up as a "*-C-*" stub under
# .lustre/lost+found/MDT0000/.
2164 echo "The target MDT-object layout EA slot is occpuied by some new"
2165 echo "created OST-object when repair dangling reference case. Such"
2166 echo "conflict OST-object has been modified by others. To keep the"
2167 echo "new data, the LFSCK will create a new file to refernece this"
2168 echo "old orphan OST-object."
2171 check_mount_and_prep
2173 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2174 echo "guard" > $DIR/$tdir/a1/f1
2175 echo "foo" > $DIR/$tdir/a1/f2
# Field 6 of "ls -il" output is the file size; it is compared with the
# size of the recovered stub at the end of the test.
2176 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2177 $LFS path2fid $DIR/$tdir/a1/f1
2178 $LFS getstripe $DIR/$tdir/a1/f1
2179 $LFS path2fid $DIR/$tdir/a1/f2
2180 $LFS getstripe $DIR/$tdir/a1/f2
2181 cancel_lru_locks osc
2183 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2184 echo "to reference the same OST-object (which is f1's OST-obejct)."
2185 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2186 echo "dangling reference case, but f2's old OST-object is there."
# The chown on f2 and the removal of f1 both run while fail_loc 0x1618 is
# set on $SINGLEMDS.
2189 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2190 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2191 chown 1.1 $DIR/$tdir/a1/f2
2192 rm -f $DIR/$tdir/a1/f1
2195 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2197 echo "stopall to cleanup object cache"
2200 setupall > /dev/null
# Sanity check that the injection took effect: f2's size must now differ
# from the saved one.
2202 echo "The file size should be incorrect since dangling referenced"
2203 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2204 [ "$cur_size" != "$saved_size" ] ||
2205 error "(1) Expect incorrect file2 size"
# fail_loc 0x1602 with fail_val=10 is set before the scan starts; it is
# cleared only after f2 has been appended to below.
2207 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2208 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2210 start_full_debug_logging
2212 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2213 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2215 wait_update_facet mds1 "$LCTL get_param -n \
2216 mdd.$(facet_svc mds1).lfsck_layout |
2217 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2218 error "(3) MDS1 is not the expected 'scanning-phase2'"
2220 # to guarantee all updates are synced.
2224 echo "Write new data to f2 to modify the new created OST-object."
2225 echo "dummy" >> $DIR/$tdir/a1/f2
2227 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2229 for k in $(seq $MDSCOUNT); do
2230 # The LFSCK status query interval is 30 seconds. For the case
2231 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2232 # time to guarantee the status sync up.
2233 wait_update_facet mds${k} "$LCTL get_param -n \
2234 mdd.$(facet_svc mds${k}).lfsck_layout |
2235 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2236 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must report status "completed" too (single read, no polling).
2239 for k in $(seq $OSTCOUNT); do
2240 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2241 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2242 awk '/^status/ { print $2 }')
2243 [ "$cur_status" == "completed" ] ||
2244 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2247 stop_full_debug_logging
# Exactly one repaired orphan is expected on $SINGLEMDS.
2249 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2250 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2251 awk '/^repaired_orphan/ { print $2 }')
2252 [ $repaired -eq 1 ] ||
2253 error "(6) Expect 1 orphan has been fixed, but got: $repaired"
2255 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2256 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2257 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The -name pattern is quoted so that the shell hands it to find verbatim;
# unquoted, it would be expanded against the current directory first
# whenever a file matching *-C-* happens to exist there.
2259 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*')
2260 [ ! -z "$cname" ] ||
2261 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2263 echo "The stub file should keep the original f2 data"
2264 cur_size=$(ls -il $cname | awk '{ print $6 }')
2265 [ "$cur_size" == "$saved_size" ] ||
2266 error "(9) Expect file2 size $saved_size, but got $cur_size"
# The remaining commands only print FIDs, layouts and f2's content to the
# log; their results are not asserted.
2269 $LFS path2fid $cname
2270 $LFS getstripe $cname
2272 echo "The f2 should contains new data."
2273 cat $DIR/$tdir/a1/f2
2274 $LFS path2fid $DIR/$tdir/a1/f2
2275 $LFS getstripe $DIR/$tdir/a1/f2
2277 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f: skip the failed OST(s) when handling orphan OST-objects.
# Needs at least 2 OSTs. Files are created with 1-stripe and 2-stripe
# layouts starting at OST index 0, their MDT-objects are dropped, and a
# first layout LFSCK runs with fail_loc 0x161c set on mds1. That pass is
# expected to end "partial" on the MDSes and ost1 and "completed" on ost2,
# with 1 repaired orphan per MDT. A second, fault-free pass is expected to
# complete everywhere with 2 repaired orphans per MDT.
2280 [ $OSTCOUNT -lt 2 ] &&
2281 skip "The test needs at least 2 OSTs" && return
2284 echo "The target MDT-object is lost. The LFSCK should re-create the"
2285 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2286 echo "to verify some OST-object(s) during the first stage-scanning,"
2287 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2288 echo "should not be affected."
2291 check_mount_and_prep
# a1 holds single-stripe files, a2 a two-stripe file; both on MDT index 0.
2292 $LFS mkdir -i 0 $DIR/$tdir/a1
2293 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2294 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2295 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2296 $LFS mkdir -i 0 $DIR/$tdir/a2
2297 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2298 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2299 $LFS getstripe $DIR/$tdir/a1/f1
2300 $LFS getstripe $DIR/$tdir/a2/f2
# With two or more MDTs the same layout is mirrored on MDT index 1 as
# a3 and a4.
2302 if [ $MDSCOUNT -ge 2 ]; then
2303 $LFS mkdir -i 1 $DIR/$tdir/a3
2304 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a3
2305 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2306 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2307 $LFS mkdir -i 1 $DIR/$tdir/a4
2308 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2309 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2310 $LFS getstripe $DIR/$tdir/a3/f3
2311 $LFS getstripe $DIR/$tdir/a4/f4
2314 cancel_lru_locks osc
# The files are removed while fail_loc 0x1616 is set on the owning MDS.
2316 echo "Inject failure, to simulate the case of missing the MDT-object"
2317 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2318 do_facet mds1 $LCTL set_param fail_loc=0x1616
2319 rm -f $DIR/$tdir/a1/f1
2320 rm -f $DIR/$tdir/a2/f2
2322 if [ $MDSCOUNT -ge 2 ]; then
2323 do_facet mds2 $LCTL set_param fail_loc=0x1616
2324 rm -f $DIR/$tdir/a3/f3
2325 rm -f $DIR/$tdir/a4/f4
# Disarm the fault injection on every MDS that was touched.
2331 do_facet mds1 $LCTL set_param fail_loc=0
2332 if [ $MDSCOUNT -ge 2 ]; then
2333 do_facet mds2 $LCTL set_param fail_loc=0
2336 cancel_lru_locks mdc
2337 cancel_lru_locks osc
# NOTE(review): fail_val=0 presumably selects OST index 0 as the failing
# target, as the banner suggests -- confirm.
2339 echo "Inject failure, to simulate the OST0 fail to handle"
2340 echo "MDT0 LFSCK request during the first-stage scanning."
2341 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2342 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2344 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2345 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2347 for k in $(seq $MDSCOUNT); do
2348 # The LFSCK status query interval is 30 seconds. For the case
2349 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2350 # time to guarantee the status sync up.
2351 wait_update_facet mds${k} "$LCTL get_param -n \
2352 mdd.$(facet_svc mds${k}).lfsck_layout |
2353 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2354 error "(2) MDS${k} is not the expected 'partial'"
# First pass: ost1 must end "partial" while ost2 must end "completed".
2357 wait_update_facet ost1 "$LCTL get_param -n \
2358 obdfilter.$(facet_svc ost1).lfsck_layout |
2359 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2360 error "(3) OST1 is not the expected 'partial'"
2363 wait_update_facet ost2 "$LCTL get_param -n \
2364 obdfilter.$(facet_svc ost2).lfsck_layout |
2365 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2366 error "(4) OST2 is not the expected 'completed'"
2369 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the first pass each MDT is expected to report 1 repaired orphan.
2371 local repaired=$(do_facet mds1 $LCTL get_param -n \
2372 mdd.$(facet_svc mds1).lfsck_layout |
2373 awk '/^repaired_orphan/ { print $2 }')
2374 [ $repaired -eq 1 ] ||
2375 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2377 if [ $MDSCOUNT -ge 2 ]; then
2378 repaired=$(do_facet mds2 $LCTL get_param -n \
2379 mdd.$(facet_svc mds2).lfsck_layout |
2380 awk '/^repaired_orphan/ { print $2 }')
2381 [ $repaired -eq 1 ] ||
2382 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second pass, with no fault injected.
2385 echo "Trigger layout LFSCK on all devices again to cleanup"
2386 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2388 for k in $(seq $MDSCOUNT); do
2389 # The LFSCK status query interval is 30 seconds. For the case
2390 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2391 # time to guarantee the status sync up.
2392 wait_update_facet mds${k} "$LCTL get_param -n \
2393 mdd.$(facet_svc mds${k}).lfsck_layout |
2394 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2395 error "(8) MDS${k} is not the expected 'completed'"
# Every OST must now report "completed" (single read, no polling).
2398 for k in $(seq $OSTCOUNT); do
2399 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2400 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2401 awk '/^status/ { print $2 }')
2402 [ "$cur_status" == "completed" ] ||
2403 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# After the second pass each MDT is expected to report 2 repaired orphans.
2407 local repaired=$(do_facet mds1 $LCTL get_param -n \
2408 mdd.$(facet_svc mds1).lfsck_layout |
2409 awk '/^repaired_orphan/ { print $2 }')
2410 [ $repaired -eq 2 ] ||
2411 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2413 if [ $MDSCOUNT -ge 2 ]; then
2414 repaired=$(do_facet mds2 $LCTL get_param -n \
2415 mdd.$(facet_svc mds2).lfsck_layout |
2416 awk '/^repaired_orphan/ { print $2 }')
2417 [ $repaired -eq 2 ] ||
2418 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2421 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Adjust the local debug setting before the tests that follow; the command's
# output is discarded. NOTE(review): "-cache" presumably removes the "cache"
# flag from the current debug mask -- confirm against the lctl documentation.
2423 $LCTL set_param debug=-cache > /dev/null
# Test 19a: OST-object inconsistency self detect. lfsck_verify_pfid is set
# to 1 on ost1 and fail_loc 0x1619 is set on the local node, so that (per
# the banner below) the client offers a wrong parent FID; the read of a0
# must then fail.
2426 check_mount_and_prep
2427 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2429 echo "foo" > $DIR/$tdir/a0
2430 echo "guard" > $DIR/$tdir/a1
2431 cancel_lru_locks osc
2433 echo "Inject failure, then client will offer wrong parent FID when read"
2434 do_facet ost1 $LCTL set_param -n \
2435 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
# Unlike the OST-side setting above, this fail_loc is set without do_facet,
# i.e. on the node running the script.
2436 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2437 $LCTL set_param fail_loc=0x1619
# A successful cat is the failure case here.
2439 echo "Read RPC with wrong parent FID should be denied"
2440 cat $DIR/$tdir/a0 && error "(3) Read should be denied!"
2441 $LCTL set_param fail_loc=0
2443 run_test 19a "OST-object inconsistency self detect"
# Test 19b: OST-object inconsistency self repair. f0 is written while
# fail_loc 0x1611 is set on ost1. The "repaired" counter read from
# lfsck_verify_pfid must be 0 at first, and 1 after lfsck_verify_pfid has
# been set to 1 and f0 has been read once.
2446 check_mount_and_prep
2447 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2449 echo "Inject failure stub to make the OST-object to back point to"
2450 echo "non-exist MDT-object"
2452 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2453 do_facet ost1 $LCTL set_param fail_loc=0x1611
2454 echo "foo" > $DIR/$tdir/f0
2455 cancel_lru_locks osc
2456 do_facet ost1 $LCTL set_param fail_loc=0
# The counter is the second field of the line starting with "repaired" in
# the lfsck_verify_pfid parameter output.
2458 echo "Nothing should be fixed since self detect and repair is disabled"
2459 local repaired=$(do_facet ost1 $LCTL get_param -n \
2460 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2461 awk '/^repaired/ { print $2 }')
2462 [ $repaired -eq 0 ] ||
2463 error "(1) Expected 0 repaired, but got $repaired"
2465 echo "Read RPC with right parent FID should be accepted,"
2466 echo "and cause parent FID on OST to be fixed"
# Enable verification on ost1, then read f0; this time the read must succeed.
2468 do_facet ost1 $LCTL set_param -n \
2469 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2470 cat $DIR/$tdir/f0 || error "(2) Read should not be denied!"
2472 repaired=$(do_facet ost1 $LCTL get_param -n \
2473 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2474 awk '/^repaired/ { print $2 }')
2475 [ $repaired -eq 1 ] ||
2476 error "(3) Expected 1 repaired, but got $repaired"
2478 run_test 19b "OST-object inconsistency self repair"
# Test 20: handle orphans with a dummy LOV EA slot. Needs at least 2 OSTs.
# f0 loses only its MDT-object; f1, f2 (and f3 when there are more than
# 2 OSTs) lose the MDT-object plus one stripe's OST-object. After a layout
# LFSCK the recovered ${fidN}-R-0 files under lost+found/MDT0000 are checked
# for the hole pattern flag, stripe count, size and read/write behaviour.
2481 [ $OSTCOUNT -lt 2 ] &&
2482 skip "The test needs at least 2 OSTs" && return
2485 echo "The target MDT-object and some of its OST-object are lost."
2486 echo "The LFSCK should find out the left OST-objects and re-create"
2487 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2488 echo "with the partial OST-objects (LOV EA hole)."
2490 echo "New client can access the file with LOV EA hole via normal"
2491 echo "system tools or commands without crash the system."
2493 echo "For old client, even though it cannot access the file with"
2494 echo "LOV EA hole, it should not cause the system crash."
2497 check_mount_and_prep
2498 $LFS mkdir -i 0 $DIR/$tdir/a1
# Three stripes when more than 2 OSTs are available, otherwise two.
2499 if [ $OSTCOUNT -gt 2 ]; then
2500 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
2503 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
# $bcount is assigned on lines that are not part of this listing; every
# size check below expects 4096 * bcount bytes unless stated otherwise.
2507 # 256 blocks on the stripe0.
2508 # 1 block on the stripe1 for 2 OSTs case.
2509 # 256 blocks on the stripe1 for other cases.
2510 # 1 block on the stripe2 if OSTs > 2
2511 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2512 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2513 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# The FIDs are saved because the recovered files are named after them.
2515 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2516 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2517 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2520 $LFS getstripe $DIR/$tdir/a1/f0
2522 $LFS getstripe $DIR/$tdir/a1/f1
2524 $LFS getstripe $DIR/$tdir/a1/f2
2526 if [ $OSTCOUNT -gt 2 ]; then
2527 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
2528 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2530 $LFS getstripe $DIR/$tdir/a1/f3
2533 cancel_lru_locks osc
2535 echo "Inject failure..."
2536 echo "To simulate f0 lost MDT-object"
2537 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2538 do_facet mds1 $LCTL set_param fail_loc=0x1616
2539 rm -f $DIR/$tdir/a1/f0
# From here on fail_loc stays 0x161a and only fail_val changes between the
# removals; per the banners it picks which stripe's OST-object is lost.
2541 echo "To simulate f1 lost MDT-object and OST-object0"
2542 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
2543 do_facet mds1 $LCTL set_param fail_loc=0x161a
2544 rm -f $DIR/$tdir/a1/f1
2546 echo "To simulate f2 lost MDT-object and OST-object1"
2547 do_facet mds1 $LCTL set_param fail_val=1
2548 rm -f $DIR/$tdir/a1/f2
2550 if [ $OSTCOUNT -gt 2 ]; then
2551 echo "To simulate f3 lost MDT-object and OST-object2"
2552 do_facet mds1 $LCTL set_param fail_val=2
2553 rm -f $DIR/$tdir/a1/f3
# The client is unmounted for the duration of the scan and mounted again
# before the recovered files are inspected.
2556 umount_client $MOUNT
2559 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2561 echo "Inject failure to slow down the LFSCK on OST0"
2562 #define OBD_FAIL_LFSCK_DELAY5 0x161b
2563 do_facet ost1 $LCTL set_param fail_loc=0x161b
2565 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2566 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2569 do_facet ost1 $LCTL set_param fail_loc=0
2571 for k in $(seq $MDSCOUNT); do
2572 # The LFSCK status query interval is 30 seconds. For the case
2573 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2574 # time to guarantee the status sync up.
2575 wait_update_facet mds${k} "$LCTL get_param -n \
2576 mdd.$(facet_svc mds${k}).lfsck_layout |
2577 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2578 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report status "completed" too (single read, no polling).
2581 for k in $(seq $OSTCOUNT); do
2582 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2583 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2584 awk '/^status/ { print $2 }')
2585 [ "$cur_status" == "completed" ] ||
2586 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan count on mds1: 9 with more than 2 OSTs, else 4.
2589 local repaired=$(do_facet mds1 $LCTL get_param -n \
2590 mdd.$(facet_svc mds1).lfsck_layout |
2591 awk '/^repaired_orphan/ { print $2 }')
2592 if [ $OSTCOUNT -gt 2 ]; then
2593 [ $repaired -eq 9 ] ||
2594 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
2596 [ $repaired -eq 4 ] ||
2597 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
2600 mount_client $MOUNT || error "(5.0) Fail to start client!"
# Bit tested in the value printed by "lfs getstripe -L" to detect a layout
# that contains a hole.
2602 LOV_PATTERN_F_HOLE=0x40000000
2605 # ${fid0}-R-0 is the old f0
2607 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
2608 echo "Check $name, which is the old f0"
2610 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
# f0 lost no OST-object, so no hole flag, full stripe count, full size and
# normal read/write/chown/touch/unlink are expected.
2612 local pattern=0x$($LFS getstripe -L $name)
2613 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2614 error "(5.2) NOT expect pattern flag hole, but got $pattern"
2616 local stripes=$($LFS getstripe -c $name)
2617 if [ $OSTCOUNT -gt 2 ]; then
2618 [ $stripes -eq 3 ] ||
2619 error "(5.3.1) expect the stripe count is 3, but got $stripes"
2621 [ $stripes -eq 2 ] ||
2622 error "(5.3.2) expect the stripe count is 2, but got $stripes"
2625 local size=$(stat $name | awk '/Size:/ { print $2 }')
2626 [ $size -eq $((4096 * $bcount)) ] ||
2627 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
2629 cat $name > /dev/null || error "(5.5) cannot read $name"
2631 echo "dummy" >> $name || error "(5.6) cannot write $name"
2633 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
2635 touch $name || error "(5.8) cannot touch $name"
2637 rm -f $name || error "(5.9) cannot unlink $name"
2640 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
2642 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2643 if [ $OSTCOUNT -gt 2 ]; then
2644 echo "Check $name, it contains the old f1's stripe1 and stripe2"
2646 echo "Check $name, it contains the old f1's stripe1"
2649 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
# f1 lost stripe 0, so the hole flag must be set and a plain read must fail.
2651 pattern=0x$($LFS getstripe -L $name)
2652 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2653 error "(6.2) expect pattern flag hole, but got $pattern"
2655 stripes=$($LFS getstripe -c $name)
2656 if [ $OSTCOUNT -gt 2 ]; then
2657 [ $stripes -eq 3 ] ||
2658 error "(6.3.1) expect the stripe count is 3, but got $stripes"
2660 [ $stripes -eq 2 ] ||
2661 error "(6.3.2) expect the stripe count is 2, but got $stripes"
2664 size=$(stat $name | awk '/Size:/ { print $2 }')
2665 [ $size -eq $((4096 * $bcount)) ] ||
2666 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
2668 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# dd with conv=sync,noerror keeps going past read errors; the number of
# "Input/output error" lines it prints is counted and must be 256.
2670 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2671 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2674 [ $failures -eq 256 ] ||
2675 error "(6.6) expect 256 IO failures, but get $failures"
2677 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2678 [ $size -eq $((4096 * $bcount)) ] ||
2679 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Writing at offset 0 must fail, writing at block 300 must succeed, and an
# append must fail.
2681 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
2682 error "(6.8) write to the LOV EA hole should fail"
2684 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
2685 error "(6.9) write to normal stripe should NOT fail"
2687 echo "foo" >> $name && error "(6.10) append write $name should fail"
2689 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
2691 touch $name || error "(6.12) cannot touch $name"
2693 rm -f $name || error "(6.13) cannot unlink $name"
2696 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
2698 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
2699 if [ $OSTCOUNT -gt 2 ]; then
2700 echo "Check $name, it contains the old f2's stripe0 and stripe2"
2702 echo "Check $name, it contains the old f2's stripe0"
2705 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
# f2 lost stripe 1. With more than 2 OSTs a hole is expected (3 stripes);
# otherwise the checks further down expect a plain 1-stripe file.
2707 pattern=0x$($LFS getstripe -L $name)
2708 stripes=$($LFS getstripe -c $name)
2709 size=$(stat $name | awk '/Size:/ { print $2 }')
2710 if [ $OSTCOUNT -gt 2 ]; then
2711 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2712 error "(7.2.1) expect pattern flag hole, but got $pattern"
2714 [ $stripes -eq 3 ] ||
2715 error "(7.3.1) expect the stripe count is 3, but got $stripes"
2717 [ $size -eq $((4096 * $bcount)) ] ||
2718 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
2720 cat $name > /dev/null &&
2721 error "(7.5.1) normal read $name should fail"
2723 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2724 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2726 [ $failures -eq 256 ] ||
2727 error "(7.6) expect 256 IO failures, but get $failures"
2729 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2730 [ $size -eq $((4096 * $bcount)) ] ||
2731 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
# Here the write at block 300 must fail and the write at offset 0 must
# succeed, the reverse of the f1 checks.
2733 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
2734 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
2736 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
2737 error "(7.8.1) write to normal stripe should NOT fail"
2739 echo "foo" >> $name &&
2740 error "(7.8.3) append write $name should fail"
2742 chown $RUNAS_ID:$RUNAS_GID $name ||
2743 error "(7.9.1) cannot chown on $name"
2745 touch $name || error "(7.10.1) cannot touch $name"
2747 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2748 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
2750 [ $stripes -eq 1 ] ||
2751 error "(7.3.2) expect the stripe count is 1, but got $stripes"
2754 [ $size -eq $((4096 * (256 + 0))) ] ||
2755 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
2757 cat $name > /dev/null || error "(7.5.2) cannot read $name"
2759 echo "dummy" >> $name || error "(7.8.2) cannot write $name"
2761 chown $RUNAS_ID:$RUNAS_GID $name ||
2762 error "(7.9.2) cannot chown on $name"
2764 touch $name || error "(7.10.2) cannot touch $name"
2767 rm -f $name || error "(7.11) cannot unlink $name"
# The f3 checks below only apply when there are more than 2 OSTs.
2769 [ $OSTCOUNT -le 2 ] && return
2772 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
2774 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2775 echo "Check $name, which contains the old f3's stripe0 and stripe1"
2777 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
2779 pattern=0x$($LFS getstripe -L $name)
2780 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2781 error "(8.2) NOT expect pattern flag hole, but got $pattern"
2783 stripes=$($LFS getstripe -c $name)
2784 # LFSCK does not know the old f3 had 3 stripes.
2785 # It only tries to find as much as possible.
2786 # The stripe count depends on the last stripe's offset.
2787 [ $stripes -eq 2 ] ||
2788 error "(8.3) expect the stripe count is 2, but got $stripes"
2790 size=$(stat $name | awk '/Size:/ { print $2 }')
2792 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
2793 error "(8.4) expect the size $((4096 * 512)), but got $size"
2795 cat $name > /dev/null || error "(8.5) cannot read $name"
2797 echo "dummy" >> $name || error "(8.6) cannot write $name"
2799 chown $RUNAS_ID:$RUNAS_GID $name ||
2800 error "(8.7) cannot chown on $name"
2802 touch $name || error "(8.8) cannot touch $name"
2804 rm -f $name || error "(8.9) cannot unlink $name"
2806 run_test 20 "Handle the orphan with dummy LOV EA slot properly"
# Test 21: run all LFSCK components by default. Skipped when the MDS is
# older than 2.5.59. lfsck_start is invoked without "-t", and both the
# namespace and the layout component must then report "scanning-phase1";
# LFSCK is stopped again at the end.
2809 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
2810 skip "ignore the test if MDS is older than 2.5.59" && return
2812 check_mount_and_prep
2813 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
# NOTE(review): "-s 1" is presumably a speed limit that keeps the scan slow
# enough to still be in phase 1 when the status is read -- confirm.
2815 echo "Start all LFSCK components by default (-s 1)"
2816 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
2817 error "Fail to start LFSCK"
2819 echo "namespace LFSCK should be in 'scanning-phase1' status"
2820 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
2821 [ "$STATUS" == "scanning-phase1" ] ||
2822 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
2824 echo "layout LFSCK should be in 'scanning-phase1' status"
2825 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
2826 [ "$STATUS" == "scanning-phase1" ] ||
2827 error "Expect layout 'scanning-phase1', but got '$STATUS'"
2829 echo "Stop all LFSCK components by default"
2830 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
2831 error "Fail to stop LFSCK"
2833 run_test 21 "run all LFSCK components by default"
# Test 22a: repair unmatched pairs, case 1. Needs at least 2 MDSes.
# "guard" and "foo" are created on MDT index 1; "foo/dummy" is created on
# MDT index 0 while fail_loc 0x161e is set, which per the banners makes
# dummy's ".." entry reference guard. guard is then removed, so the
# referenced parent no longer exists. The namespace LFSCK must report one
# repaired unmatched pair, after which "ls" on dummy must succeed.
2836 [ $MDSCOUNT -lt 2 ] &&
2837 skip "We need at least 2 MDSes for this test" && return
# The inner double quotes around .. are escaped so that they are printed;
# unescaped they merely ended and restarted the string, and the banner
# showed a bare .. instead.
2840 echo "The parent_A references the child directory via some name entry,"
2841 echo "but the child directory back references another parent_B via its"
2842 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
2843 echo "LFSCK will repair the child directory's \"..\" name entry."
2846 check_mount_and_prep
2848 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2849 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2851 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2852 echo "The dummy's dotdot name entry references the guard."
2853 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2854 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2855 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2856 error "(3) Fail to mkdir on MDT0"
2857 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing guard leaves dummy's ".." pointing at a parent that is gone.
2859 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
# NOTE(review): "-A" presumably starts the scan on all MDTs -- confirm.
2861 echo "Trigger namespace LFSCK to repair unmatched pairs"
2862 $START_NAMESPACE -A -r ||
2863 error "(5) Fail to start LFSCK for namespace"
2865 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2866 mdd.${MDT_DEV}.lfsck_namespace |
2867 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2869 error "(6) unexpected status"
# Exactly one repaired unmatched pair is expected.
2872 local repaired=$($SHOW_NAMESPACE |
2873 awk '/^unmatched_pairs_repaired/ { print $2 }')
2874 [ $repaired -eq 1 ] ||
2875 error "(7) Fail to repair unmatched pairs: $repaired"
2877 echo "'ls' should success after namespace LFSCK repairing"
2878 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
2879 error "(8) ls should success."
2881 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Test 22b: repair unmatched pairs, case 2. Needs at least 2 MDSes.
# Same injection as the previous case (fail_loc 0x161e while creating
# foo/dummy on MDT index 0), but guard is kept, so the parent referenced by
# dummy's ".." exists without holding the matching name entry. fid2path on
# dummy's FID must not resolve to foo/dummy before the namespace LFSCK and
# must resolve to it afterwards.
2884 [ $MDSCOUNT -lt 2 ] &&
2885 skip "We need at least 2 MDSes for this test" && return
# The inner double quotes around .. are escaped so that they are printed;
# unescaped they merely ended and restarted the string, and the banner
# showed a bare .. instead.
2888 echo "The parent_A references the child directory via the name entry_B,"
2889 echo "but the child directory back references another parent_C via its"
2890 echo "\"..\" name entry. The parent_C exists, but there is no the name"
2891 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
2892 echo "the child directory's \"..\" name entry and its linkEA."
2895 check_mount_and_prep
2897 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2898 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2900 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2901 echo "and bad linkEA. The dummy's dotdot name entry references the"
2902 echo "guard. The dummy's linkEA references n non-exist name entry."
2903 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2904 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2905 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2906 error "(3) Fail to mkdir on MDT0"
2907 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Before the repair, fid2path must not return the real path of dummy.
2909 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
2910 echo "fid2path should NOT work on the dummy's FID $dummyfid"
2911 local dummyname=$($LFS fid2path $DIR $dummyfid)
2912 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
2913 error "(4) fid2path works unexpectedly."
# NOTE(review): "-A" presumably starts the scan on all MDTs -- confirm.
2915 echo "Trigger namespace LFSCK to repair unmatched pairs"
2916 $START_NAMESPACE -A -r ||
2917 error "(5) Fail to start LFSCK for namespace"
2919 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2920 mdd.${MDT_DEV}.lfsck_namespace |
2921 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2923 error "(6) unexpected status"
# Exactly one repaired unmatched pair is expected.
2926 local repaired=$($SHOW_NAMESPACE |
2927 awk '/^unmatched_pairs_repaired/ { print $2 }')
2928 [ $repaired -eq 1 ] ||
2929 error "(7) Fail to repair unmatched pairs: $repaired"
# After the repair, fid2path must return the real path of dummy.
2931 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
2932 local dummyname=$($LFS fid2path $DIR $dummyfid)
2933 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
2934 error "(8) fid2path does not work"
2936 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a: namespace LFSCK handles a dangling name entry (the name exists
# but the MDT-object it refers to does not). The first LFSCK run only counts
# the dangling entry; a second run with the extra -C option is expected to
# make the entry usable again. Skipped with fewer than two MDSes.
2939 [ $MDSCOUNT -lt 2 ] &&
2940 skip "We need at least 2 MDSes for this test" && return
2943 echo "The name entry is there, but the MDT-object for such name "
2944 echo "entry does not exist. The namespace LFSCK should find out "
2945 echo "and repair the inconsistency as required."
2948 check_mount_and_prep
# d0 lives on MDT0, its child d1 on MDT1.
2950 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
2951 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
2953 echo "Inject failure stub on MDT1 to simulate dangling name entry"
2954 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
# The fail_loc is set on mds2 only for the duration of the rmdir.
2955 do_facet mds2 $LCTL set_param fail_loc=0x1620
2956 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
2957 do_facet mds2 $LCTL set_param fail_loc=0
2959 echo "'ls' should fail because of dangling name entry"
2960 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
2962 echo "Trigger namespace LFSCK to find out dangling name entry"
2963 $START_NAMESPACE -A -r ||
2964 error "(5) Fail to start LFSCK for namespace"
# Wait until the namespace LFSCK status on the MDS reads "completed".
2966 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2967 mdd.${MDT_DEV}.lfsck_namespace |
2968 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2970 error "(6) unexpected status"
# One dangling entry must be counted even though ls still fails afterwards.
2973 local repaired=$($SHOW_NAMESPACE |
2974 awk '/^dangling_repaired/ { print $2 }')
2975 [ $repaired -eq 1 ] ||
2976 error "(7) Fail to repair dangling name entry: $repaired"
2978 echo "'ls' should fail because not re-create MDT-object by default"
2979 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
2981 echo "Trigger namespace LFSCK again to repair dangling name entry"
# Second run adds -C; presumably this asks LFSCK to re-create the missing
# MDT-object (TODO confirm against lctl lfsck_start documentation).
2982 $START_NAMESPACE -A -r -C ||
2983 error "(9) Fail to start LFSCK for namespace"
2985 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2986 mdd.${MDT_DEV}.lfsck_namespace |
2987 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2989 error "(10) unexpected status"
2992 repaired=$($SHOW_NAMESPACE |
2993 awk '/^dangling_repaired/ { print $2 }')
2994 [ $repaired -eq 1 ] ||
2995 error "(11) Fail to repair dangling name entry: $repaired"
2997 echo "'ls' should success after namespace LFSCK repairing"
2998 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3000 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b: a hard link (foo) to f0 is created while fail_loc 0x1621
# (OBD_FAIL_LFSCK_DANGLING3) is set, so that the new name is dangling.
# LFSCK is started with -C; afterwards one dangling repair and one
# multiple-linked repair must be reported and foo must hold f0's data.
3004 echo "The objectA has multiple hard links, one of them corresponding"
3005 echo "to the name entry_B. But there is something wrong for the name"
3006 echo "entry_B and cause entry_B to references non-exist object_C."
3007 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3008 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3009 echo "comes to the second-stage scanning, it will find that the"
3010 echo "former re-creating object_C is not proper, and will try to"
3011 echo "replace the object_C with the real object_A."
3014 check_mount_and_prep
# f0 carries the payload "dummy" that is checked at the end; f1 is a second
# file that is removed again right after the injection.
3016 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3017 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3018 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3020 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3021 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
# The fail_loc is active only while the hard link is created.
3022 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
3023 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3024 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3026 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3028 echo "'ls' should fail because of dangling name entry"
3029 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3030 error "(6) ls should fail."
3032 echo "Trigger namespace LFSCK to find out dangling name entry"
3033 $START_NAMESPACE -r -C ||
3034 error "(7) Fail to start LFSCK for namespace"
# Wait until the namespace LFSCK status on the MDS reads "completed".
3036 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3037 mdd.${MDT_DEV}.lfsck_namespace |
3038 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3040 error "(8) unexpected status"
# Both counters are read from the same lfsck_namespace output.
3043 local repaired=$($SHOW_NAMESPACE |
3044 awk '/^dangling_repaired/ { print $2 }')
3045 [ $repaired -eq 1 ] ||
3046 error "(9) Fail to repair dangling name entry: $repaired"
3048 repaired=$($SHOW_NAMESPACE |
3049 awk '/^multiple_linked_repaired/ { print $2 }')
3050 [ $repaired -eq 1 ] ||
3051 error "(10) Fail to drop the former created object: $repaired"
# foo must again expose the content originally written to f0.
3053 local data=$(cat $DIR/$tdir/d0/foo)
3054 [ "$data" == "dummy" ] ||
3055 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3057 run_test 23b "LFSCK can repair dangling name entry (2)"
# Test 23c: same dangling hard-link setup as 23b, but the object re-created
# for the dangling name is modified by the client while LFSCK is held back
# with fail_loc 0x1602 (OBD_FAIL_LFSCK_DELAY3, fail_val=10). The test then
# expects one dangling repair and expects foo NOT to contain f0's data.
3061 echo "The objectA has multiple hard links, one of them corresponding"
3062 echo "to the name entry_B. But there is something wrong for the name"
3063 echo "entry_B and cause entry_B to references non-exist object_C."
3064 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3065 echo "as dangling, and re-create the lost object_C. And then others"
3066 echo "modified the re-created object_C. When the LFSCK comes to the"
3067 echo "second-stage scanning, it will find that the former re-creating"
3068 echo "object_C maybe wrong and try to replace the object_C with the"
3069 echo "real object_A. But because object_C has been modified, so the"
3070 echo "LFSCK cannot replace it."
3073 check_mount_and_prep
3075 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3076 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3077 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3079 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3080 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
# The fail_loc is active only while the hard link is created.
3081 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
3082 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3083 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3085 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3087 echo "'ls' should fail because of dangling name entry"
3088 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3089 error "(6) ls should fail."
3091 #define OBD_FAIL_LFSCK_DELAY3 0x1602
# Set before LFSCK starts; cleared again further down, after the client
# write, so LFSCK can run to completion.
3092 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3094 echo "Trigger namespace LFSCK to find out dangling name entry"
3095 $START_NAMESPACE -r -C ||
3096 error "(7) Fail to start LFSCK for namespace"
# Poll from the client until foo is visible with size 0.
3098 wait_update_facet client "stat $DIR/$tdir/d0/foo |
3099 awk '/Size/ { print \\\$2 }'" "0" 32 || {
# Diagnostic on failure: show the file that was being polled. (This test
# never creates a "guard" entry, so stat-ing it would only report ENOENT.)
3100 stat $DIR/$tdir/d0/foo
3102 error "(8) unexpected size"
# Modify the re-created object, then drop cached OSC locks.
3105 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
3106 cancel_lru_locks osc
3108 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3109 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3110 mdd.${MDT_DEV}.lfsck_namespace |
3111 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3113 error "(10) unexpected status"
3116 local repaired=$($SHOW_NAMESPACE |
3117 awk '/^dangling_repaired/ { print $2 }')
3118 [ $repaired -eq 1 ] ||
3119 error "(11) Fail to repair dangling name entry: $repaired"
# The modified object must not have been replaced by f0's object.
3121 local data=$(cat $DIR/$tdir/d0/foo)
3122 [ "$data" != "dummy" ] ||
3123 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
3125 run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24: namespace LFSCK repairs a multiple-referenced name entry.
# A directory dummy/foo is created and removed while fail_loc 0x1622
# (OBD_FAIL_LFSCK_MUL_REF) is set. After LFSCK, one
# multiple_referenced_repaired must be counted and an entry named
# "<cfid>-<pfid>-D-0" must exist under .lustre/lost+found/MDT0000/.
# Skipped with fewer than two MDSes.
3128 [ $MDSCOUNT -lt 2 ] &&
3129 skip "We need at least 2 MDSes for this test" && return
3132 echo "Two MDT-objects back reference the same name entry via their"
3133 echo "each own linkEA entry, but the name entry only references one"
3134 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
3135 echo "for the MDT-object that is not recognized. If such MDT-object"
3136 echo "has no other linkEA entry after the removing, then the LFSCK"
3137 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
3140 check_mount_and_prep
3142 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
# The FIDs of guard and dummy are printed for the test log only.
3144 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
3145 $LFS path2fid $DIR/$tdir/d0/guard
3147 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
3148 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid becomes the parent-FID part of the expected orphan name below; it is
# taken from guard or from dummy depending on the MDS backend type.
# NOTE(review): the branch keywords between the two assignments are not
# visible here - presumably guard for non-ldiskfs, dummy otherwise; confirm.
3151 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
3152 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
3154 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
3157 touch $DIR/$tdir/d0/guard/foo ||
3158 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
3160 echo "Inject failure stub on MDT0 to simulate the case that"
3161 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
3162 echo "that references $DIR/$tdir/d0/guard/foo."
3163 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
3164 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
3165 echo "there with the same linkEA entry as another MDT-object"
3166 echo "$DIR/$tdir/d0/guard/foo has"
3168 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
# The fail_loc stays set across both the mkdir and the rmdir of dummy/foo;
# cfid records the FID of dummy/foo before its name entry is removed.
3169 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
3170 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
3171 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
3172 $LFS path2fid $DIR/$tdir/d0/dummy/foo
3173 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
3174 rmdir $DIR/$tdir/d0/dummy/foo ||
3175 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
3176 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3178 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
3179 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
3180 error "(6) stat successfully unexpectedly"
3182 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
3183 $START_NAMESPACE -A -r ||
3184 error "(7) Fail to start LFSCK for namespace"
# Wait until the namespace LFSCK status on the MDS reads "completed".
3186 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3187 mdd.${MDT_DEV}.lfsck_namespace |
3188 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3190 error "(8) unexpected status"
3193 local repaired=$($SHOW_NAMESPACE |
3194 awk '/^multiple_referenced_repaired/ { print $2 }')
3195 [ $repaired -eq 1 ] ||
3196 error "(9) Fail to repair multiple referenced name entry: $repaired"
3198 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3199 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3200 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Expected orphan name is built from the child FID and the parent FID.
3202 local cname="$cfid-$pfid-D-0"
3203 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
3204 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
3206 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25: namespace LFSCK repairs a wrong file type stored in a name entry.
# A file is created while fail_loc 0x1623 (OBD_FAIL_LFSCK_BAD_TYPE) is set;
# after LFSCK one bad_file_type_repaired must be counted and listing the
# directory must succeed. Skipped unless the MDS backend is ldiskfs.
3209 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3210 skip "Only support to inject failure on ldiskfs" && return
3213 echo "The file type in the name entry does not match the file type"
3214 echo "claimed by the referenced object. Then the LFSCK will update"
3215 echo "the file type in the name entry."
3218 check_mount_and_prep
3220 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3222 echo "Inject failure stub on MDT0 to simulate the case that"
3223 echo "the file type stored in the name entry is wrong."
3225 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
# The fail_loc is active only while foo is created.
3226 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
3227 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
3228 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3230 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
3231 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Wait until the namespace LFSCK status on the MDS reads "completed".
3233 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3234 mdd.${MDT_DEV}.lfsck_namespace |
3235 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3237 error "(4) unexpected status"
3240 local repaired=$($SHOW_NAMESPACE |
3241 awk '/^bad_file_type_repaired/ { print $2 }')
3242 [ $repaired -eq 1 ] ||
3243 error "(5) Fail to repair bad file type in name entry: $repaired"
3245 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
3247 run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a: namespace LFSCK adds a lost *local* name entry back.
# foo gets a second hard link (dummy); foo's name is then unlinked while
# fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set. After LFSCK one
# lost_dirent_repaired must be counted, foo must be listable again and its
# FID must be unchanged.
3251 echo "The local name entry back referenced by the MDT-object is lost."
3252 echo "The namespace LFSCK will add the missing local name entry back"
3253 echo "to the normal namespace."
3256 check_mount_and_prep
3258 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3259 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# Remember foo's FID so it can be compared after the repair.
3260 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3262 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3263 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3265 echo "Inject failure stub on MDT0 to simulate the case that"
3266 echo "foo's name entry will be removed, but the foo's object"
3267 echo "and its linkEA are kept in the system."
3269 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is active only while foo is unlinked.
3270 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3271 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3272 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# 'error' was missing here: without it the message string itself was run as
# a command when ls unexpectedly succeeded, instead of failing the test.
3274 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 && error "(5) 'ls' should fail"
# This test covers the local name entry, so the message says "local".
3276 echo "Trigger namespace LFSCK to repair the missing local name entry"
3277 $START_NAMESPACE -r -A ||
3278 error "(6) Fail to start LFSCK for namespace"
# Wait until the namespace LFSCK status on the MDS reads "completed".
3280 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3281 mdd.${MDT_DEV}.lfsck_namespace |
3282 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3284 error "(7) unexpected status"
3287 local repaired=$($SHOW_NAMESPACE |
3288 awk '/^lost_dirent_repaired/ { print $2 }')
3289 [ $repaired -eq 1 ] ||
3290 error "(8) Fail to repair lost dirent: $repaired"
3292 ls -ail $DIR/$tdir/d0/foo ||
3293 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must refer to the very same object as before.
3295 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3296 [ "$foofid" == "$foofid2" ] ||
3297 error "(10) foo's FID changed: $foofid, $foofid2"
3299 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b: namespace LFSCK adds a lost *remote* name entry back.
# d0 is on MDT1 and its child directory foo on MDT0; foo's name is removed
# while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set. After LFSCK one
# lost_dirent_repaired must be counted, foo must be listable again and its
# FID must be unchanged. Skipped with fewer than two MDSes.
3302 [ $MDSCOUNT -lt 2 ] &&
3303 skip "We need at least 2 MDSes for this test" && return
3306 echo "The remote name entry back referenced by the MDT-object is lost."
3307 echo "The namespace LFSCK will add the missing remote name entry back"
3308 echo "to the normal namespace."
3311 check_mount_and_prep
3313 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3314 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Remember foo's FID so it can be compared after the repair.
3315 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3317 echo "Inject failure stub on MDT0 to simulate the case that"
3318 echo "foo's name entry will be removed, but the foo's object"
3319 echo "and its linkEA are kept in the system."
3321 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is active only while foo is removed.
3322 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3323 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3324 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# 'error' was missing here: without it the message string itself was run as
# a command when ls unexpectedly succeeded, instead of failing the test.
3326 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 && error "(4) 'ls' should fail"
3328 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3329 $START_NAMESPACE -r -A ||
3330 error "(5) Fail to start LFSCK for namespace"
# Wait until the namespace LFSCK status on the MDS reads "completed".
3332 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3333 mdd.${MDT_DEV}.lfsck_namespace |
3334 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3336 error "(6) unexpected status"
3339 local repaired=$($SHOW_NAMESPACE |
3340 awk '/^lost_dirent_repaired/ { print $2 }')
3341 [ $repaired -eq 1 ] ||
3342 error "(7) Fail to repair lost dirent: $repaired"
3344 ls -ail $DIR/$tdir/d0/foo ||
3345 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must refer to the very same object as before.
3347 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3348 [ "$foofid" == "$foofid2" ] ||
3349 error "(9) foo's FID changed: $foofid, $foofid2"
3351 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a: namespace LFSCK re-creates a lost *local* parent as an orphan.
# Both names of a file (foo, dummy) are unlinked while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set, then the parent d0 is removed.
# After LFSCK one lost_dirent_repaired must be counted and an entry matching
# "*-P-*" must exist under .lustre/lost+found/MDT0000/.
3355 echo "The local parent referenced by the MDT-object linkEA is lost."
3356 echo "The namespace LFSCK will re-create the lost parent as orphan."
3359 check_mount_and_prep
3361 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3362 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3363 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3364 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3366 echo "Inject failure stub on MDT0 to simulate the case that"
3367 echo "foo's name entry will be removed, but the foo's object"
3368 echo "and its linkEA are kept in the system. And then remove"
3369 echo "another hard link and the parent directory."
3371 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc covers the removal of both hard links.
3372 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3373 rm -f $DIR/$tdir/d0/foo ||
3374 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3375 rm -f $DIR/$tdir/d0/dummy ||
3376 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
3377 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3379 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
# 'error' was missing here: without it the message string itself was run as
# a command when ls unexpectedly succeeded, instead of failing the test.
3380 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
3382 echo "Trigger namespace LFSCK to repair the lost parent"
3383 $START_NAMESPACE -r -A ||
3384 error "(6) Fail to start LFSCK for namespace"
# Wait until the namespace LFSCK status on the MDS reads "completed".
3386 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3387 mdd.${MDT_DEV}.lfsck_namespace |
3388 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3390 error "(7) unexpected status"
3393 local repaired=$($SHOW_NAMESPACE |
3394 awk '/^lost_dirent_repaired/ { print $2 }')
3395 [ $repaired -eq 1 ] ||
3396 error "(8) Fail to repair lost dirent: $repaired"
3398 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3399 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3400 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3402 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so find receives it verbatim; unquoted, the shell
# could expand it against the current directory first. cname is made local
# so it does not leak into other tests.
3404 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
3405 [ ! -z "$cname" ] ||
3406 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
3408 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b: namespace LFSCK re-creates a lost *remote* parent as an orphan.
# d0 is on MDT1 and its child directory foo on MDT0; foo's name is removed
# while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set, then d0 is
# removed. After LFSCK one lost_dirent_repaired must be counted and an entry
# matching "*-P-*" must exist under .lustre/lost+found/MDT0001/.
# Skipped with fewer than two MDSes.
3411 [ $MDSCOUNT -lt 2 ] &&
3412 skip "We need at least 2 MDSes for this test" && return
3415 echo "The remote parent referenced by the MDT-object linkEA is lost."
3416 echo "The namespace LFSCK will re-create the lost parent as orphan."
3419 check_mount_and_prep
3421 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3422 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# The FID of d0 is printed for the test log only.
3424 $LFS path2fid $DIR/$tdir/d0
3426 echo "Inject failure stub on MDT0 to simulate the case that"
3427 echo "foo's name entry will be removed, but the foo's object"
3428 echo "and its linkEA are kept in the system. And then remove"
3429 echo "the parent directory."
3431 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is active only while foo is removed.
3432 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3433 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3434 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3436 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
# 'error' was missing here: without it the message string itself was run as
# a command when ls unexpectedly succeeded, instead of failing the test.
3437 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
3439 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3440 $START_NAMESPACE -r -A ||
3441 error "(6) Fail to start LFSCK for namespace"
# Wait until the namespace LFSCK status on the MDS reads "completed".
3443 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3444 mdd.${MDT_DEV}.lfsck_namespace |
3445 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3447 error "(7) unexpected status"
3450 local repaired=$($SHOW_NAMESPACE |
3451 awk '/^lost_dirent_repaired/ { print $2 }')
3452 [ $repaired -eq 1 ] ||
3453 error "(8) Fail to repair lost dirent: $repaired"
3455 ls -ail $MOUNT/.lustre/lost+found/
3457 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
3458 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
3459 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
3461 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The pattern is quoted so find receives it verbatim; unquoted, the shell
# could expand it against the current directory first. cname is made local
# so it does not leak into other tests.
3463 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
3464 [ ! -z "$cname" ] ||
3465 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
3467 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28: orphan handling must skip an MDT that failed during first-stage
# scanning. One name entry is lost on each of two MDTs, then fail_loc 0x161c
# (OBD_FAIL_LFSCK_BAD_NETWORK) is set on mds2 while LFSCK runs. The first run
# must end "partial" on mds1 (0 repaired) and "completed" on mds2
# (1 repaired); a second, undisturbed run must end "completed" everywhere
# with 1 repaired on mds1 and 0 on mds2. Skipped with fewer than two MDTs.
3470 [ $MDSCOUNT -lt 2 ] &&
3471 skip "The test needs at least 2 MDTs" && return
3474 echo "The target name entry is lost. The LFSCK should insert the"
3475 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
3476 echo "the MDT (on which the orphan MDT-object resides) has ever"
3477 echo "failed to respond some name entry verification during the"
3478 echo "first stage-scanning, then the LFSCK should skip to handle"
3479 echo "orphan MDT-object on this MDT. But other MDTs should not"
3483 check_mount_and_prep
# Layout: d1 on MDT0 with children on MDT1, d2 on MDT1 with children on MDT0.
# The return codes of these mkdir calls are not checked.
3484 $LFS mkdir -i 0 $DIR/$tdir/d1
3485 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
3486 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
3488 $LFS mkdir -i 1 $DIR/$tdir/d2
3489 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
3490 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
3492 echo "Inject failure stub on MDT0 to simulate the case that"
3493 echo "d1/a1's name entry will be removed, but the d1/a1's object"
3494 echo "and its linkEA are kept in the system. And the case that"
3495 echo "d2/a2's name entry will be removed, but the d2/a2's object"
3496 echo "and its linkEA are kept in the system."
3498 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is set on both MDSes for the two rmdir calls, then cleared.
3499 do_facet mds1 $LCTL set_param fail_loc=0x1624
3500 do_facet mds2 $LCTL set_param fail_loc=0x1624
3501 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
3502 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
3503 do_facet mds1 $LCTL set_param fail_loc=0
3504 do_facet mds2 $LCTL set_param fail_loc=0
# Drop cached client locks for both MDC and OSC before starting LFSCK.
3506 cancel_lru_locks mdc
3507 cancel_lru_locks osc
3509 echo "Inject failure, to simulate the MDT0 fail to handle"
3510 echo "MDT1 LFSCK request during the first-stage scanning."
3511 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
# This fail_loc stays set on mds2 for the whole first LFSCK run.
3512 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
3514 echo "Trigger namespace LFSCK on all devices to find out orphan object"
3515 $START_NAMESPACE -r -A ||
3516 error "(3) Fail to start LFSCK for namespace"
# First run: mds1 is expected to end up "partial", mds2 "completed".
3518 wait_update_facet mds1 "$LCTL get_param -n \
3519 mdd.$(facet_svc mds1).lfsck_namespace |
3520 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
3521 error "(4) mds1 is not the expected 'partial'"
3524 wait_update_facet mds2 "$LCTL get_param -n \
3525 mdd.$(facet_svc mds2).lfsck_namespace |
3526 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3527 error "(5) mds2 is not the expected 'completed'"
3530 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# After the first run nothing may be repaired on mds1, one entry on mds2.
3532 local repaired=$(do_facet mds1 $LCTL get_param -n \
3533 mdd.$(facet_svc mds1).lfsck_namespace |
3534 awk '/^lost_dirent_repaired/ { print $2 }')
3535 [ $repaired -eq 0 ] ||
3536 error "(6) Expect 0 fixed on mds1, but got: $repaired"
3538 repaired=$(do_facet mds2 $LCTL get_param -n \
3539 mdd.$(facet_svc mds2).lfsck_namespace |
3540 awk '/^lost_dirent_repaired/ { print $2 }')
3541 [ $repaired -eq 1 ] ||
3542 error "(7) Expect 1 fixed on mds2, but got: $repaired"
3544 echo "Trigger namespace LFSCK on all devices again to cleanup"
3545 $START_NAMESPACE -r -A ||
3546 error "(8) Fail to start LFSCK for namespace"
# Second run: every MDS (mds1..mds$MDSCOUNT) must reach "completed".
3548 for k in $(seq $MDSCOUNT); do
3549 # The LFSCK status query interval is 30 seconds. For the case
3550 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3551 # time to guarantee the status sync up.
3552 wait_update_facet mds${k} "$LCTL get_param -n \
3553 mdd.$(facet_svc mds${k}).lfsck_namespace |
3554 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3555 error "(9) MDS${k} is not the expected 'completed'"
# After the second run the counters are reversed: 1 on mds1, 0 on mds2.
3558 local repaired=$(do_facet mds1 $LCTL get_param -n \
3559 mdd.$(facet_svc mds1).lfsck_namespace |
3560 awk '/^lost_dirent_repaired/ { print $2 }')
3561 [ $repaired -eq 1 ] ||
3562 error "(10) Expect 1 fixed on mds1, but got: $repaired"
3564 repaired=$(do_facet mds2 $LCTL get_param -n \
3565 mdd.$(facet_svc mds2).lfsck_namespace |
3566 awk '/^lost_dirent_repaired/ { print $2 }')
3567 [ $repaired -eq 0 ] ||
3568 error "(11) Expect 0 fixed on mds2, but got: $repaired"
3570 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a: namespace LFSCK repairs an nlink count that is too large.
# A hard link is created while fail_loc 0x1625 (OBD_FAIL_LFSCK_MORE_NLINK)
# is set; the test requires stat to report 3 links before LFSCK and 2 links
# afterwards, with nlinks_repaired equal to 1.
3574 echo "The object's nlink attribute is larger than the object's known"
3575 echo "name entries count. The LFSCK will repair the object's nlink"
3576 echo "attribute to match the known name entries count"
3579 check_mount_and_prep
3581 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3582 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3584 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3585 echo "nlink attribute is larger than its name entries count."
3587 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
# The fail_loc is active only while the hard link h1 is created.
3588 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
3589 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3590 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3591 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop cached MDC locks before stat, then verify the injection took effect:
# two names exist (foo, h1) but three links must be reported.
3593 cancel_lru_locks mdc
3594 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3595 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
3597 echo "Trigger namespace LFSCK to repair the nlink count"
3598 $START_NAMESPACE -r -A ||
3599 error "(5) Fail to start LFSCK for namespace"
# Wait until the namespace LFSCK status on the MDS reads "completed".
3601 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3602 mdd.${MDT_DEV}.lfsck_namespace |
3603 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3605 error "(6) unexpected status"
3608 local repaired=$($SHOW_NAMESPACE |
3609 awk '/^nlinks_repaired/ { print $2 }')
3610 [ $repaired -eq 1 ] ||
3611 error "(7) Fail to repair nlink count: $repaired"
# Re-read the link count after dropping cached locks; it must now be 2.
3613 cancel_lru_locks mdc
3614 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3615 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3617 run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b: namespace LFSCK repairs an nlink count that is too small.
# A hard link is created while fail_loc 0x1626 (OBD_FAIL_LFSCK_LESS_NLINK)
# is set; the test requires stat to report 1 link before LFSCK and 2 links
# afterwards, with nlinks_repaired equal to 1.
3621 echo "The object's nlink attribute is smaller than the object's known"
3622 echo "name entries count. The LFSCK will repair the object's nlink"
3623 echo "attribute to match the known name entries count"
3626 check_mount_and_prep
3628 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3629 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3631 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3632 echo "nlink attribute is smaller than its name entries count."
3634 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
# The fail_loc is active only while the hard link h1 is created.
3635 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
3636 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3637 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3638 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop cached MDC locks before stat, then verify the injection took effect:
# two names exist (foo, h1) but only one link must be reported.
3640 cancel_lru_locks mdc
3641 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3642 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
3644 echo "Trigger namespace LFSCK to repair the nlink count"
3645 $START_NAMESPACE -r -A ||
3646 error "(5) Fail to start LFSCK for namespace"
# Wait until the namespace LFSCK status on the MDS reads "completed".
3648 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3649 mdd.${MDT_DEV}.lfsck_namespace |
3650 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3652 error "(6) unexpected status"
3655 local repaired=$($SHOW_NAMESPACE |
3656 awk '/^nlinks_repaired/ { print $2 }')
3657 [ $repaired -eq 1 ] ||
3658 error "(7) Fail to repair nlink count: $repaired"
# Re-read the link count after dropping cached locks; it must now be 2.
3660 cancel_lru_locks mdc
3661 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3662 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3664 run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c: LFSCK must leave nlink alone when the linkEA has overflowed.
# foo has three names (foo, h1, h2); h2 is created while fail_loc 0x1627
# (OBD_FAIL_LFSCK_LINKEA_OVERFLOW) is set, so that fid2path lists only two
# paths although stat reports three links. After LFSCK nlinks_repaired must
# be 0 and both numbers must be unchanged.
3668 echo "There are too many hard links to the object, and exceeds the"
3669 echo "object's linkEA limitation, as to NOT all the known name entries"
3670 echo "will be recorded in the linkEA. Under such case, LFSCK should"
3671 echo "skip the nlink verification for this object."
3674 check_mount_and_prep
3676 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3677 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3678 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3679 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3681 echo "Inject failure stub on MDT0 to simulate the case that"
3682 echo "foo's hard links exceed the object's linkEA limitation."
3684 #define OBD_FAIL_LFSCK_LINKEA_OVERFLOW 0x1627
# Unlike the sibling tests, this fail_loc stays set while LFSCK runs and is
# only cleared after the status wait below.
3685 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1627
3686 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h2 ||
3687 error "(4) Fail to hard link to $DIR/$tdir/d0/foo"
3689 cancel_lru_locks mdc
# count1 is the link count reported by stat, count2 the number of paths
# that fid2path can resolve for the same FID.
3691 local count1=$(stat --format=%h $DIR/$tdir/d0/foo)
3692 [ $count1 -eq 3 ] || error "(5) Stat failure: $count1"
3694 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3695 $LFS fid2path $DIR $foofid
3696 local count2=$($LFS fid2path $DIR $foofid | wc -l)
# 'error' was missing here: without it the message string itself was run as
# a command and a failed injection did not fail the test properly.
3697 [ $count2 -eq 2 ] || error "(6) Fail to inject error: $count2"
3699 echo "Trigger namespace LFSCK to repair the nlink count"
3700 $START_NAMESPACE -r -A ||
3701 error "(7) Fail to start LFSCK for namespace"
# Wait until the namespace LFSCK status on the MDS reads "completed".
3703 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3704 mdd.${MDT_DEV}.lfsck_namespace |
3705 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3707 error "(8) unexpected status"
3710 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3711 local repaired=$($SHOW_NAMESPACE |
3712 awk '/^nlinks_repaired/ { print $2 }')
3713 [ $repaired -eq 0 ] ||
3714 error "(9) Repair nlink count unexpectedly: $repaired"
3716 cancel_lru_locks mdc
# Neither the link count nor the resolvable paths may have changed.
3718 count1=$(stat --format=%h $DIR/$tdir/d0/foo)
3719 [ $count1 -eq 3 ] || error "(10) Stat failure: $count1"
3721 count2=$($LFS fid2path $DIR $foofid | wc -l)
3722 [ $count2 -eq 2 ] ||
3723 error "(11) Repaired something unexpectedly: $count2"
3725 run_test 29c "Not verify nlink attr if hard links exceed linkEA limitation"
3728 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3729 skip "Only support backend /lost+found for ldiskfs" && return
3732 echo "The namespace LFSCK will move the orphans from backend"
3733 echo "/lost+found directory to normal client visible namespace"
3734 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
3737 check_mount_and_prep
3739 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
3740 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f1"
3742 echo "Inject failure stub on MDT0 to simulate the case that"
3743 echo "directory d0 has no linkEA entry, then the LFSCK will"
3744 echo "move it into .lustre/lost+found/MDTxxxx/ later."
3746 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
3747 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
3748 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
3749 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3751 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
3752 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
3754 echo "Inject failure stub on MDT0 to simulate the case that the"
3755 echo "object's name entry will be removed, but not destroy the"
3756 echo "object. Then backend e2fsck will handle it as orphan and"
3757 echo "add them into the backend /lost+found directory."
3759 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3760 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3761 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
3762 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
3763 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
3764 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
3765 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3767 umount_client $MOUNT || error "(10) Fail to stop client!"
3769 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
3772 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
3773 error "(12) Fail to run e2fsck"
3775 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
3776 error "(13) Fail to start MDT0"
3778 echo "Trigger namespace LFSCK to recover backend orphans"
3779 $START_NAMESPACE -r -A ||
3780 error "(14) Fail to start LFSCK for namespace"
3782 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3783 mdd.${MDT_DEV}.lfsck_namespace |
3784 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3786 error "(15) unexpected status"
3789 local repaired=$($SHOW_NAMESPACE |
3790 awk '/^local_lost_found_moved/ { print $2 }')
3791 [ $repaired -ge 4 ] ||
3792 error "(16) Fail to recover backend orphans: $repaired"
3794 mount_client $MOUNT || error "(17) Fail to start client!"
3796 stat $DIR/$tdir/foo/f0 || "(18) f0 is not recovered"
3798 ls -ail $MOUNT/.lustre/lost+found/
3800 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
3801 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3802 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3804 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# Locate the recovered directory under .lustre/lost+found/MDT0000/.
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell would expand it first if anything in the current directory matched.
# cname is kept local so it does not leak into later tests.
local cname
cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-*-D-*")
3807 [ ! -z "$cname" ] || error "(20) d0 is not recovered"
3809 stat ${cname}/d1 || error "(21) d0 is not recovered"
3810 stat ${cname}/f1 || error "(22) f1 is not recovered"
3812 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test_31a: namespace LFSCK repairs name entries with a bad name hash (1).
#
# Creates a striped directory, sets fail_loc 0x1628
# (OBD_FAIL_LFSCK_BAD_NAME_HASH) with fail_val=0 on the client while
# creating $MDSCOUNT sub-directories, runs the namespace LFSCK and checks
# the name_hash_repaired counter on $SINGLEMDS. Afterwards every
# sub-directory must be stat-able and removable from a fresh client mount.
#
# Skipped when fewer than 2 MDTs are configured.
#
# NOTE(review): the body uses `local` and `return`, so it has to live in a
# function; the name test_31a is assumed from the "run_test 31a" line below
# -- confirm against the test framework.
test_31a() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-first"
	echo "shard, but inserted into the first shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=0
	createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	# Clear the injection before LFSCK runs.
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	# Poll the status line until it reads "completed" (limit argument: 32).
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state first so the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(4) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^name_hash_repaired/ { print $2 }')
	[ "$repaired" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	# Remount the client before verifying the repaired entries.
	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	local i
	for ((i = 0; i < $MDSCOUNT; i++)); do
		stat $DIR/$tdir/striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $DIR/$tdir/striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $DIR/$tdir/striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test_31b: namespace LFSCK repairs name entries with a bad name hash (2).
#
# Same flow as the "(1)" variant, but fail_loc 0x1628
# (OBD_FAIL_LFSCK_BAD_NAME_HASH) is set with fail_val=1 and the LFSCK
# status and the name_hash_repaired counter are read from facet mds2.
#
# Skipped when fewer than 2 MDTs are configured.
#
# NOTE(review): the body uses `local` and `return`, so it has to live in a
# function; the name test_31b is assumed from the "run_test 31b" line below
# -- confirm against the test framework.
test_31b() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-second"
	echo "shard, but inserted into the secod shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=1
	createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	# Clear the injection before LFSCK runs.
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	# The repair is expected on mds2, so poll its status line.
	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(4) unexpected status"

	local repaired=$(do_facet mds2 $LCTL get_param -n \
			 mdd.$(facet_svc mds2).lfsck_namespace |
			 awk '/^name_hash_repaired/ { print $2 }')
	[ "$repaired" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	# Remount the client before verifying the repaired entries.
	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	local i
	for ((i = 0; i < $MDSCOUNT; i++)); do
		stat $DIR/$tdir/striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $DIR/$tdir/striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $DIR/$tdir/striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test_31c: namespace LFSCK re-generates a lost master LMV EA.
#
# Creates a striped directory while fail_loc 0x1629
# (OBD_FAIL_LFSCK_LOST_MASTER_LMV) is set on $SINGLEMDS, runs the
# namespace LFSCK and expects striped_dirs_repaired to be exactly 1.
# After a client remount the directory must list as empty and be
# removable.
#
# Skipped when fewer than 2 MDTs are configured.
#
# NOTE(review): the body uses `local` and `return`, so it has to live in a
# function; the name test_31c is assumed from the "run_test 31c" line below
# -- confirm against the test framework.
test_31c() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If nobody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should re-generate the master LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	# Clear the injection before LFSCK runs.
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to re-generate master LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	# Poll the status line until it reads "completed" (limit argument: 32).
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state first so the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^striped_dirs_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate master LMV EA: $repaired"

	# Remount the client before looking at the repaired directory.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	# Nothing was created inside, so the listing must be empty.
	local empty=$(ls $DIR/$tdir/striped_dir/)
	[ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"

	rmdir $DIR/$tdir/striped_dir ||
		error "(8) Fail to remove the striped directory after LFSCK"
}
run_test 31c "Re-generate the lost master LMV EA for striped directory"
# test_31d: a broken striped directory that was modified after losing its
# master LMV EA must NOT be repaired, but be made read-only instead.
#
# Creates a striped directory while fail_loc 0x1629
# (OBD_FAIL_LFSCK_LOST_MASTER_LMV) is set on $SINGLEMDS, remounts the
# client and creates a file in it, then runs the namespace LFSCK. The
# striped_dirs_repaired counter must stay 0, the existing file must remain
# stat-able, and creating another file must fail until `chattr -i` is
# applied to the directory.
#
# Skipped when fewer than 2 MDTs are configured.
#
# NOTE(review): the body uses `local` and `return`, so it has to live in a
# function; the name test_31d is assumed from the "run_test 31d" line below
# -- confirm against the test framework.
test_31d() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If somebody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should NOT re-generate the master LMV EA, instead, it should"
	echo "change the broken striped dirctory as read-only to prevent"
	echo "further damage"

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	# Clear the injection before going on.
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0

	umount_client $MOUNT || error "(2) umount failed"
	mount_client $MOUNT || error "(3) mount failed"

	# Modify the directory after it was broken.
	touch $DIR/$tdir/striped_dir/dummy ||
		error "(4) Fail to touch under broken striped directory"

	echo "Trigger namespace LFSCK to find out the inconsistency"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	# Poll the status line until it reads "completed" (limit argument: 32).
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state first so the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(6) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^striped_dirs_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(7) Re-generate master LMV EA unexpected: $repaired"

	stat $DIR/$tdir/striped_dir/dummy ||
		error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"

	# Creating a new entry is expected to fail here.
	touch $DIR/$tdir/striped_dir/foo &&
		error "(9) The broken striped directory should be read-only"

	chattr -i $DIR/$tdir/striped_dir ||
		error "(10) Fail to chattr on the broken striped directory"

	rmdir $DIR/$tdir/striped_dir ||
		error "(11) Fail to remove the striped directory after LFSCK"
}
run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test_31e: namespace LFSCK re-generates a lost slave LMV EA (1).
#
# Creates a striped directory while fail_loc 0x162a
# (OBD_FAIL_LFSCK_LOST_SLAVE_LMV) with fail_val=0 is set on $SINGLEMDS,
# runs the namespace LFSCK and expects striped_shards_repaired on
# $SINGLEMDS to be exactly 1. The directory must be removable afterwards.
#
# Skipped when fewer than 2 MDTs are configured.
#
# NOTE(review): the body uses `local` and `return`, so it has to live in a
# function; the name test_31e is assumed from the "run_test 31e" line below
# -- confirm against the test framework.
test_31e() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on the same MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	# Clear the injection before LFSCK runs.
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	# Poll the status line until it reads "completed" (limit argument: 32).
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state first so the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir $DIR/$tdir/striped_dir ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test_31f: namespace LFSCK re-generates a lost slave LMV EA (2).
#
# Same flow as the "(1)" variant, but fail_loc 0x162a
# (OBD_FAIL_LFSCK_LOST_SLAVE_LMV) is set with fail_val=1 and the LFSCK
# status and the striped_shards_repaired counter are read from facet mds2.
#
# Skipped when fewer than 2 MDTs are configured.
#
# NOTE(review): the body uses `local` and `return`, so it has to live in a
# function; the name test_31f is assumed from the "run_test 31f" line below
# -- confirm against the test framework.
test_31f() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on differnt MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	# Clear the injection before LFSCK runs.
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	# The repair is expected on mds2, so poll its status line.
	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(3) unexpected status"

	local repaired=$(do_facet mds2 $LCTL get_param -n \
			 mdd.$(facet_svc mds2).lfsck_namespace |
			 awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir $DIR/$tdir/striped_dir ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test_31g: namespace LFSCK repairs a corrupted slave LMV EA.
#
# Creates a striped directory while fail_loc 0x162b
# (OBD_FAIL_LFSCK_BAD_SLAVE_LMV) with fail_val=0 is set on $SINGLEMDS,
# runs the namespace LFSCK and expects striped_shards_repaired to be
# exactly 1. After a client remount, a file must be creatable and
# removable inside the directory, and the directory itself removable.
#
# Skipped when fewer than 2 MDTs are configured.
#
# NOTE(review): the body uses `local` and `return`, so it has to live in a
# function; the name test_31g is assumed from the "run_test 31g" line below
# -- confirm against the test framework.
test_31g() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the stripe index in the slave LMV EA is"
	echo "corrupted. The LFSCK should repair the slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave LMV EA on the first shard of the striped directory"
	echo "claims the same index as the second shard claims"

	#define OBD_FAIL_LFSCK_BAD_SLAVE_LMV	0x162b
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	# Clear the injection before LFSCK runs.
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	# Poll the status line until it reads "completed" (limit argument: 32).
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state first so the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair slave LMV EA: $repaired"

	# Remount the client before using the repaired directory.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch $DIR/$tdir/striped_dir/foo ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f $DIR/$tdir/striped_dir/foo ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir $DIR/$tdir/striped_dir ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31g "Repair the corrupted slave LMV EA"
# test_31h: namespace LFSCK repairs a corrupted shard name entry.
#
# Creates a striped directory while fail_loc 0x162c
# (OBD_FAIL_LFSCK_BAD_SLAVE_NAME) with fail_val=0 is set on $SINGLEMDS,
# runs the namespace LFSCK and expects dirent_repaired to be exactly 1.
# After a client remount, a file must be creatable and removable inside
# the directory, and the directory itself removable.
#
# Skipped when fewer than 2 MDTs are configured.
#
# NOTE(review): the body uses `local` and `return`, so it has to live in a
# function; the name test_31h is assumed from the "run_test 31h" line below
# -- confirm against the test framework.
test_31h() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the shard's name entry in the striped"
	echo "directory may be corrupted. The LFSCK should repair the"
	echo "bad shard's name entry."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "first shard's name entry in the striped directory claims"
	echo "the same index as the second shard's name entry claims."

	#define OBD_FAIL_LFSCK_BAD_SLAVE_NAME	0x162c
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	# Clear the injection before LFSCK runs.
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the shard's name entry"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	# Poll the status line until it reads "completed" (limit argument: 32).
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state first so the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair shard's name entry: $repaired"

	# Remount the client before using the repaired directory.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch $DIR/$tdir/striped_dir/foo ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f $DIR/$tdir/striped_dir/foo ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir $DIR/$tdir/striped_dir ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31h "Repair the corrupted shard's name entry"
# Put back the MDS/OST sizing and OST count that were saved in the
# SAVED_* variables near the top of this script.
MDSSIZE=$SAVED_MDSSIZE
OSTSIZE=$SAVED_OSTSIZE
OSTCOUNT=$SAVED_OSTCOUNT
4227 # cleanup the system at last