3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
10 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
11 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
12 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
14 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
15 . $LUSTRE/tests/test-framework.sh
17 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
20 require_dsh_mds || exit 0
24 SAVED_MDSSIZE=${MDSSIZE}
25 SAVED_OSTSIZE=${OSTSIZE}
26 SAVED_OSTCOUNT=${OSTCOUNT}
27 # Use small MDS and OST sizes to speed up formatting.
28 # Do not make MDSSIZE/OSTSIZE too small: they affect the default journal size.
31 # Cap OSTCOUNT at 4; more OSTs are not needed and only add format/start/stop overhead.
32 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
34 # build up a clean test environment.
38 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
39 skip "Need MDS version at least 2.3.60" && check_and_cleanup_lustre &&
42 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
43 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
45 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
46 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
48 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
49 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
53 $LCTL set_param debug=+lfsck > /dev/null || true
55 MDT_DEV="${FSNAME}-MDT0000"
56 OST_DEV="${FSNAME}-OST0000"
57 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
58 START_NAMESPACE="do_facet $SINGLEMDS \
59 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
60 START_LAYOUT="do_facet $SINGLEMDS \
61 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
62 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
63 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
64 SHOW_NAMESPACE="do_facet $SINGLEMDS \
65 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
66 SHOW_LAYOUT="do_facet $SINGLEMDS \
67 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
68 SHOW_LAYOUT_ON_OST="do_facet ost1 \
69 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
70 MOUNT_OPTS_SCRUB="-o user_xattr"
71 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
80 echo "preparing... $nfiles * $ndirs files will be created $(date)."
81 if [ ! -z $igif ]; then
82 #define OBD_FAIL_FID_IGIF 0x1504
83 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
86 cp $LUSTRE/tests/*.sh $DIR/$tdir/
87 if [ $ndirs -gt 0 ]; then
88 createmany -d $DIR/$tdir/d $ndirs
89 createmany -m $DIR/$tdir/f $ndirs
90 if [ $nfiles -gt 0 ]; then
91 for ((i = 0; i < $ndirs; i++)); do
92 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
93 /dev/null || error "createmany $nfiles"
96 createmany -d $DIR/$tdir/e $ndirs
99 if [ ! -z $igif ]; then
100 touch $DIR/$tdir/dummy
101 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
104 echo "prepared $(date)."
110 #define OBD_FAIL_LFSCK_DELAY1 0x1600
111 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
112 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
114 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
116 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
117 [ "$STATUS" == "scanning-phase1" ] ||
118 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
120 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
122 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
123 [ "$STATUS" == "stopped" ] ||
124 error "(6) Expect 'stopped', but got '$STATUS'"
126 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
128 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
129 [ "$STATUS" == "scanning-phase1" ] ||
130 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
132 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
133 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
134 mdd.${MDT_DEV}.lfsck_namespace |
135 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
137 error "(9) unexpected status"
140 local repaired=$($SHOW_NAMESPACE |
141 awk '/^updated_phase1/ { print $2 }')
142 [ $repaired -eq 0 ] ||
143 error "(10) Expect nothing to be repaired, but got: $repaired"
145 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
146 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
147 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
148 mdd.${MDT_DEV}.lfsck_namespace |
149 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
151 error "(12) unexpected status"
154 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
155 [ $((scanned1 + 1)) -eq $scanned2 ] ||
156 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
158 echo "stopall, should NOT crash LU-3649"
159 stopall || error "(14) Fail to stopall"
161 run_test 0 "Control LFSCK manually"
164 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
165 skip "OI Scrub not implemented for ZFS" && return
169 #define OBD_FAIL_FID_INDIR 0x1501
170 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
171 touch $DIR/$tdir/dummy
173 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
175 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
176 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
177 mdd.${MDT_DEV}.lfsck_namespace |
178 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
180 error "(4) unexpected status"
183 local repaired=$($SHOW_NAMESPACE |
184 awk '/^dirent_repaired/ { print $2 }')
185 # For interop with old servers: fall back to updated_phase1 when no dirent_repaired value was read
186 [ -z "$repaired" ] &&
187 repaired=$($SHOW_NAMESPACE |
188 awk '/^updated_phase1/ { print $2 }')
190 [ $repaired -eq 1 ] ||
191 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
193 mount_client $MOUNT || error "(6) Fail to start client!"
195 #define OBD_FAIL_FID_LOOKUP 0x1505
196 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
197 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
199 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
201 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
205 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
206 skip "OI Scrub not implemented for ZFS" && return
210 #define OBD_FAIL_FID_INLMA 0x1502
211 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
212 touch $DIR/$tdir/dummy
214 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
216 #define OBD_FAIL_FID_NOLMA 0x1506
217 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
218 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
219 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
220 mdd.${MDT_DEV}.lfsck_namespace |
221 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
223 error "(4) unexpected status"
226 local repaired=$($SHOW_NAMESPACE |
227 awk '/^dirent_repaired/ { print $2 }')
228 # For interop with old servers: fall back to updated_phase1 when no dirent_repaired value was read
229 [ -z "$repaired" ] &&
230 repaired=$($SHOW_NAMESPACE |
231 awk '/^updated_phase1/ { print $2 }')
233 [ $repaired -eq 1 ] ||
234 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
236 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
237 mount_client $MOUNT || error "(6) Fail to start client!"
239 #define OBD_FAIL_FID_LOOKUP 0x1505
240 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
241 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
243 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
245 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
250 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
251 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
252 touch $DIR/$tdir/dummy
254 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
256 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
257 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
258 mdd.${MDT_DEV}.lfsck_namespace |
259 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
261 error "(4) unexpected status"
264 local repaired=$($SHOW_NAMESPACE |
265 awk '/^linkea_repaired/ { print $2 }')
266 # For interop with old servers: fall back to updated_phase2 when no linkea_repaired value was read
267 [ -z "$repaired" ] &&
268 repaired=$($SHOW_NAMESPACE |
269 awk '/^updated_phase2/ { print $2 }')
271 [ $repaired -eq 1 ] ||
272 error "(5) Fail to repair crashed linkEA: $repaired"
274 mount_client $MOUNT || error "(6) Fail to start client!"
276 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
277 error "(7) Fail to stat $DIR/$tdir/dummy"
279 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
280 local dummyname=$($LFS fid2path $DIR $dummyfid)
281 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
282 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
284 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
290 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
291 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
292 touch $DIR/$tdir/dummy
294 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
296 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
297 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
298 mdd.${MDT_DEV}.lfsck_namespace |
299 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
301 error "(4) unexpected status"
304 local repaired=$($SHOW_NAMESPACE |
305 awk '/^updated_phase2/ { print $2 }')
306 [ $repaired -eq 1 ] ||
307 error "(5) Fail to repair crashed linkEA: $repaired"
309 mount_client $MOUNT || error "(6) Fail to start client!"
311 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
312 error "(7) Fail to stat $DIR/$tdir/dummy"
314 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
315 local dummyname=$($LFS fid2path $DIR $dummyfid)
316 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
317 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
319 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
325 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
326 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
327 touch $DIR/$tdir/dummy
329 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
331 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
332 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
333 mdd.${MDT_DEV}.lfsck_namespace |
334 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
336 error "(4) unexpected status"
339 local repaired=$($SHOW_NAMESPACE |
340 awk '/^updated_phase2/ { print $2 }')
341 [ $repaired -eq 1 ] ||
342 error "(5) Fail to repair crashed linkEA: $repaired"
344 mount_client $MOUNT || error "(6) Fail to start client!"
346 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
347 error "(7) Fail to stat $DIR/$tdir/dummy"
349 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
350 local dummyname=$($LFS fid2path $DIR $dummyfid)
351 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
352 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
354 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
360 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
361 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
362 touch $DIR/$tdir/dummy
364 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
366 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
367 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
368 mdd.${MDT_DEV}.lfsck_namespace |
369 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
371 error "(4) unexpected status"
374 local repaired=$($SHOW_NAMESPACE |
375 awk '/^linkea_repaired/ { print $2 }')
376 [ $repaired -eq 1 ] ||
377 error "(5) Fail to repair crashed linkEA: $repaired"
379 mount_client $MOUNT || error "(6) Fail to start client!"
381 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
382 error "(7) Fail to stat $DIR/$tdir/dummy"
384 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
385 local dummyname=$($LFS fid2path $DIR $dummyfid)
386 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
387 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
389 run_test 2d "LFSCK can recover the missing linkEA entry"
393 [ $MDSCOUNT -lt 2 ] &&
394 skip "We need at least 2 MDSes for this test" && return
398 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
400 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
401 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
402 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
403 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
405 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
406 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
407 mdd.${MDT_DEV}.lfsck_namespace |
408 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
410 error "(4) unexpected status"
413 local repaired=$($SHOW_NAMESPACE |
414 awk '/^linkea_repaired/ { print $2 }')
415 [ $repaired -eq 1 ] ||
416 error "(5) Fail to repair crashed linkEA: $repaired"
418 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
419 local name=$($LFS fid2path $DIR $fid)
420 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
421 error "(6) Fail to repair linkEA: $fid $name"
423 run_test 2e "namespace LFSCK can verify remote object linkEA"
429 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
430 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
431 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
433 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
434 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
435 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
437 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
438 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
439 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
441 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
442 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
443 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
445 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
447 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
448 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
449 mdd.${MDT_DEV}.lfsck_namespace |
450 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
452 error "(10) unexpected status"
455 local checked=$($SHOW_NAMESPACE |
456 awk '/^checked_phase2/ { print $2 }')
457 [ $checked -ge 4 ] ||
458 error "(11) Fail to check multiple-linked object: $checked"
460 local repaired=$($SHOW_NAMESPACE |
461 awk '/^multiple_linked_repaired/ { print $2 }')
462 [ $repaired -ge 2 ] ||
463 error "(12) Fail to repair multiple-linked object: $repaired"
465 run_test 3 "LFSCK can verify multiple-linked objects"
469 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
470 skip "OI Scrub not implemented for ZFS" && return
473 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
474 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
476 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
477 echo "start $SINGLEMDS with disabling OI scrub"
478 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
479 error "(2) Fail to start MDS!"
481 #define OBD_FAIL_LFSCK_DELAY2 0x1601
482 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
483 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
484 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
485 mdd.${MDT_DEV}.lfsck_namespace |
486 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
488 error "(5) unexpected status"
491 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
492 [ "$STATUS" == "scanning-phase1" ] ||
493 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
495 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
496 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
497 mdd.${MDT_DEV}.lfsck_namespace |
498 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
500 error "(7) unexpected status"
503 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
504 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
506 local repaired=$($SHOW_NAMESPACE |
507 awk '/^dirent_repaired/ { print $2 }')
508 # For interop with old servers: fall back to updated_phase1 when no dirent_repaired value was read
509 [ -z "$repaired" ] &&
510 repaired=$($SHOW_NAMESPACE |
511 awk '/^updated_phase1/ { print $2 }')
513 [ $repaired -ge 9 ] ||
514 error "(9) Fail to re-generate FID-in-dirent: $repaired"
516 mount_client $MOUNT || error "(10) Fail to start client!"
518 #define OBD_FAIL_FID_LOOKUP 0x1505
519 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
520 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
521 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
523 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
527 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
528 skip "OI Scrub not implemented for ZFS" && return
531 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
532 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
534 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
535 echo "start $SINGLEMDS with disabling OI scrub"
536 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
537 error "(2) Fail to start MDS!"
539 #define OBD_FAIL_LFSCK_DELAY2 0x1601
540 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
541 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
542 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
543 mdd.${MDT_DEV}.lfsck_namespace |
544 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
546 error "(5) unexpected status"
549 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
550 [ "$STATUS" == "scanning-phase1" ] ||
551 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
553 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
554 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
555 mdd.${MDT_DEV}.lfsck_namespace |
556 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
558 error "(7) unexpected status"
561 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
562 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
564 local repaired=$($SHOW_NAMESPACE |
565 awk '/^dirent_repaired/ { print $2 }')
566 # For interop with old servers: fall back to updated_phase1 when no dirent_repaired value was read
567 [ -z "$repaired" ] &&
568 repaired=$($SHOW_NAMESPACE |
569 awk '/^updated_phase1/ { print $2 }')
571 [ $repaired -ge 2 ] ||
572 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
574 mount_client $MOUNT || error "(10) Fail to start client!"
576 #define OBD_FAIL_FID_LOOKUP 0x1505
577 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
578 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
580 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
582 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
583 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
584 local dummyname=$($LFS fid2path $DIR $dummyfid)
585 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
586 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
588 run_test 5 "LFSCK can handle IGIF object upgrading"
593 #define OBD_FAIL_LFSCK_DELAY1 0x1600
594 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
595 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
597 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
598 [ "$STATUS" == "scanning-phase1" ] ||
599 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
601 # Sleep 3 sec to guarantee that LFSCK has processed at least one object
603 # Make the LFSCK fail so that at least one checkpoint is guaranteed to exist
604 #define OBD_FAIL_LFSCK_FATAL1 0x1608
605 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
606 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
607 mdd.${MDT_DEV}.lfsck_namespace |
608 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
610 error "(4) unexpected status"
613 local POS0=$($SHOW_NAMESPACE |
614 awk '/^last_checkpoint_position/ { print $2 }' |
617 #define OBD_FAIL_LFSCK_DELAY1 0x1600
618 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
619 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
621 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
622 [ "$STATUS" == "scanning-phase1" ] ||
623 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
625 local POS1=$($SHOW_NAMESPACE |
626 awk '/^latest_start_position/ { print $2 }' |
628 [[ $POS0 -lt $POS1 ]] ||
629 error "(7) Expect larger than: $POS0, but got $POS1"
631 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
632 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
633 mdd.${MDT_DEV}.lfsck_namespace |
634 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
636 error "(8) unexpected status"
639 run_test 6a "LFSCK resumes from last checkpoint (1)"
644 #define OBD_FAIL_LFSCK_DELAY2 0x1601
645 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
646 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
648 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
649 [ "$STATUS" == "scanning-phase1" ] ||
650 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
652 # Sleep 5 sec to guarantee that the directory scanning is in progress
654 # Make the LFSCK fail so that at least one checkpoint is guaranteed to exist
655 #define OBD_FAIL_LFSCK_FATAL2 0x1609
656 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
657 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
658 mdd.${MDT_DEV}.lfsck_namespace |
659 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
661 error "(4) unexpected status"
664 local O_POS0=$($SHOW_NAMESPACE |
665 awk '/^last_checkpoint_position/ { print $2 }' |
668 local D_POS0=$($SHOW_NAMESPACE |
669 awk '/^last_checkpoint_position/ { print $4 }')
671 #define OBD_FAIL_LFSCK_DELAY2 0x1601
672 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
673 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
675 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
676 [ "$STATUS" == "scanning-phase1" ] ||
677 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
679 local O_POS1=$($SHOW_NAMESPACE |
680 awk '/^latest_start_position/ { print $2 }' |
682 local D_POS1=$($SHOW_NAMESPACE |
683 awk '/^latest_start_position/ { print $4 }')
685 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
686 [[ $O_POS0 -lt $O_POS1 ]] ||
687 error "(7.1) $O_POS1 is not larger than $O_POS0"
689 [[ $D_POS0 -lt $D_POS1 ]] ||
690 error "(7.2) $D_POS1 is not larger than $D_POS0"
693 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
694 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
695 mdd.${MDT_DEV}.lfsck_namespace |
696 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
698 error "(8) unexpected status"
701 run_test 6b "LFSCK resumes from last checkpoint (2)"
708 #define OBD_FAIL_LFSCK_DELAY2 0x1601
709 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
710 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
712 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
713 [ "$STATUS" == "scanning-phase1" ] ||
714 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
716 # Sleep 3 sec to guarantee that LFSCK has processed at least one object
718 echo "stop $SINGLEMDS"
719 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
721 echo "start $SINGLEMDS"
722 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
723 error "(5) Fail to start MDS!"
725 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
726 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
727 mdd.${MDT_DEV}.lfsck_namespace |
728 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
730 error "(6) unexpected status"
733 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
739 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
740 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
741 for ((i = 0; i < 20; i++)); do
742 touch $DIR/$tdir/dummy${i}
745 #define OBD_FAIL_LFSCK_DELAY3 0x1602
746 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
747 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
748 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
749 mdd.${MDT_DEV}.lfsck_namespace |
750 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
752 error "(4) unexpected status"
755 echo "stop $SINGLEMDS"
756 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
758 echo "start $SINGLEMDS"
759 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
760 error "(6) Fail to start MDS!"
762 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
763 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
764 mdd.${MDT_DEV}.lfsck_namespace |
765 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
767 error "(7) unexpected status"
770 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
775 formatall > /dev/null
781 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
782 [ "$STATUS" == "init" ] ||
783 error "(2) Expect 'init', but got '$STATUS'"
785 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
786 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
787 mkdir $DIR/$tdir/crashed
789 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
790 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
791 for ((i = 0; i < 5; i++)); do
792 touch $DIR/$tdir/dummy${i}
795 umount_client $MOUNT || error "(3) Fail to stop client!"
797 #define OBD_FAIL_LFSCK_DELAY2 0x1601
798 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
799 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
801 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
802 [ "$STATUS" == "scanning-phase1" ] ||
803 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
805 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
807 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
808 [ "$STATUS" == "stopped" ] ||
809 error "(7) Expect 'stopped', but got '$STATUS'"
811 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
813 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
814 [ "$STATUS" == "scanning-phase1" ] ||
815 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
817 #define OBD_FAIL_LFSCK_FATAL2 0x1609
818 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
819 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
820 mdd.${MDT_DEV}.lfsck_namespace |
821 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
823 error "(10) unexpected status"
826 #define OBD_FAIL_LFSCK_DELAY1 0x1600
827 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
828 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
830 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
831 [ "$STATUS" == "scanning-phase1" ] ||
832 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
834 #define OBD_FAIL_LFSCK_CRASH 0x160a
835 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
838 echo "stop $SINGLEMDS"
839 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
841 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
842 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
844 echo "start $SINGLEMDS"
845 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
846 error "(14) Fail to start MDS!"
848 local timeout=$(max_recovery_time)
851 while [ $timer -lt $timeout ]; do
852 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
853 mdt.${MDT_DEV}.recovery_status |
854 awk '/^status/ { print \\\$2 }'")
855 [ "$STATUS" != "RECOVERING" ] && break;
860 [ $timer != $timeout ] ||
861 error "(14.1) recovery timeout"
863 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
864 [ "$STATUS" == "crashed" ] ||
865 error "(15) Expect 'crashed', but got '$STATUS'"
867 #define OBD_FAIL_LFSCK_DELAY2 0x1601
868 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
869 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
871 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
872 [ "$STATUS" == "scanning-phase1" ] ||
873 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
875 echo "stop $SINGLEMDS"
876 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
878 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
879 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
881 echo "start $SINGLEMDS"
882 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
883 error "(19) Fail to start MDS!"
886 while [ $timer -lt $timeout ]; do
887 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
888 mdt.${MDT_DEV}.recovery_status |
889 awk '/^status/ { print \\\$2 }'")
890 [ "$STATUS" != "RECOVERING" ] && break;
895 [ $timer != $timeout ] ||
896 error "(19.1) recovery timeout"
898 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
899 [ "$STATUS" == "paused" ] ||
900 error "(20) Expect 'paused', but got '$STATUS'"
902 #define OBD_FAIL_LFSCK_DELAY3 0x1602
903 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
905 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
906 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
907 mdd.${MDT_DEV}.lfsck_namespace |
908 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
910 error "(22) unexpected status"
913 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
914 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
915 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
917 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
918 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
919 mdd.${MDT_DEV}.lfsck_namespace |
920 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
922 error "(24) unexpected status"
925 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
926 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
928 run_test 8 "LFSCK state machine"
931 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
932 skip "Testing on UP system, the speed may be inaccurate."
938 local BASE_SPEED1=100
940 $START_NAMESPACE -r -s $BASE_SPEED1 || error "(3) Fail to start LFSCK!"
943 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
944 [ "$STATUS" == "scanning-phase1" ] ||
945 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
947 local SPEED=$($SHOW_NAMESPACE |
948 awk '/^average_speed_phase1/ { print $2 }')
950 # Timing may be slightly off; normally the error is less than 2 seconds.
951 # Allow an additional 20% for scheduling error.
953 # MAX_MARGIN = 1.2 = 12 / 10 (shell arithmetic is integer-only, hence "* 12 / 10")
954 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
955 RUN_TIME1 * 12 / 10))
956 [ $SPEED -lt $MAX_SPEED ] ||
957 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
960 local BASE_SPEED2=300
962 do_facet $SINGLEMDS \
963 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
966 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase1/ { print $2 }')
967 # MIN_MARGIN = 0.8 = 8 / 10
968 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
969 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
970 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
971 [ $SPEED -gt $MIN_SPEED ] || {
972 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
973 error_ignore LU-5624 \
974 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
977 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
981 # MAX_MARGIN = 1.2 = 12 / 10
982 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
983 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
984 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
985 [ $SPEED -lt $MAX_SPEED ] ||
986 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
988 do_facet $SINGLEMDS \
989 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
991 wait_update_facet $SINGLEMDS \
992 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace|\
993 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
994 error "(7) Failed to get expected 'completed'"
996 run_test 9a "LFSCK speed control (1)"
# Test body registered as "9b" -- LFSCK speed control (2).
# Checks that average_speed_phase2 of the namespace LFSCK stays within
# bounds derived from lfsck_speed_limit, first at BASE_SPEED1 and then after
# the limit is changed to BASE_SPEED2.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, block closers, the RUN_TIME1/RUN_TIME2/TIME_DIFF
# definitions) are not visible in this listing -- confirm against the full
# file before relying on these notes.
# Skip when /proc/cpuinfo has no line matching "processor.*: 1".
999 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1000 skip "Testing on UP system, the speed may be inaccurate."
1006 echo "Preparing another 50 * 50 files (with error) at $(date)."
# The files below are created while fail_loc 0x1604 is set on the MDS.
1007 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1008 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1009 createmany -d $DIR/$tdir/d 50
1010 createmany -m $DIR/$tdir/f 50
1011 for ((i = 0; i < 50; i++)); do
1012 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# With fail_loc 0x160c set, the scan started with -r is expected to report
# status 'stopped'; the wait is bounded by the value 10 (presumably seconds).
1015 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1016 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1017 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1018 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1019 mdd.${MDT_DEV}.lfsck_namespace |
1020 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1022 error "(5) unexpected status"
1025 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1026 echo "Prepared at $(date)."
1028 local BASE_SPEED1=50
# Start again (no -r) with the first speed limit; the scan is expected to be
# found in 'scanning-phase2'.
1030 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1033 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1034 [ "$STATUS" == "scanning-phase2" ] ||
1035 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1037 local SPEED=$($SHOW_NAMESPACE |
1038 awk '/^average_speed_phase2/ { print $2 }')
1039 # There may be time error, normally it should be less than 2 seconds.
1040 # We allow another 20% schedule error.
# Upper bound: BASE_SPEED1 scaled by (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 and
# then by 12/10, all in integer arithmetic.
1042 # MAX_MARGIN = 1.2 = 12 / 10
1043 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1044 RUN_TIME1 * 12 / 10))
1045 [ $SPEED -lt $MAX_SPEED ] ||
1046 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1048 # adjust speed limit
1049 local BASE_SPEED2=150
1051 do_facet $SINGLEMDS \
1052 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1055 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
# Lower bound: time-weighted mix of the two limits, with each run interval
# shortened by TIME_DIFF, divided by the total run time and scaled by 8/10.
1056 # MIN_MARGIN = 0.8 = 8 / 10
1057 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1058 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1059 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# A too-slow result is reported through error_ignore (LU-5624) when the MDS
# backend is not ldiskfs.
1060 [ $SPEED -gt $MIN_SPEED ] || {
1061 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1062 error_ignore LU-5624 \
1063 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1066 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
# Upper bound for the combined run: same weighting as MIN_SPEED, but each run
# interval is lengthened by TIME_DIFF and the result is scaled by 12/10.
1070 # MAX_MARGIN = 1.2 = 12 / 10
1071 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1072 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1073 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1074 [ $SPEED -lt $MAX_SPEED ] ||
1075 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
# Set the speed limit to 0 (presumably "no limit" -- confirm) and wait for the
# scan to report 'completed'.
1077 do_facet $SINGLEMDS \
1078 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1079 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1080 mdd.${MDT_DEV}.lfsck_namespace |
1081 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1083 error "(11) unexpected status"
1086 run_test 9b "LFSCK speed control (2)"
# Test body registered as "10" -- System is available during LFSCK scanning.
# On an ldiskfs MDS it creates d0..d999 / f0..f999 under two different
# fail_loc settings, starts a namespace LFSCK with "-r -s 100", and verifies
# that ordinary namespace operations succeed while the reported status is
# still 'scanning-phase1'.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, loop/brace closers, any sleeps) are not visible here.
1090 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1091 skip "lookup(..)/linkea on ZFS issue" && return
1095 echo "Preparing more files with error at $(date)."
# Even indices are created with fail_loc 0x1603 set on the MDS.
1096 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1097 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1099 for ((i = 0; i < 1000; i = $((i+2)))); do
1100 mkdir -p $DIR/$tdir/d${i}
1101 touch $DIR/$tdir/f${i}
1102 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd indices are created with fail_loc 0x1604 set on the MDS.
1105 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1106 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1108 for ((i = 1; i < 1000; i = $((i+2)))); do
1109 mkdir -p $DIR/$tdir/d${i}
1110 touch $DIR/$tdir/f${i}
1111 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1114 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1115 echo "Prepared at $(date)."
# Give f200 a second name; the original name f200 is unlinked later, during
# the scan.
1117 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1119 umount_client $MOUNT
1120 mount_client $MOUNT || error "(3) Fail to start client!"
1122 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1125 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1126 [ "$STATUS" == "scanning-phase1" ] ||
1127 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Each of the following operations must succeed while the scan is running.
1129 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1131 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1133 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1135 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1137 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1139 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1141 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1143 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1144 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after the operations above.
1146 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1147 [ "$STATUS" == "scanning-phase1" ] ||
1148 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set the speed limit to 0 (presumably "no limit" -- confirm) and wait for
# the scan to report 'completed'.
1150 do_facet $SINGLEMDS \
1151 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1152 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1153 mdd.${MDT_DEV}.lfsck_namespace |
1154 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1156 error "(16) unexpected status"
1159 run_test 10 "System is available during LFSCK scanning"
# ost_remove_lastid: remove O/<idx>/LAST_ID and O/<idx>/d0/0 from the mount
# point of facet ost<ost>, between a mount_fstype and an unmount_fstype call.
# Returns 1 if mount_fstype fails and 2 if unmount_fstype fails.
# NOTE(review): ${ost} and ${idx} are used below but their assignments are not
# visible in this listing (presumably from $1 and $2 -- confirm); the closing
# brace is not visible either.
1162 ost_remove_lastid() {
1165 local rcmd="do_facet ost${ost}"
1167 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1169 # step 1: local mount
1170 mount_fstype ost${ost} || return 1
1171 # step 2: remove the specified LAST_ID
# The {LAST_ID,d0/0} brace list is expanded by the local shell into two paths
# before the command is handed to do_facet.
1172 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1174 unmount_fstype ost${ost} || return 2
# Test body registered as "11a" -- LFSCK can rebuild lost last_id.
# Creates 64 files in a directory striped with "-c 1 -i 0", removes LAST_ID
# on ost1 via ost_remove_lastid, restarts ost1 and runs the layout LFSCK on
# the OST, expecting the 'crashed_lastid' flag first and empty flags at the
# end.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, the step that stops ost1, brace closers) are not visible.
1178 check_mount_and_prep
1179 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1180 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1185 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1187 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1188 error "(2) Fail to start ost1"
# fail_loc 0x160e with fail_val=3 is set on ost1 before the scan starts, so
# the 'crashed_lastid' flag can be observed below.
1190 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1191 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1193 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1194 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1196 wait_update_facet ost1 "$LCTL get_param -n \
1197 obdfilter.${OST_DEV}.lfsck_layout |
1198 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1200 error "(5) unexpected status"
# Clear the injected failure and wait for the scan to complete.
1203 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1205 wait_update_facet ost1 "$LCTL get_param -n \
1206 obdfilter.${OST_DEV}.lfsck_layout |
1207 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1209 error "(6) unexpected status"
# After completion the flags field must be empty.
1212 echo "the LAST_ID(s) should have been rebuilt"
1213 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1214 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1216 run_test 11a "LFSCK can rebuild lost last_id"
# Test body registered as "11b" -- LFSCK can rebuild crashed last_id.
# Creates 64 files while fail_loc 0x160d is set on ost1, records last_id
# (lastid1), restarts ost1 and reads last_id again (lastid2), requires
# lastid1 > lastid2, runs the layout LFSCK on the OST, restarts ost1 once
# more and expects last_id to equal lastid1.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, loop/brace closers, any sleep in the polling loop) are
# not visible in this listing.
1219 check_mount_and_prep
1220 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1222 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1223 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1224 do_facet ost1 $LCTL set_param fail_loc=0x160d
1225 createmany -o $DIR/$tdir/f 64
# lastid1: the field after ':' on the last_id line that contains 0x100000000.
1226 local lastid1=$(do_facet ost1 "lctl get_param -n \
1227 obdfilter.${ost1_svc}.last_id" | grep 0x100000000 |
1228 awk -F: '{ print $2 }')
1230 umount_client $MOUNT
1231 stop ost1 || error "(1) Fail to stop ost1"
# fail_loc 0x215 is set on ost1 before it is started again.
1233 #define OBD_FAIL_OST_ENOSPC 0x215
1234 do_facet ost1 $LCTL set_param fail_loc=0x215
1236 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1237 error "(2) Fail to start ost1"
# Poll up to 60 times until last_id can be read back from ost1.
# NOTE(review): $lastid2 is unquoted in the test below; when it is empty the
# test collapses to `[ ! -z ]`, which is false, so the loop keeps polling.
# Quoting it would make that intent explicit.
1239 for ((i = 0; i < 60; i++)); do
1240 lastid2=$(do_facet ost1 "lctl get_param -n \
1241 obdfilter.${ost1_svc}.last_id" | grep 0x100000000 |
1242 awk -F: '{ print $2 }')
1243 [ ! -z $lastid2 ] && break;
1247 echo "the on-disk LAST_ID should be smaller than the expected one"
1248 [ $lastid1 -gt $lastid2 ] ||
1249 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1251 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1252 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1254 wait_update_facet ost1 "$LCTL get_param -n \
1255 obdfilter.${OST_DEV}.lfsck_layout |
1256 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1258 error "(6) unexpected status"
# Restart ost1 so that last_id is read again after the LFSCK run.
1261 stop ost1 || error "(7) Fail to stop ost1"
1263 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1264 error "(8) Fail to start ost1"
1266 echo "the on-disk LAST_ID should have been rebuilt"
1267 wait_update_facet ost1 "$LCTL get_param -n \
1268 obdfilter.${ost1_svc}.last_id | grep 0x100000000 |
1269 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1270 $LCTL get_param -n obdfilter.${ost1_svc}.last_id
1271 error "(9) expect lastid1 0x100000000:$lastid1"
1274 do_facet ost1 $LCTL set_param fail_loc=0
1275 stopall || error "(10) Fail to stopall"
1277 run_test 11b "LFSCK can rebuild crashed last_id"
# Test body registered as "12" -- single command to trigger LFSCK on all
# devices. Needs at least 2 MDSes. For both the namespace and the layout
# LFSCK it starts the scan from mds1 with -A at "-s 1", checks every target
# is 'scanning-phase1', stops it with "lfsck_stop -A", checks 'stopped', then
# restarts with "-s 0" and waits for 'completed'.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, loop closers) are not visible in this listing.
1280 [ $MDSCOUNT -lt 2 ] &&
1281 skip "We need at least 2 MDSes for test_12" && return
1283 check_mount_and_prep
# One directory per MDT index (k - 1), each populated with 100 files.
1284 for k in $(seq $MDSCOUNT); do
1285 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1286 createmany -o $DIR/$tdir/${k}/f 100 ||
1287 error "(0) Fail to create 100 files."
1290 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1291 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1292 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1294 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1295 for k in $(seq $MDSCOUNT); do
1296 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1297 mdd.$(facet_svc mds${k}).lfsck_namespace |
1298 awk '/^status/ { print $2 }')
1299 [ "$STATUS" == "scanning-phase1" ] ||
1300 error "(3) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1303 echo "Stop namespace LFSCK on all targets by single lctl command."
1304 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1305 error "(4) Fail to stop LFSCK on all devices!"
1307 echo "All the LFSCK targets should be in 'stopped' status."
1308 for k in $(seq $MDSCOUNT); do
1309 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1310 mdd.$(facet_svc mds${k}).lfsck_namespace |
1311 awk '/^status/ { print $2 }')
1312 [ "$STATUS" == "stopped" ] ||
1313 error "(5) MDS${k} Expect 'stopped', but got '$STATUS'"
1316 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1317 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1318 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1320 echo "All the LFSCK targets should be in 'completed' status."
1321 for k in $(seq $MDSCOUNT); do
1322 wait_update_facet mds${k} "$LCTL get_param -n \
1323 mdd.$(facet_svc mds${k}).lfsck_namespace |
1324 awk '/^status/ { print \\\$2 }'" "completed" 8 ||
1325 error "(7) MDS${k} is not the expected 'completed'"
# Second half: the same start / stop / restart sequence for the layout LFSCK.
1328 echo "Start layout LFSCK on all targets by single command (-s 1)."
1329 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1330 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1332 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1333 for k in $(seq $MDSCOUNT); do
1334 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1335 mdd.$(facet_svc mds${k}).lfsck_layout |
1336 awk '/^status/ { print $2 }')
1337 [ "$STATUS" == "scanning-phase1" ] ||
1338 error "(9) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1341 echo "Stop layout LFSCK on all targets by single lctl command."
1342 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1343 error "(10) Fail to stop LFSCK on all devices!"
1345 echo "All the LFSCK targets should be in 'stopped' status."
1346 for k in $(seq $MDSCOUNT); do
1347 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1348 mdd.$(facet_svc mds${k}).lfsck_layout |
1349 awk '/^status/ { print $2 }')
1350 [ "$STATUS" == "stopped" ] ||
1351 error "(11) MDS${k} Expect 'stopped', but got '$STATUS'"
# For the layout LFSCK the OSTs are checked too, through obdfilter.*.
1354 for k in $(seq $OSTCOUNT); do
1355 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1356 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1357 awk '/^status/ { print $2 }')
1358 [ "$STATUS" == "stopped" ] ||
1359 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1362 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1363 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1364 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1366 echo "All the LFSCK targets should be in 'completed' status."
1367 for k in $(seq $MDSCOUNT); do
1368 # The LFSCK status query internal is 30 seconds. For the case
1369 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1370 # time to guarantee the status sync up.
1371 wait_update_facet mds${k} "$LCTL get_param -n \
1372 mdd.$(facet_svc mds${k}).lfsck_layout |
1373 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1374 error "(14) MDS${k} is not the expected 'completed'"
1377 run_test 12 "single command to trigger LFSCK on all devices"
# Test body registered as "13" -- LFSCK can repair crashed lmm_oi.
# Creates 32 files while fail_loc 0x160f is set on the MDS, runs the layout
# LFSCK with -r and expects repaired_others to be exactly 32.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, brace closers) are not visible in this listing.
1381 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1382 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1383 echo "MDT-object FID."
1386 check_mount_and_prep
1388 echo "Inject failure stub to simulate bad lmm_oi"
1389 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1390 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1391 createmany -o $DIR/$tdir/f 32
1392 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1394 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1395 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1397 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1398 mdd.${MDT_DEV}.lfsck_layout |
1399 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1401 error "(2) unexpected status"
# One repair is expected per file created under the injected failure.
1404 local repaired=$($SHOW_LAYOUT |
1405 awk '/^repaired_others/ { print $2 }')
1406 [ $repaired -eq 32 ] ||
1407 error "(3) Fail to repair crashed lmm_oi: $repaired"
1409 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test body registered as "14" -- LFSCK can repair MDT-object with dangling
# reference. Creates files while fail_loc 0x1610 is set on ost1, checks that
# ls fails, runs the layout LFSCK once with -r (expects repaired_dangling of
# at least 32 while stat on 'guard' still fails), then again with "-r -c" and
# waits until stat on 'guard' reports size 0.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, brace closers) are not visible in this listing.
1413 echo "The OST-object referenced by the MDT-object should be there;"
1414 echo "otherwise, the LFSCK should re-create the missing OST-object."
1417 check_mount_and_prep
1418 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# count comes from precreated_ost_obj_count 0 0; count + 31 files plus 'guard'
# are created under the injected failure.
1420 local count=$(precreated_ost_obj_count 0 0)
1422 echo "Inject failure stub to simulate dangling referenced MDT-object"
1423 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1424 do_facet ost1 $LCTL set_param fail_loc=0x1610
1425 createmany -o $DIR/$tdir/f $((count + 31))
1426 touch $DIR/$tdir/guard
1427 do_facet ost1 $LCTL set_param fail_loc=0
1429 start_full_debug_logging
1431 # exhaust other pre-created dangling cases
1432 count=$(precreated_ost_obj_count 0 0)
1433 createmany -o $DIR/$tdir/a $count ||
1434 error "(0) Fail to create $count files."
# Here a successful ls is the failure case.
1436 echo "'ls' should fail because of dangling referenced MDT-object"
1437 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1439 echo "Trigger layout LFSCK to find out dangling reference"
1440 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1442 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1443 mdd.${MDT_DEV}.lfsck_layout |
1444 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1446 error "(3) unexpected status"
1449 local repaired=$($SHOW_LAYOUT |
1450 awk '/^repaired_dangling/ { print $2 }')
1451 [ $repaired -ge 32 ] ||
1452 error "(4) Fail to repair dangling reference: $repaired"
# After the first pass (no -c) stat on 'guard' is still expected to fail.
1454 echo "'stat' should fail because of not repair dangling by default"
1455 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# The second pass adds -c (presumably "create missing objects" -- confirm
# against the lctl lfsck_start documentation).
1457 echo "Trigger layout LFSCK to repair dangling reference"
1458 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1460 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1461 mdd.${MDT_DEV}.lfsck_layout |
1462 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1464 error "(7) unexpected status"
1467 # There may be some async LFSCK updates in processing, wait for
1468 # a while until the target reparation has been done. LU-4970.
1470 echo "'stat' should success after layout LFSCK repairing"
1471 wait_update_facet client "stat $DIR/$tdir/guard |
1472 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1473 stat $DIR/$tdir/guard
1475 error "(8) unexpected size"
1478 repaired=$($SHOW_LAYOUT |
1479 awk '/^repaired_dangling/ { print $2 }')
1480 [ $repaired -ge 32 ] ||
1481 error "(9) Fail to repair dangling reference: $repaired"
1483 stop_full_debug_logging
1485 run_test 14 "LFSCK can repair MDT-object with dangling reference"
# Test body registered as "15a" -- LFSCK can repair unmatched
# MDT-object/OST-object pairs (1). Writes 1 MiB to f0 while fail_loc 0x1611
# is set on ost1, runs the layout LFSCK with -r and expects
# repaired_unmatched_pair to be exactly 1.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, brace closers) are not visible in this listing.
1489 echo "If the OST-object referenced by the MDT-object back points"
1490 echo "to some non-exist MDT-object, then the LFSCK should repair"
1491 echo "the OST-object to back point to the right MDT-object."
1494 check_mount_and_prep
1495 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1497 echo "Inject failure stub to make the OST-object to back point to"
1498 echo "non-exist MDT-object."
1499 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1501 do_facet ost1 $LCTL set_param fail_loc=0x1611
1502 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
# The osc locks are cancelled before the failure injection is cleared.
1503 cancel_lru_locks osc
1504 do_facet ost1 $LCTL set_param fail_loc=0
1506 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1507 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1509 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1510 mdd.${MDT_DEV}.lfsck_layout |
1511 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1513 error "(2) unexpected status"
# Only f0 was written under the injected failure, so one repair is expected.
1516 local repaired=$($SHOW_LAYOUT |
1517 awk '/^repaired_unmatched_pair/ { print $2 }')
1518 [ $repaired -eq 1 ] ||
1519 error "(3) Fail to repair unmatched pair: $repaired"
1521 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test body registered as "15b" -- LFSCK can repair unmatched
# MDT-object/OST-object pairs (2). Writes 'guard' normally, then writes f0
# while fail_loc 0x1612 is set on ost1, runs the layout LFSCK with -r and
# expects repaired_unmatched_pair to be exactly 1.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, brace closers) are not visible in this listing.
1525 echo "If the OST-object referenced by the MDT-object back points"
1526 echo "to other MDT-object that doesn't recognize the OST-object,"
1527 echo "then the LFSCK should repair it to back point to the right"
1528 echo "MDT-object (the first one)."
1531 check_mount_and_prep
1532 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# 'guard' is written before any failure is injected.
1533 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1534 cancel_lru_locks osc
1536 echo "Inject failure stub to make the OST-object to back point to"
1537 echo "other MDT-object"
1539 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1540 do_facet ost1 $LCTL set_param fail_loc=0x1612
1541 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1542 cancel_lru_locks osc
1543 do_facet ost1 $LCTL set_param fail_loc=0
1545 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1546 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1548 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1549 mdd.${MDT_DEV}.lfsck_layout |
1550 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1552 error "(2) unexpected status"
# Only f0 was written under the injected failure, so one repair is expected.
1555 local repaired=$($SHOW_LAYOUT |
1556 awk '/^repaired_unmatched_pair/ { print $2 }')
1557 [ $repaired -eq 1 ] ||
1558 error "(3) Fail to repair unmatched pair: $repaired"
1560 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test body registered as "16" -- LFSCK can repair inconsistent
# MDT-object/OST-object owner. Writes f0, runs "chown 1.1" on it while
# fail_loc 0x1613 is set on the MDS, runs the layout LFSCK with -r and
# expects repaired_inconsistent_owner to be exactly 1.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, brace closers) are not visible in this listing.
1564 echo "If the OST-object's owner information does not match the owner"
1565 echo "information stored in the MDT-object, then the LFSCK trust the"
1566 echo "MDT-object and update the OST-object's owner information."
1569 check_mount_and_prep
1570 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1571 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1572 cancel_lru_locks osc
1574 echo "Inject failure stub to skip OST-object owner changing"
1575 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1576 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
# The ownership change is made while the failure is injected.
1577 chown 1.1 $DIR/$tdir/f0
1578 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1580 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1583 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1585 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1586 mdd.${MDT_DEV}.lfsck_layout |
1587 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1589 error "(2) unexpected status"
# Only f0 was chown'ed under the injected failure, so one repair is expected.
1592 local repaired=$($SHOW_LAYOUT |
1593 awk '/^repaired_inconsistent_owner/ { print $2 }')
1594 [ $repaired -eq 1 ] ||
1595 error "(3) Fail to repair inconsistent owner: $repaired"
1597 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test body registered as "17" -- LFSCK can repair multiple references.
# With fail_loc 0x1614 set on the MDS it writes 1 MiB to 'guard' and creates
# f0, checks that f0 reports the same 1048576-byte size, runs the layout
# LFSCK with -r (expects repaired_multiple_referenced == 1), then writes
# 2 MiB to f0 and checks that 'guard' still reports 1048576 bytes.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, brace closers, one echo line) are not visible here.
1601 echo "If more than one MDT-objects reference the same OST-object,"
1602 echo "and the OST-object only recognizes one MDT-object, then the"
1603 echo "LFSCK should create new OST-objects for such non-recognized"
1607 check_mount_and_prep
1608 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1610 echo "Inject failure stub to make two MDT-objects to refernce"
1611 echo "the OST-object"
1613 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1614 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1616 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1617 cancel_lru_locks osc
# "createmany -o .../f 1" creates the single file f0 checked below.
1619 createmany -o $DIR/$tdir/f 1
1621 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1623 cancel_lru_locks mdc
1624 cancel_lru_locks osc
# Field 5 of "ls -l" is the file size; nothing was written to f0, so a size
# of 1048576 is attributed to the shared object (see the echo below).
1626 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1627 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1628 [ $size -eq 1048576 ] ||
1629 error "(1) f0 (wrong) size should be 1048576, but got $size"
1631 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1634 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1636 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1637 mdd.${MDT_DEV}.lfsck_layout |
1638 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1640 error "(3) unexpected status"
1643 local repaired=$($SHOW_LAYOUT |
1644 awk '/^repaired_multiple_referenced/ { print $2 }')
1645 [ $repaired -eq 1 ] ||
1646 error "(4) Fail to repair multiple references: $repaired"
# After the repair a 2 MiB write to f0 must not change the size of 'guard'.
1648 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1649 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1650 error "(5) Fail to write f0."
1651 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1652 [ $size -eq 1048576 ] ||
1653 error "(6) guard size should be 1048576, but got $size"
1655 run_test 17 "LFSCK can repair multiple references"
# Test body registered as "18a" -- Find out orphan OST-object and repair it
# (1). Writes a1/f1 (and a2/f2 when MDSCOUNT >= 2), runs "chown 1.1" on them
# while fail_loc 0x1615 is set on the MDS(es), checks that the reported file
# sizes changed, runs the layout LFSCK with "-r -o", and checks the
# repaired_orphan counters and that the sizes are back to the saved value.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, fi/done closers, any sync or sleep steps) are not visible.
1659 echo "The target MDT-object is there, but related stripe information"
1660 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1661 echo "layout EA entries."
1664 check_mount_and_prep
1665 $LFS mkdir -i 0 $DIR/$tdir/a1
1666 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1667 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# "ls -il" prints the inode number first, so the size is field 6.
1669 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1671 $LFS path2fid $DIR/$tdir/a1/f1
1672 $LFS getstripe $DIR/$tdir/a1/f1
# With a second MDT, a2 is created on MDT index 1 with two stripes and f2 is
# written with the same dd parameters as f1.
1674 if [ $MDSCOUNT -ge 2 ]; then
1675 $LFS mkdir -i 1 $DIR/$tdir/a2
1676 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1677 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1678 $LFS path2fid $DIR/$tdir/a2/f2
1679 $LFS getstripe $DIR/$tdir/a2/f2
1682 cancel_lru_locks osc
1684 echo "Inject failure, to make the MDT-object lost its layout EA"
1685 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1686 do_facet mds1 $LCTL set_param fail_loc=0x1615
1687 chown 1.1 $DIR/$tdir/a1/f1
1689 if [ $MDSCOUNT -ge 2 ]; then
1690 do_facet mds2 $LCTL set_param fail_loc=0x1615
1691 chown 1.1 $DIR/$tdir/a2/f2
1697 do_facet mds1 $LCTL set_param fail_loc=0
1698 if [ $MDSCOUNT -ge 2 ]; then
1699 do_facet mds2 $LCTL set_param fail_loc=0
1702 cancel_lru_locks mdc
1703 cancel_lru_locks osc
1705 echo "The file size should be incorrect since layout EA is lost"
1706 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1707 [ "$cur_size" != "$saved_size" ] ||
1708 error "(1) Expect incorrect file1 size"
# f2 is compared against saved_size taken from f1; both files were written
# with bs=1M count=2.
1710 if [ $MDSCOUNT -ge 2 ]; then
1711 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1712 [ "$cur_size" != "$saved_size" ] ||
1713 error "(2) Expect incorrect file2 size"
1716 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1717 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
1719 for k in $(seq $MDSCOUNT); do
1720 # The LFSCK status query internal is 30 seconds. For the case
1721 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1722 # time to guarantee the status sync up.
1723 wait_update_facet mds${k} "$LCTL get_param -n \
1724 mdd.$(facet_svc mds${k}).lfsck_layout |
1725 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1726 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are read once, without waiting, and must already be 'completed'.
1729 for k in $(seq $OSTCOUNT); do
1730 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1731 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1732 awk '/^status/ { print $2 }')
1733 [ "$cur_status" == "completed" ] ||
1734 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected orphan repairs: 1 on mds1 and 2 on mds2 (a2 uses two stripes,
# presumably one repair per stripe -- confirm).
1737 local repaired=$(do_facet mds1 $LCTL get_param -n \
1738 mdd.$(facet_svc mds1).lfsck_layout |
1739 awk '/^repaired_orphan/ { print $2 }')
1740 [ $repaired -eq 1 ] ||
1741 error "(6.1) Expect 1 fixed on mds1, but got: $repaired"
1743 if [ $MDSCOUNT -ge 2 ]; then
1744 repaired=$(do_facet mds2 $LCTL get_param -n \
1745 mdd.$(facet_svc mds2).lfsck_layout |
1746 awk '/^repaired_orphan/ { print $2 }')
1747 [ $repaired -eq 2 ] ||
1748 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
1751 $LFS path2fid $DIR/$tdir/a1/f1
1752 $LFS getstripe $DIR/$tdir/a1/f1
1754 if [ $MDSCOUNT -ge 2 ]; then
1755 $LFS path2fid $DIR/$tdir/a2/f2
1756 $LFS getstripe $DIR/$tdir/a2/f2
1759 echo "The file size should be correct after layout LFSCK scanning"
1760 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1761 [ "$cur_size" == "$saved_size" ] ||
1762 error "(7) Expect file1 size $saved_size, but got $cur_size"
1764 if [ $MDSCOUNT -ge 2 ]; then
1765 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1766 [ "$cur_size" == "$saved_size" ] ||
1767 error "(8) Expect file2 size $saved_size, but got $cur_size"
1770 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test body registered as "18b" -- Find out orphan OST-object and repair it
# (2). Writes a1/f1 (and a2/f2 when MDSCOUNT >= 2), records their FIDs,
# removes them while fail_loc 0x1616 is set on the MDS(es), runs the layout
# LFSCK with "-r -o", then moves "<fid>-R-0" entries from
# .lustre/lost+found/MDTxxxx back into place and checks the file sizes.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, fi/done closers, any sync or sleep steps) are not visible.
1774 echo "The target MDT-object is lost. The LFSCK should re-create the"
1775 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
1776 echo "can move it back to normal namespace manually."
1779 check_mount_and_prep
1780 $LFS mkdir -i 0 $DIR/$tdir/a1
1781 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1782 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# "ls -il" prints the inode number first, so the size is field 6.
1783 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1784 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
1786 $LFS getstripe $DIR/$tdir/a1/f1
# NOTE(review): fid2 is assigned without a visible 'local' declaration.
1788 if [ $MDSCOUNT -ge 2 ]; then
1789 $LFS mkdir -i 1 $DIR/$tdir/a2
1790 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1791 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1792 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
1794 $LFS getstripe $DIR/$tdir/a2/f2
1797 cancel_lru_locks osc
1799 echo "Inject failure, to simulate the case of missing the MDT-object"
1800 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1801 do_facet mds1 $LCTL set_param fail_loc=0x1616
1802 rm -f $DIR/$tdir/a1/f1
1804 if [ $MDSCOUNT -ge 2 ]; then
1805 do_facet mds2 $LCTL set_param fail_loc=0x1616
1806 rm -f $DIR/$tdir/a2/f2
1812 do_facet mds1 $LCTL set_param fail_loc=0
1813 if [ $MDSCOUNT -ge 2 ]; then
1814 do_facet mds2 $LCTL set_param fail_loc=0
1817 cancel_lru_locks mdc
1818 cancel_lru_locks osc
1820 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1821 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1823 for k in $(seq $MDSCOUNT); do
1824 # The LFSCK status query internal is 30 seconds. For the case
1825 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1826 # time to guarantee the status sync up.
1827 wait_update_facet mds${k} "$LCTL get_param -n \
1828 mdd.$(facet_svc mds${k}).lfsck_layout |
1829 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1830 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are read once, without waiting, and must already be 'completed'.
1833 for k in $(seq $OSTCOUNT); do
1834 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1835 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1836 awk '/^status/ { print $2 }')
1837 [ "$cur_status" == "completed" ] ||
1838 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected orphan repairs: 1 on mds1 and 2 on mds2 (a2 uses two stripes,
# presumably one repair per stripe -- confirm).
1841 local repaired=$(do_facet mds1 $LCTL get_param -n \
1842 mdd.$(facet_svc mds1).lfsck_layout |
1843 awk '/^repaired_orphan/ { print $2 }')
1844 [ $repaired -eq 1 ] ||
1845 error "(4.1) Expect 1 fixed on mds1, but got: $repaired"
1847 if [ $MDSCOUNT -ge 2 ]; then
1848 repaired=$(do_facet mds2 $LCTL get_param -n \
1849 mdd.$(facet_svc mds2).lfsck_layout |
1850 awk '/^repaired_orphan/ { print $2 }')
1851 [ $repaired -eq 2 ] ||
1852 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# The recovered entries are looked up by the FID recorded before removal,
# with the suffix "-R-0".
1855 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
1856 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
1857 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
1859 if [ $MDSCOUNT -ge 2 ]; then
1860 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
1861 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
1864 $LFS path2fid $DIR/$tdir/a1/f1
1865 $LFS getstripe $DIR/$tdir/a1/f1
1867 if [ $MDSCOUNT -ge 2 ]; then
1868 $LFS path2fid $DIR/$tdir/a2/f2
1869 $LFS getstripe $DIR/$tdir/a2/f2
1872 echo "The file size should be correct after layout LFSCK scanning"
1873 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1874 [ "$cur_size" == "$saved_size" ] ||
1875 error "(7) Expect file1 size $saved_size, but got $cur_size"
# f2 is compared against saved_size taken from f1; both files were written
# with bs=1M count=2.
1877 if [ $MDSCOUNT -ge 2 ]; then
1878 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1879 [ "$cur_size" == "$saved_size" ] ||
1880 error "(8) Expect file2 size $saved_size, but got $cur_size"
1883 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test body registered as "18c" -- Find out orphan OST-object and repair it
# (3). Writes a1/f1 (and a2/f2 when MDSCOUNT >= 2) while fail_loc 0x1617 is
# set on ost1, removes them while fail_loc 0x1616 is set on the MDS(es), runs
# the layout LFSCK with "-r -o", and expects "*-N-*" entries under
# .lustre/lost+found/MDT0000 but none under MDT0001.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, fi/done closers, the assignment of $expected) are not
# visible in this listing.
1887 echo "The target MDT-object is lost, and the OST-object FID is missing."
1888 echo "The LFSCK should re-create the MDT-object with new FID under the "
1889 echo "directory .lustre/lost+found/MDTxxxx."
1892 check_mount_and_prep
1893 $LFS mkdir -i 0 $DIR/$tdir/a1
1894 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1896 echo "Inject failure, to simulate the case of missing parent FID"
1897 #define OBD_FAIL_LFSCK_NOPFID 0x1617
1898 do_facet ost1 $LCTL set_param fail_loc=0x1617
1900 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1901 $LFS getstripe $DIR/$tdir/a1/f1
# a2 is created on MDT index 1, but its stripe starts at OST index 0 (-i 0)
# with a single stripe, the same setstripe options as a1.
1903 if [ $MDSCOUNT -ge 2 ]; then
1904 $LFS mkdir -i 1 $DIR/$tdir/a2
1905 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a2
1906 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1907 $LFS getstripe $DIR/$tdir/a2/f2
1910 cancel_lru_locks osc
1912 echo "Inject failure, to simulate the case of missing the MDT-object"
1913 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1914 do_facet mds1 $LCTL set_param fail_loc=0x1616
1915 rm -f $DIR/$tdir/a1/f1
1917 if [ $MDSCOUNT -ge 2 ]; then
1918 do_facet mds2 $LCTL set_param fail_loc=0x1616
1919 rm -f $DIR/$tdir/a2/f2
1925 do_facet mds1 $LCTL set_param fail_loc=0
1926 if [ $MDSCOUNT -ge 2 ]; then
1927 do_facet mds2 $LCTL set_param fail_loc=0
1930 cancel_lru_locks mdc
1931 cancel_lru_locks osc
1933 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1934 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1936 for k in $(seq $MDSCOUNT); do
1937 # The LFSCK status query internal is 30 seconds. For the case
1938 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1939 # time to guarantee the status sync up.
1940 wait_update_facet mds${k} "$LCTL get_param -n \
1941 mdd.$(facet_svc mds${k}).lfsck_layout |
1942 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1943 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are read once, without waiting, and must already be 'completed'.
1946 for k in $(seq $OSTCOUNT); do
1947 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1948 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1949 awk '/^status/ { print $2 }')
1950 [ "$cur_status" == "completed" ] ||
1951 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): $expected is compared below but its assignment is not visible
# here; presumably it is set inside this MDSCOUNT branch -- confirm.
1954 if [ $MDSCOUNT -ge 2 ]; then
1960 local repaired=$(do_facet mds1 $LCTL get_param -n \
1961 mdd.$(facet_svc mds1).lfsck_layout |
1962 awk '/^repaired_orphan/ { print $2 }')
1963 [ $repaired -eq $expected ] ||
1964 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# mds2 is expected to report no orphan repairs.
1966 if [ $MDSCOUNT -ge 2 ]; then
1967 repaired=$(do_facet mds2 $LCTL get_param -n \
1968 mdd.$(facet_svc mds2).lfsck_layout |
1969 awk '/^repaired_orphan/ { print $2 }')
1970 [ $repaired -eq 0 ] ||
1971 error "(5) Expect 0 fixed on mds2, but got: $repaired"
1974 ls -ail $MOUNT/.lustre/lost+found/
# NOTE(review): the -name pattern *-N-* is unquoted in both find calls below,
# so the shell may expand it against the current directory before find sees
# it; quoting the pattern would avoid that.
1976 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
1977 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
1978 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name *-N-*)
1980 error "(6) .lustre/lost+found/MDT0001/ should be empty"
1983 echo "There should be some stub under .lustre/lost+found/MDT0000/"
1984 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
1985 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
1987 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-N-*)
1988 [ ! -z "$cname" ] ||
1989 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
1991 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d; the run_test call that closes this block registers it.
# Scenario (taken from the echo text): while a dangling reference is being
# repaired, f2's layout EA slot gets occupied by a newly created OST-object
# that nobody has modified. Once the orphan (original) OST-object is found,
# LFSCK is expected to put it back into f2's layout. The assertions require
# exactly one repaired orphan on $SINGLEMDS and f2's size to match the size
# recorded before the failure injection.
1995 echo "The target MDT-object layout EA slot is occpuied by some new"
1996 echo "created OST-object when repair dangling reference case. Such"
1997 echo "conflict OST-object has never been modified. Then when found"
1998 echo "the orphan OST-object, LFSCK will replace it with the orphan"
2002 check_mount_and_prep
2004 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2005 echo "guard" > $DIR/$tdir/a1/f1
2006 echo "foo" > $DIR/$tdir/a1/f2
# Column 6 of 'ls -il' output is the byte size; keep it as the reference.
2007 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
# Print FID and stripe information of both files for the test log.
2008 $LFS path2fid $DIR/$tdir/a1/f1
2009 $LFS getstripe $DIR/$tdir/a1/f1
2010 $LFS path2fid $DIR/$tdir/a1/f2
2011 $LFS getstripe $DIR/$tdir/a1/f2
2012 cancel_lru_locks osc
2014 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2015 echo "to reference the same OST-object (which is f1's OST-obejct)."
2016 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2017 echo "dangling reference case, but f2's old OST-object is there."
# The chown and the rm both run while fail_loc 0x1618 is armed on the MDS.
# NOTE(review): "1.1" is the legacy owner.group separator; chown documents
# "1:1" as the preferred form.
2020 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2021 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2022 chown 1.1 $DIR/$tdir/a1/f2
2023 rm -f $DIR/$tdir/a1/f1
2026 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2028 echo "stopall to cleanup object cache"
2031 setupall > /dev/null
2033 echo "The file size should be incorrect since dangling referenced"
2034 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2035 [ "$cur_size" != "$saved_size" ] ||
2036 error "(1) Expect incorrect file2 size"
# fail_val=5 accompanies the DELAY3 fail_loc.
# NOTE(review): presumably a delay that keeps LFSCK observable in
# 'scanning-phase2' for the wait below - confirm against the kernel side.
2038 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2039 do_facet $SINGLEMDS $LCTL set_param fail_val=5 fail_loc=0x1602
2041 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2042 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Inside the double quotes \\\$2 collapses to \$2, so the dollar sign is
# still escaped when the command string is handed to wait_update_facet.
2044 wait_update_facet mds1 "$LCTL get_param -n \
2045 mdd.$(facet_svc mds1).lfsck_layout |
2046 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2047 error "(3.0) MDS1 is not the expected 'scanning-phase2'"
# Clear the delay so that LFSCK can run to completion.
2049 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Every MDT must report 'completed' within $LTIME.
2051 for k in $(seq $MDSCOUNT); do
2052 # The LFSCK status query internal is 30 seconds. For the case
2053 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2054 # time to guarantee the status sync up.
2055 wait_update_facet mds${k} "$LCTL get_param -n \
2056 mdd.$(facet_svc mds${k}).lfsck_layout |
2057 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2058 error "(3) MDS${k} is not the expected 'completed'"
# OSTs are checked once, without waiting: each must already be 'completed'.
2061 for k in $(seq $OSTCOUNT); do
2062 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2063 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2064 awk '/^status/ { print $2 }')
2065 [ "$cur_status" == "completed" ] ||
2066 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Exactly one orphan is expected in the repaired_orphan counter.
2069 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2070 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2071 awk '/^repaired_orphan/ { print $2 }')
2072 [ $repaired -eq 1 ] ||
2073 error "(5) Expect 1 orphan has been fixed, but got: $repaired"
2075 echo "The file size should be correct after layout LFSCK scanning"
2076 cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2077 [ "$cur_size" == "$saved_size" ] ||
2078 error "(6) Expect file2 size $saved_size, but got $cur_size"
# Informational output only: the results of these commands are not checked.
2080 echo "The LFSCK should find back the original data."
2081 cat $DIR/$tdir/a1/f2
2082 $LFS path2fid $DIR/$tdir/a1/f2
2083 $LFS getstripe $DIR/$tdir/a1/f2
2085 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e; the run_test call that closes this block registers it.
# Scenario (taken from the echo text): f2's layout EA slot is occupied by a
# newly created OST-object, and that object is then modified (the test
# appends "dummy" to f2 while mds1 reports 'scanning-phase2' and the DELAY3
# fail_loc is still set). LFSCK is expected to keep the new data in f2 and to
# reference the old orphan OST-object from a new stub file whose name matches
# *-C-* under .lustre/lost+found/MDT0000/. The stub must have the size that
# f2 had before the failure injection.
2089 echo "The target MDT-object layout EA slot is occpuied by some new"
2090 echo "created OST-object when repair dangling reference case. Such"
2091 echo "conflict OST-object has been modified by others. To keep the"
2092 echo "new data, the LFSCK will create a new file to refernece this"
2093 echo "old orphan OST-object."
2096 check_mount_and_prep
2098 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2099 echo "guard" > $DIR/$tdir/a1/f1
2100 echo "foo" > $DIR/$tdir/a1/f2
# Column 6 of 'ls -il' output is the byte size; keep it as the reference.
2101 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
# Print FID and stripe information of both files for the test log.
2102 $LFS path2fid $DIR/$tdir/a1/f1
2103 $LFS getstripe $DIR/$tdir/a1/f1
2104 $LFS path2fid $DIR/$tdir/a1/f2
2105 $LFS getstripe $DIR/$tdir/a1/f2
2106 cancel_lru_locks osc
2108 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2109 echo "to reference the same OST-object (which is f1's OST-obejct)."
2110 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2111 echo "dangling reference case, but f2's old OST-object is there."
# The chown and the rm both run while fail_loc 0x1618 is armed on the MDS.
# NOTE(review): "1.1" is the legacy owner.group separator; chown documents
# "1:1" as the preferred form.
2114 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2115 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2116 chown 1.1 $DIR/$tdir/a1/f2
2117 rm -f $DIR/$tdir/a1/f1
2120 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2122 echo "stopall to cleanup object cache"
2125 setupall > /dev/null
2127 echo "The file size should be incorrect since dangling referenced"
2128 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2129 [ "$cur_size" != "$saved_size" ] ||
2130 error "(1) Expect incorrect file2 size"
# fail_val=10 accompanies the DELAY3 fail_loc.
# NOTE(review): presumably a delay that leaves time for the write to f2
# below while LFSCK is in 'scanning-phase2' - confirm against the kernel side.
2132 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2133 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2135 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2136 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Inside the double quotes \\\$2 collapses to \$2, so the dollar sign is
# still escaped when the command string is handed to wait_update_facet.
2138 wait_update_facet mds1 "$LCTL get_param -n \
2139 mdd.$(facet_svc mds1).lfsck_layout |
2140 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2141 error "(3) MDS1 is not the expected 'scanning-phase2'"
2143 # to guarantee all updates are synced.
# Modify f2 before the delay is lifted, which turns the newly created
# OST-object into a "modified" one for this scenario.
2147 echo "Write new data to f2 to modify the new created OST-object."
2148 echo "dummy" >> $DIR/$tdir/a1/f2
2150 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Every MDT must report 'completed' within $LTIME.
2152 for k in $(seq $MDSCOUNT); do
2153 # The LFSCK status query internal is 30 seconds. For the case
2154 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2155 # time to guarantee the status sync up.
2156 wait_update_facet mds${k} "$LCTL get_param -n \
2157 mdd.$(facet_svc mds${k}).lfsck_layout |
2158 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2159 error "(4) MDS${k} is not the expected 'completed'"
# OSTs are checked once, without waiting: each must already be 'completed'.
2162 for k in $(seq $OSTCOUNT); do
2163 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2164 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2165 awk '/^status/ { print $2 }')
2166 [ "$cur_status" == "completed" ] ||
2167 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Exactly one orphan is expected in the repaired_orphan counter.
2170 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2171 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2172 awk '/^repaired_orphan/ { print $2 }')
2173 [ $repaired -eq 1 ] ||
2174 error "(6) Expect 1 orphan has been fixed, but got: $repaired"
2176 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2177 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2178 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# NOTE(review): the -name pattern is unquoted, so the shell may expand *-C-*
# against the current working directory before find ever sees it; quoting the
# pattern ('*-C-*') would avoid that.
2180 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-C-*)
2181 [ ! -z "$cname" ] ||
2182 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2184 echo "The stub file should keep the original f2 data"
2185 cur_size=$(ls -il $cname | awk '{ print $6 }')
2186 [ "$cur_size" == "$saved_size" ] ||
2187 error "(9) Expect file2 size $saved_size, but got $cur_size"
# Informational output only: the results of these commands are not checked.
2190 $LFS path2fid $cname
2191 $LFS getstripe $cname
2193 echo "The f2 should contains new data."
2194 cat $DIR/$tdir/a1/f2
2195 $LFS path2fid $DIR/$tdir/a1/f2
2196 $LFS getstripe $DIR/$tdir/a1/f2
2198 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f; the run_test call that closes this block registers it.
# Needs at least 2 OSTs, otherwise the test is skipped.
# Scenario (taken from the echo text): the MDT-objects of several files are
# lost, and OST0 is made to fail MDT0's LFSCK request during the first-stage
# scanning. LFSCK is expected to skip orphan handling for the failed OST only:
# the first run must end 'partial' on the MDTs and on ost1 but 'completed' on
# ost2, with 1 repaired orphan per MDT. A second run without the failure must
# end 'completed' everywhere with 2 repaired orphans per MDT.
2201 [ $OSTCOUNT -lt 2 ] &&
2202 skip "The test needs at least 2 OSTs" && return
2205 echo "The target MDT-object is lost. The LFSCK should re-create the"
2206 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2207 echo "to verify some OST-object(s) during the first stage-scanning,"
2208 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2209 echo "should not be affected."
# a1 holds single-stripe files (guard, f1); a2 holds a two-stripe file (f2).
# Both directories are created with 'lfs mkdir -i 0'.
2212 check_mount_and_prep
2213 $LFS mkdir -i 0 $DIR/$tdir/a1
2214 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2215 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2216 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2217 $LFS mkdir -i 0 $DIR/$tdir/a2
2218 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2219 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2220 $LFS getstripe $DIR/$tdir/a1/f1
2221 $LFS getstripe $DIR/$tdir/a2/f2
# With two or more MDTs, build the same layout again in a3/a4, created with
# 'lfs mkdir -i 1'.
2223 if [ $MDSCOUNT -ge 2 ]; then
2224 $LFS mkdir -i 1 $DIR/$tdir/a3
2225 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a3
2226 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2227 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2228 $LFS mkdir -i 1 $DIR/$tdir/a4
2229 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2230 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2231 $LFS getstripe $DIR/$tdir/a3/f3
2232 $LFS getstripe $DIR/$tdir/a4/f4
2235 cancel_lru_locks osc
# The files are removed while fail_loc 0x1616 is armed on the owning MDS.
2237 echo "Inject failure, to simulate the case of missing the MDT-object"
2238 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2239 do_facet mds1 $LCTL set_param fail_loc=0x1616
2240 rm -f $DIR/$tdir/a1/f1
2241 rm -f $DIR/$tdir/a2/f2
2243 if [ $MDSCOUNT -ge 2 ]; then
2244 do_facet mds2 $LCTL set_param fail_loc=0x1616
2245 rm -f $DIR/$tdir/a3/f3
2246 rm -f $DIR/$tdir/a4/f4
# Disarm the failure injection on every MDS that had it set.
2252 do_facet mds1 $LCTL set_param fail_loc=0
2253 if [ $MDSCOUNT -ge 2 ]; then
2254 do_facet mds2 $LCTL set_param fail_loc=0
2257 cancel_lru_locks mdc
2258 cancel_lru_locks osc
# NOTE(review): fail_val=0 presumably selects OST index 0 as the target that
# fails (the echo text names OST0) - confirm against the kernel side.
2260 echo "Inject failure, to simulate the OST0 fail to handle"
2261 echo "MDT0 LFSCK request during the first-stage scanning."
2262 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2263 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2265 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2266 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# First run: every MDT is expected to finish as 'partial'.
2268 for k in $(seq $MDSCOUNT); do
2269 # The LFSCK status query internal is 30 seconds. For the case
2270 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2271 # time to guarantee the status sync up.
2272 wait_update_facet mds${k} "$LCTL get_param -n \
2273 mdd.$(facet_svc mds${k}).lfsck_layout |
2274 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2275 error "(2) MDS${k} is not the expected 'partial'"
# ost1 (the OST with the injected failure) must end 'partial'.
2278 wait_update_facet ost1 "$LCTL get_param -n \
2279 obdfilter.$(facet_svc ost1).lfsck_layout |
2280 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2281 error "(3) OST1 is not the expected 'partial'"
# ost2 must be unaffected and end 'completed'.
2284 wait_update_facet ost2 "$LCTL get_param -n \
2285 obdfilter.$(facet_svc ost2).lfsck_layout |
2286 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2287 error "(4) OST2 is not the expected 'completed'"
2290 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the first run only one orphan per MDT is expected to be repaired.
# NOTE(review): "mds{1}" / "mds{2}" in the messages below are literal text,
# not variable expansions.
2292 local repaired=$(do_facet mds1 $LCTL get_param -n \
2293 mdd.$(facet_svc mds1).lfsck_layout |
2294 awk '/^repaired_orphan/ { print $2 }')
2295 [ $repaired -eq 1 ] ||
2296 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2298 if [ $MDSCOUNT -ge 2 ]; then
2299 repaired=$(do_facet mds2 $LCTL get_param -n \
2300 mdd.$(facet_svc mds2).lfsck_layout |
2301 awk '/^repaired_orphan/ { print $2 }')
2302 [ $repaired -eq 1 ] ||
2303 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second run, with the failure injection cleared.
2306 echo "Trigger layout LFSCK on all devices again to cleanup"
2307 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2309 for k in $(seq $MDSCOUNT); do
2310 # The LFSCK status query internal is 30 seconds. For the case
2311 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2312 # time to guarantee the status sync up.
2313 wait_update_facet mds${k} "$LCTL get_param -n \
2314 mdd.$(facet_svc mds${k}).lfsck_layout |
2315 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2316 error "(8) MDS${k} is not the expected 'completed'"
# OSTs are checked once, without waiting: each must already be 'completed'.
2319 for k in $(seq $OSTCOUNT); do
2320 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2321 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2322 awk '/^status/ { print $2 }')
2323 [ "$cur_status" == "completed" ] ||
2324 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# The second run is expected to report two repaired orphans per MDT.
2328 local repaired=$(do_facet mds1 $LCTL get_param -n \
2329 mdd.$(facet_svc mds1).lfsck_layout |
2330 awk '/^repaired_orphan/ { print $2 }')
2331 [ $repaired -eq 2 ] ||
2332 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2334 if [ $MDSCOUNT -ge 2 ]; then
2335 repaired=$(do_facet mds2 $LCTL get_param -n \
2336 mdd.$(facet_svc mds2).lfsck_layout |
2337 awk '/^repaired_orphan/ { print $2 }')
2338 [ $repaired -eq 2 ] ||
2339 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2342 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Test 19a; the run_test call that closes this block registers it.
# Enables lfsck_verify_pfid on OST0000, arms fail_loc 0x1619 on the node that
# runs this script (plain $LCTL, not do_facet), and then expects a read of a0
# to fail. Per the echo text the client then offers a wrong parent FID.
2345 check_mount_and_prep
2346 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2348 echo "foo" > $DIR/$tdir/a0
2349 echo "guard" > $DIR/$tdir/a1
2350 cancel_lru_locks osc
2352 echo "Inject failure, then client will offer wrong parent FID when read"
2353 do_facet ost1 $LCTL set_param -n \
2354 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2355 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2356 $LCTL set_param fail_loc=0x1619
# A successful cat is the failure case here: error is called only if the
# read succeeds.
2358 echo "Read RPC with wrong parent FID should be denied"
2359 cat $DIR/$tdir/a0 && error "(3) Read should be denied!"
2360 $LCTL set_param fail_loc=0
2362 run_test 19a "OST-object inconsistency self detect"
# Test 19b; the run_test call that closes this block registers it.
# Writes f0 while fail_loc 0x1611 is armed on ost1 (per the echo text this
# makes the OST-object point back to a non-existent MDT-object). The
# 'repaired' counter read from lfsck_verify_pfid must be 0 at first. After
# lfsck_verify_pfid is set to 1 and f0 is read successfully, the counter
# must be 1.
2365 check_mount_and_prep
2366 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2368 echo "Inject failure stub to make the OST-object to back point to"
2369 echo "non-exist MDT-object"
2371 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2372 do_facet ost1 $LCTL set_param fail_loc=0x1611
2373 echo "foo" > $DIR/$tdir/f0
2374 cancel_lru_locks osc
2375 do_facet ost1 $LCTL set_param fail_loc=0
# Baseline: nothing has been repaired yet.
2377 echo "Nothing should be fixed since self detect and repair is disabled"
2378 local repaired=$(do_facet ost1 $LCTL get_param -n \
2379 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2380 awk '/^repaired/ { print $2 }')
2381 [ $repaired -eq 0 ] ||
2382 error "(1) Expected 0 repaired, but got $repaired"
2384 echo "Read RPC with right parent FID should be accepted,"
2385 echo "and cause parent FID on OST to be fixed"
# Turn the verification on, then read f0; the read must succeed.
2387 do_facet ost1 $LCTL set_param -n \
2388 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2389 cat $DIR/$tdir/f0 || error "(2) Read should not be denied!"
# The read above is expected to have triggered exactly one repair.
2391 repaired=$(do_facet ost1 $LCTL get_param -n \
2392 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2393 awk '/^repaired/ { print $2 }')
2394 [ $repaired -eq 1 ] ||
2395 error "(3) Expected 1 repaired, but got $repaired"
2397 run_test 19b "OST-object inconsistency self repair"
# Test 20; the run_test call that closes this block registers it.
# Needs at least 2 OSTs, otherwise the test is skipped.
# Outline of the visible steps:
#   1. Create f0, f1, f2 (and f3 when OSTCOUNT > 2) in a directory striped
#      over 3 OSTs (2 OSTs when only two exist), 1M stripe size.
#   2. Remove them under fail_locs that, per the echo text, simulate the loss
#      of the MDT-object (f0) or of the MDT-object plus one OST-object
#      (f1: object0, f2: object1, f3: object2).
#   3. Unmount the client, run layout LFSCK with fail_loc 0x161b armed on
#      ost1, and wait for 'completed' on all MDTs and OSTs.
#   4. Check repaired_orphan on mds1 (9 with more than 2 OSTs, otherwise 4).
#   5. Remount and verify each ${fidN}-R-0 file under
#      .lustre/lost+found/MDT0000: hole flag in the pattern, stripe count,
#      size, and which read/write/chown/touch/unlink operations succeed.
2400 [ $OSTCOUNT -lt 2 ] &&
2401 skip "The test needs at least 2 OSTs" && return
2404 echo "The target MDT-object and some of its OST-object are lost."
2405 echo "The LFSCK should find out the left OST-objects and re-create"
2406 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2407 echo "with the partial OST-objects (LOV EA hole)."
2409 echo "New client can access the file with LOV EA hole via normal"
2410 echo "system tools or commands without crash the system."
2412 echo "For old client, even though it cannot access the file with"
2413 echo "LOV EA hole, it should not cause the system crash."
2416 check_mount_and_prep
2417 $LFS mkdir -i 0 $DIR/$tdir/a1
# Three stripes when more than two OSTs exist, otherwise two stripes.
2418 if [ $OSTCOUNT -gt 2 ]; then
2419 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
2422 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
# NOTE(review): bcount must already be set when the dd commands below run -
# confirm where it is assigned. The comments that follow describe how the
# 4096-byte blocks are meant to be spread over the stripes.
2426 # 256 blocks on the stripe0.
2427 # 1 block on the stripe1 for 2 OSTs case.
2428 # 256 blocks on the stripe1 for other cases.
2429 # 1 block on the stripe2 if OSTs > 2
2430 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2431 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2432 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# The FIDs are kept because the lost+found names checked later are built
# from them (${fidN}-R-0).
2434 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2435 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2436 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2439 $LFS getstripe $DIR/$tdir/a1/f0
2441 $LFS getstripe $DIR/$tdir/a1/f1
2443 $LFS getstripe $DIR/$tdir/a1/f2
2445 if [ $OSTCOUNT -gt 2 ]; then
2446 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
2447 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2449 $LFS getstripe $DIR/$tdir/a1/f3
2452 cancel_lru_locks osc
2454 echo "Inject failure..."
2455 echo "To simulate f0 lost MDT-object"
2456 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2457 do_facet mds1 $LCTL set_param fail_loc=0x1616
2458 rm -f $DIR/$tdir/a1/f0
# fail_loc 0x161a stays armed for f1, f2 and f3; only fail_val is changed in
# between. NOTE(review): fail_val presumably selects the stripe index whose
# OST-object is dropped (the echo text names object0/1/2) - confirm.
2460 echo "To simulate f1 lost MDT-object and OST-object0"
2461 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
2462 do_facet mds1 $LCTL set_param fail_loc=0x161a
2463 rm -f $DIR/$tdir/a1/f1
2465 echo "To simulate f2 lost MDT-object and OST-object1"
2466 do_facet mds1 $LCTL set_param fail_val=1
2467 rm -f $DIR/$tdir/a1/f2
2469 if [ $OSTCOUNT -gt 2 ]; then
2470 echo "To simulate f3 lost MDT-object and OST-object2"
2471 do_facet mds1 $LCTL set_param fail_val=2
2472 rm -f $DIR/$tdir/a1/f3
2475 umount_client $MOUNT
2478 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2480 echo "Inject failure to slow down the LFSCK on OST0"
2481 #define OBD_FAIL_LFSCK_DELAY5 0x161b
2482 do_facet ost1 $LCTL set_param fail_loc=0x161b
2484 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2485 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2488 do_facet ost1 $LCTL set_param fail_loc=0
# Every MDT must report 'completed'; 32 is the last argument passed to
# wait_update_facet here (presumably the timeout in seconds - confirm).
2490 for k in $(seq $MDSCOUNT); do
2491 # The LFSCK status query internal is 30 seconds. For the case
2492 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2493 # time to guarantee the status sync up.
2494 wait_update_facet mds${k} "$LCTL get_param -n \
2495 mdd.$(facet_svc mds${k}).lfsck_layout |
2496 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2497 error "(2) MDS${k} is not the expected 'completed'"
# OSTs are checked once, without waiting: each must already be 'completed'.
2500 for k in $(seq $OSTCOUNT); do
2501 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2502 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2503 awk '/^status/ { print $2 }')
2504 [ "$cur_status" == "completed" ] ||
2505 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan on mds1: 9 with more than two OSTs, otherwise 4.
2508 local repaired=$(do_facet mds1 $LCTL get_param -n \
2509 mdd.$(facet_svc mds1).lfsck_layout |
2510 awk '/^repaired_orphan/ { print $2 }')
2511 if [ $OSTCOUNT -gt 2 ]; then
2512 [ $repaired -eq 9 ] ||
2513 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
2515 [ $repaired -eq 4 ] ||
2516 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
2519 mount_client $MOUNT || error "(5.0) Fail to start client!"
# Bit that is AND-ed with the pattern printed by 'lfs getstripe -L' to tell
# whether the layout is flagged as having a hole.
2521 LOV_PATTERN_F_HOLE=0x40000000
#
# --- f0: only the MDT-object was lost, so no hole is expected and every
# --- operation below must succeed.
2524 # ${fid0}-R-0 is the old f0
2526 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
2527 echo "Check $name, which is the old f0"
2529 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
# The getstripe -L output gets a 0x prefix so that the arithmetic expansion
# below parses it as a hexadecimal number.
2531 local pattern=0x$($LFS getstripe -L $name)
2532 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2533 error "(5.2) NOT expect pattern flag hole, but got $pattern"
2535 local stripes=$($LFS getstripe -c $name)
2536 if [ $OSTCOUNT -gt 2 ]; then
2537 [ $stripes -eq 3 ] ||
2538 error "(5.3.1) expect the stripe count is 3, but got $stripes"
2540 [ $stripes -eq 2 ] ||
2541 error "(5.3.2) expect the stripe count is 2, but got $stripes"
2544 local size=$(stat $name | awk '/Size:/ { print $2 }')
2545 [ $size -eq $((4096 * $bcount)) ] ||
2546 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
2548 cat $name > /dev/null || error "(5.5) cannot read $name"
2550 echo "dummy" >> $name || error "(5.6) cannot write $name"
2552 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
2554 touch $name || error "(5.8) cannot touch $name"
2556 rm -f $name || error "(5.9) cannot unlink $name"
#
# --- f1: OST-object0 was lost, so the hole flag is expected and I/O that
# --- touches the first stripe must fail.
2559 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
2561 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2562 if [ $OSTCOUNT -gt 2 ]; then
2563 echo "Check $name, it contains the old f1's stripe1 and stripe2"
2565 echo "Check $name, it contains the old f1's stripe1"
2568 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
2570 pattern=0x$($LFS getstripe -L $name)
2571 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2572 error "(6.2) expect pattern flag hole, but got $pattern"
2574 stripes=$($LFS getstripe -c $name)
2575 if [ $OSTCOUNT -gt 2 ]; then
2576 [ $stripes -eq 3 ] ||
2577 error "(6.3.1) expect the stripe count is 3, but got $stripes"
2579 [ $stripes -eq 2 ] ||
2580 error "(6.3.2) expect the stripe count is 2, but got $stripes"
2583 size=$(stat $name | awk '/Size:/ { print $2 }')
2584 [ $size -eq $((4096 * $bcount)) ] ||
2585 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
2587 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# Count the "Input/output error" lines printed by dd. conv=noerror lets dd
# continue after a failed read and conv=sync pads the failed block, which is
# why the dump is still expected to have the full size afterwards.
2589 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2590 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2593 [ $failures -eq 256 ] ||
2594 error "(6.6) expect 256 IO failures, but get $failures"
2596 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2597 [ $size -eq $((4096 * $bcount)) ] ||
2598 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Block 0 lies in the missing first stripe, so this write must fail.
2600 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
2601 error "(6.8) write to the LOV EA hole should fail"
# seek=300 is byte offset 1228800, i.e. past the first 1M stripe unit, so
# this write lands outside the missing first stripe and must succeed.
2603 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
2604 error "(6.9) write to normal stripe should NOT fail"
2606 echo "foo" >> $name && error "(6.10) append write $name should fail"
2608 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
2610 touch $name || error "(6.12) cannot touch $name"
2612 rm -f $name || error "(6.13) cannot unlink $name"
#
# --- f2: OST-object1 was lost. With more than two OSTs a hole sits between
# --- stripe0 and stripe2; with two OSTs only stripe0 is left and the file is
# --- expected to look like a plain one-stripe file.
2615 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
2617 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
2618 if [ $OSTCOUNT -gt 2 ]; then
2619 echo "Check $name, it contains the old f2's stripe0 and stripe2"
2621 echo "Check $name, it contains the old f2's stripe0"
2624 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
2626 pattern=0x$($LFS getstripe -L $name)
2627 stripes=$($LFS getstripe -c $name)
2628 size=$(stat $name | awk '/Size:/ { print $2 }')
2629 if [ $OSTCOUNT -gt 2 ]; then
2630 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2631 error "(7.2.1) expect pattern flag hole, but got $pattern"
2633 [ $stripes -eq 3 ] ||
2634 error "(7.3.1) expect the stripe count is 3, but got $stripes"
2636 [ $size -eq $((4096 * $bcount)) ] ||
2637 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
2639 cat $name > /dev/null &&
2640 error "(7.5.1) normal read $name should fail"
2642 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2643 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2645 [ $failures -eq 256 ] ||
2646 error "(7.6) expect 256 IO failures, but get $failures"
2648 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2649 [ $size -eq $((4096 * $bcount)) ] ||
2650 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
# Here the hole is the second stripe: the write at seek=300 (byte offset
# 1228800) must fail, while the write at block 0 must succeed.
2652 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
2653 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
2655 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
2656 error "(7.8.1) write to normal stripe should NOT fail"
2658 echo "foo" >> $name &&
2659 error "(7.8.3) append write $name should fail"
2661 chown $RUNAS_ID:$RUNAS_GID $name ||
2662 error "(7.9.1) cannot chown on $name"
2664 touch $name || error "(7.10.1) cannot touch $name"
2666 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2667 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
2669 [ $stripes -eq 1 ] ||
2670 error "(7.3.2) expect the stripe count is 1, but got $stripes"
2673 [ $size -eq $((4096 * (256 + 0))) ] ||
2674 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
2676 cat $name > /dev/null || error "(7.5.2) cannot read $name"
2678 echo "dummy" >> $name || error "(7.8.2) cannot write $name"
2680 chown $RUNAS_ID:$RUNAS_GID $name ||
2681 error "(7.9.2) cannot chown on $name"
2683 touch $name || error "(7.10.2) cannot touch $name"
2686 rm -f $name || error "(7.11) cannot unlink $name"
# f3 is only created when more than two OSTs exist, so the test stops here
# otherwise.
2688 [ $OSTCOUNT -le 2 ] && return
#
# --- f3: OST-object2 (the last stripe) was lost; the re-created file is
# --- expected to be a hole-free two-stripe file of 512 blocks.
2691 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
2693 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2694 echo "Check $name, which contains the old f3's stripe0 and stripe1"
2696 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
2698 pattern=0x$($LFS getstripe -L $name)
2699 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2700 error "(8.2) NOT expect pattern flag hole, but got $pattern"
2702 stripes=$($LFS getstripe -c $name)
2703 # LFSCK does not know the old f3 had 3 stripes.
2704 # It only tries to find as much as possible.
2705 # The stripe count depends on the last stripe's offset.
2706 [ $stripes -eq 2 ] ||
2707 error "(8.3) expect the stripe count is 2, but got $stripes"
2709 size=$(stat $name | awk '/Size:/ { print $2 }')
2711 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
2712 error "(8.4) expect the size $((4096 * 512)), but got $size"
2714 cat $name > /dev/null || error "(8.5) cannot read $name"
2716 echo "dummy" >> $name || error "(8.6) cannot write $name"
2718 chown $RUNAS_ID:$RUNAS_GID $name ||
2719 error "(8.7) cannot chown on $name"
2721 touch $name || error "(8.8) cannot touch $name"
2723 rm -f $name || error "(8.9) cannot unlink $name"
2725 run_test 20 "Handle the orphan with dummy LOV EA slot properly"
# Test 21; the run_test call that closes this block registers it.
# Skipped when the MDS version is older than 2.5.59.
# Starts LFSCK on MDT0000 without a -t option and checks that both the
# namespace and the layout component report 'scanning-phase1', then stops
# LFSCK again, also without naming a component.
2728 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
2729 skip "ignore the test if MDS is older than 2.5.59" && return
2731 check_mount_and_prep
2732 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
# NOTE(review): "-s 1" is presumably a speed limit that keeps the scan in
# phase 1 long enough for the two status checks below - confirm.
2734 echo "Start all LFSCK components by default (-s 1)"
2735 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
2736 error "Fail to start LFSCK"
2738 echo "namespace LFSCK should be in 'scanning-phase1' status"
2739 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
2740 [ "$STATUS" == "scanning-phase1" ] ||
2741 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
2743 echo "layout LFSCK should be in 'scanning-phase1' status"
2744 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
2745 [ "$STATUS" == "scanning-phase1" ] ||
2746 error "Expect layout 'scanning-phase1', but got '$STATUS'"
2748 echo "Stop all LFSCK components by default"
2749 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
2750 error "Fail to stop LFSCK"
2752 run_test 21 "run all LFSCK components by default"
# Test 22a; the run_test call that closes this block registers it.
# Needs at least 2 MDSes, otherwise the test is skipped.
# Creates foo/dummy on MDT0 while fail_loc 0x161e is armed (per the echo
# text, dummy's ".." entry then references "guard"), removes guard, runs
# namespace LFSCK, and expects unmatched_pairs_repaired to be 1 and 'ls' on
# foo/dummy to succeed.
2755 [ $MDSCOUNT -lt 2 ] &&
2756 skip "We need at least 2 MDSes for this test" && return
# NOTE(review): the inner double quotes around .. in the two echo lines
# below end and reopen the string, so the message is printed as
# .. name entry (without quote characters).
2759 echo "The parent_A references the child directory via some name entry,"
2760 echo "but the child directory back references another parent_B via its"
2761 echo "".." name entry. The parent_A does not exist. Then the namesapce"
2762 echo "LFSCK will repair the child directory's ".." name entry."
2765 check_mount_and_prep
2767 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2768 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2770 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2771 echo "The dummy's dotdot name entry references the guard."
2772 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2773 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2774 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2775 error "(3) Fail to mkdir on MDT0"
2776 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Remove the directory that dummy's ".." entry is said to reference.
2778 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
2780 echo "Trigger namespace LFSCK to repair unmatched pairs"
2781 $START_NAMESPACE -A -r ||
2782 error "(5) Fail to start LFSCK for namespace"
# Wait for $SINGLEMDS to report 'completed'; 32 is the last argument passed
# to wait_update_facet (presumably the timeout in seconds - confirm).
2784 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2785 mdd.${MDT_DEV}.lfsck_namespace |
2786 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2788 error "(6) unexpected status"
2791 local repaired=$($SHOW_NAMESPACE |
2792 awk '/^unmatched_pairs_repaired/ { print $2 }')
2793 [ $repaired -eq 1 ] ||
2794 error "(7) Fail to repair unmatched pairs: $repaired"
2796 echo "'ls' should success after namespace LFSCK repairing"
2797 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
2798 error "(8) ls should success."
2800 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Test 22b; the run_test call that closes this block registers it.
# Needs at least 2 MDSes, otherwise the test is skipped.
# Creates foo/dummy on MDT0 while fail_loc 0x161f is armed (per the echo
# text this yields a bad ".." entry and a bad linkEA). Before LFSCK,
# 'lfs fid2path' on dummy's FID must NOT return foo/dummy. After namespace
# LFSCK, unmatched_pairs_repaired must be 1 and fid2path must return
# foo/dummy.
2803 [ $MDSCOUNT -lt 2 ] &&
2804 skip "We need at least 2 MDSes for this test" && return
# NOTE(review): the inner double quotes around .. in the echo lines below
# end and reopen the string, so the message is printed as .. name entry
# (without quote characters).
2807 echo "The parent_A references the child directory via the name entry_B,"
2808 echo "but the child directory back references another parent_C via its"
2809 echo "".." name entry. The parent_C exists, but there is no the name"
2810 echo "entry_B under the parent_B. Then the namesapce LFSCK will repair"
2811 echo "the child directory's ".." name entry and its linkEA."
2814 check_mount_and_prep
2816 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2817 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2819 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2820 echo "and bad linkEA. The dummy's dotdot name entry references the"
2821 echo "guard. The dummy's linkEA references n non-exist name entry."
2822 #define OBD_FAIL_LFSCK_BAD_PARENT2 0x161f
2823 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161f
2824 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2825 error "(3) Fail to mkdir on MDT0"
2826 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Negative check before the repair: the FID must not resolve to foo/dummy.
2828 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
2829 echo "fid2path should NOT work on the dummy's FID $dummyfid"
2830 local dummyname=$($LFS fid2path $DIR $dummyfid)
2831 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
2832 error "(4) fid2path works unexpectedly."
2834 echo "Trigger namespace LFSCK to repair unmatched pairs"
2835 $START_NAMESPACE -A -r ||
2836 error "(5) Fail to start LFSCK for namespace"
# Wait for $SINGLEMDS to report 'completed'; 32 is the last argument passed
# to wait_update_facet (presumably the timeout in seconds - confirm).
2838 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2839 mdd.${MDT_DEV}.lfsck_namespace |
2840 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2842 error "(6) unexpected status"
2845 local repaired=$($SHOW_NAMESPACE |
2846 awk '/^unmatched_pairs_repaired/ { print $2 }')
2847 [ $repaired -eq 1 ] ||
2848 error "(7) Fail to repair unmatched pairs: $repaired"
# Positive check after the repair: the same FID must now resolve to foo/dummy.
2850 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
2851 local dummyname=$($LFS fid2path $DIR $dummyfid)
2852 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
2853 error "(8) fid2path does not work"
2855 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a; the run_test call that closes this block registers it.
# Needs at least 2 MDSes, otherwise the test is skipped.
# Removes d0/d1 (created on MDT1) while fail_loc 0x1620 is armed on mds2,
# which per the echo text leaves a dangling name entry. Two namespace LFSCK
# runs follow:
#   - without -C: dangling_repaired must be 1, but 'ls' on d0/d1 must still
#     fail (the echo text says the MDT-object is not re-created by default);
#   - with -C: dangling_repaired must be 1 and 'ls' on d0/d1 must succeed.
2858 [ $MDSCOUNT -lt 2 ] &&
2859 skip "We need at least 2 MDSes for this test" && return
2862 echo "The name entry is there, but the MDT-object for such name "
2863 echo "entry does not exist. The namespace LFSCK should find out "
2864 echo "and repair the inconsistency as required."
2867 check_mount_and_prep
2869 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
2870 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
2872 echo "Inject failure stub on MDT1 to simulate dangling name entry"
2873 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
2874 do_facet mds2 $LCTL set_param fail_loc=0x1620
2875 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
2876 do_facet mds2 $LCTL set_param fail_loc=0
# A successful ls is the failure case here: error is called only if ls
# succeeds.
2878 echo "'ls' should fail because of dangling name entry"
2879 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
# First run: started without -C.
2881 echo "Trigger namespace LFSCK to find out dangling name entry"
2882 $START_NAMESPACE -A -r ||
2883 error "(5) Fail to start LFSCK for namespace"
# Wait for $SINGLEMDS to report 'completed'; 32 is the last argument passed
# to wait_update_facet (presumably the timeout in seconds - confirm).
2885 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2886 mdd.${MDT_DEV}.lfsck_namespace |
2887 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2889 error "(6) unexpected status"
2892 local repaired=$($SHOW_NAMESPACE |
2893 awk '/^dangling_repaired/ { print $2 }')
2894 [ $repaired -eq 1 ] ||
2895 error "(7) Fail to repair dangling name entry: $repaired"
2897 echo "'ls' should fail because not re-create MDT-object by default"
2898 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
# Second run: -C is added to the same start command.
2900 echo "Trigger namespace LFSCK again to repair dangling name entry"
2901 $START_NAMESPACE -A -r -C ||
2902 error "(9) Fail to start LFSCK for namespace"
2904 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2905 mdd.${MDT_DEV}.lfsck_namespace |
2906 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2908 error "(10) unexpected status"
2911 repaired=$($SHOW_NAMESPACE |
2912 awk '/^dangling_repaired/ { print $2 }')
2913 [ $repaired -eq 1 ] ||
2914 error "(11) Fail to repair dangling name entry: $repaired"
2916 echo "'ls' should success after namespace LFSCK repairing"
2917 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
2919 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b (registered by the run_test call at the end of this block).
# A hard link "foo" to f0 is created while fail_loc=0x1621
# (OBD_FAIL_LFSCK_DANGLING3) is set on $SINGLEMDS; 'ls' of foo is then
# expected to fail. Namespace LFSCK is started with "-r -C" and the test
# expects dangling_repaired == 1, multiple_linked_repaired == 1, and foo to
# read back "dummy" (the content written to f0).
# NOTE(review): no function header or closing braces are visible in this
# listing (the "|| {" group below is never closed) -- confirm against the
# complete file before editing.
2923 echo "The objectA has multiple hard links, one of them corresponding"
2924 echo "to the name entry_B. But there is something wrong for the name"
2925 echo "entry_B and cause entry_B to references non-exist object_C."
2926 echo "In the first-stage scanning, the LFSCK will think the entry_B"
2927 echo "as dangling, and re-create the lost object_C. When the LFSCK"
2928 echo "comes to the second-stage scanning, it will find that the"
2929 echo "former re-creating object_C is not proper, and will try to"
2930 echo "replace the object_C with the real object_A."
2933 check_mount_and_prep
# Fixture: directory d0 with two regular files holding distinct contents.
2935 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
2936 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
2937 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
2939 echo "Inject failure stub on MDT0 to simulate dangling name entry"
2940 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
# The fail_loc is only active around the 'ln' and is cleared right after.
2941 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
2942 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
2943 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2945 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
2947 echo "'ls' should fail because of dangling name entry"
2948 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
2949 error "(6) ls should fail."
2951 echo "Trigger namespace LFSCK to find out dangling name entry"
2952 $START_NAMESPACE -r -C ||
2953 error "(7) Fail to start LFSCK for namespace"
# Poll the lfsck_namespace "status" field on $SINGLEMDS until it reads
# "completed"; the trailing 32 is passed through to wait_update_facet
# (presumably a timeout in seconds -- confirm in test-framework.sh).
2955 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2956 mdd.${MDT_DEV}.lfsck_namespace |
2957 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2959 error "(8) unexpected status"
# Exactly one dangling entry must have been repaired ...
2962 local repaired=$($SHOW_NAMESPACE |
2963 awk '/^dangling_repaired/ { print $2 }')
2964 [ $repaired -eq 1 ] ||
2965 error "(9) Fail to repair dangling name entry: $repaired"
# ... and exactly one multiple-linked repair must have been counted.
2967 repaired=$($SHOW_NAMESPACE |
2968 awk '/^multiple_linked_repaired/ { print $2 }')
2969 [ $repaired -eq 1 ] ||
2970 error "(10) Fail to drop the former created object: $repaired"
# foo must now expose f0's content again.
2972 local data=$(cat $DIR/$tdir/d0/foo)
2973 [ "$data" == "dummy" ] ||
2974 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
2976 run_test 23b "LFSCK can repair dangling name entry (2)"
# test_23c: a dangling name entry is re-created by the LFSCK, the re-created
# object is then modified by a client, and the test expects that the object
# is NOT replaced afterwards (its content must differ from "dummy").
#
# fail_loc 0x1621 on $SINGLEMDS injects the dangling entry while "foo" is
# hard-linked; fail_loc 0x1602 with fail_val=10 is set before starting LFSCK
# and cleared only after the client has appended to the re-created object.
test_23c() {
	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. And then others"
	echo "modified the re-created object_C. When the LFSCK comes to the"
	echo "second-stage scanning, it will find that the former re-creating"
	echo "object_C maybe wrong and try to replace the object_C with the"
	echo "real object_A. But because object_C has been modified, so the"
	echo "LFSCK cannot replace it."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
	echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
	echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
	ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"

	echo "'ls' should fail because of dangling name entry"
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(6) ls should fail."

	#define OBD_FAIL_LFSCK_DELAY3		0x1602
	do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -r -C ||
		error "(7) Fail to start LFSCK for namespace"

	# Wait until the client sees foo with size 0. On failure, dump the
	# object actually being polled (this test never creates a "guard"
	# entry) plus the LFSCK state, then fail.
	wait_update_facet client "stat $DIR/$tdir/d0/foo |
		awk '/Size/ { print \\\$2 }'" "0" 32 || {
		stat $DIR/$tdir/d0/foo
		$SHOW_NAMESPACE
		error "(8) unexpected size"
	}

	# Modify the re-created object before LFSCK is allowed to continue.
	echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
	cancel_lru_locks osc

	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		$SHOW_NAMESPACE
		error "(10) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^dangling_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(11) Fail to repair dangling name entry: $repaired"

	local data=$(cat $DIR/$tdir/d0/foo)
	[ "$data" != "dummy" ] ||
		error "(12) The $DIR/$tdir/d0/foo should not be recovered"
}
run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24 (registered by the run_test call at the end of this block).
# Skipped when $MDSCOUNT is below 2. The test creates d0 with two
# sub-directories (guard, dummy) and guard/foo, then creates and removes
# dummy/foo while fail_loc=0x1622 (OBD_FAIL_LFSCK_MUL_REF) is set on
# $SINGLEMDS. After a namespace LFSCK ("-A -r") it expects
# multiple_referenced_repaired == 1 and an entry named "$cfid-$pfid-D-0"
# under $MOUNT/.lustre/lost+found/MDT0000/.
# NOTE(review): no function header or closing braces are visible in this
# listing -- confirm against the complete file before editing.
3047 [ $MDSCOUNT -lt 2 ] &&
3048 skip "We need at least 2 MDSes for this test" && return
3051 echo "Two MDT-objects back reference the same name entry via their"
3052 echo "each own linkEA entry, but the name entry only references one"
3053 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
3054 echo "for the MDT-object that is not recognized. If such MDT-object"
3055 echo "has no other linkEA entry after the removing, then the LFSCK"
3056 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
3059 check_mount_and_prep
3061 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
# NOTE(review): the next error label repeats "(1)".
3063 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
# The bare path2fid calls only print the FID into the test log.
3064 $LFS path2fid $DIR/$tdir/d0/guard
3066 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
3067 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid: FID of the parent directory "dummy", used later to build the
# expected orphan name.
3068 local pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
3070 touch $DIR/$tdir/d0/guard/foo ||
3071 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
3073 echo "Inject failure stub on MDT0 to simulate the case that"
3074 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
3075 echo "that references $DIR/$tdir/d0/guard/foo."
3076 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
3077 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
3078 echo "there with the same linkEA entry as another MDT-object"
3079 echo "$DIR/$tdir/d0/guard/foo has"
3081 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
# The fail_loc stays set across both the mkdir and the rmdir of dummy/foo.
3082 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
3083 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
3084 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
# cfid: FID of dummy/foo, captured before its name entry is removed.
3085 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
3086 rmdir $DIR/$tdir/d0/dummy/foo ||
3087 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
3088 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3090 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
3091 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
3092 error "(6) stat successfully unexpectedly"
3094 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
3095 $START_NAMESPACE -A -r ||
3096 error "(7) Fail to start LFSCK for namespace"
# Poll the lfsck_namespace "status" field until it reads "completed".
3098 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3099 mdd.${MDT_DEV}.lfsck_namespace |
3100 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3102 error "(8) unexpected status"
3105 local repaired=$($SHOW_NAMESPACE |
3106 awk '/^multiple_referenced_repaired/ { print $2 }')
3107 [ $repaired -eq 1 ] ||
3108 error "(9) Fail to repair multiple referenced name entry: $repaired"
3110 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3111 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3112 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Expected orphan name is built from the child FID and the parent FID.
3114 local cname="$cfid-$pfid-D-0"
3115 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
3116 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
3118 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25 (registered by the run_test call at the end of this block).
# Skipped unless the backing filesystem of $SINGLEMDS is ldiskfs. A file
# "foo" is created while fail_loc=0x1623 (OBD_FAIL_LFSCK_BAD_TYPE) is set on
# $SINGLEMDS; after a namespace LFSCK ("-r") the test expects
# bad_file_type_repaired == 1 and a working 'ls' of d0.
# NOTE(review): no function header or closing braces are visible in this
# listing -- confirm against the complete file before editing.
3121 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3122 skip "Only support to inject failure on ldiskfs" && return
3125 echo "The file type in the name entry does not match the file type"
3126 echo "claimed by the referenced object. Then the LFSCK will update"
3127 echo "the file type in the name entry."
3130 check_mount_and_prep
3132 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3134 echo "Inject failure stub on MDT0 to simulate the case that"
3135 echo "the file type stored in the name entry is wrong."
3137 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
# The fail_loc is active only while foo is being created.
3138 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
3139 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
3140 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3142 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
3143 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Poll the lfsck_namespace "status" field until it reads "completed".
3145 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3146 mdd.${MDT_DEV}.lfsck_namespace |
3147 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3149 error "(4) unexpected status"
3152 local repaired=$($SHOW_NAMESPACE |
3153 awk '/^bad_file_type_repaired/ { print $2 }')
3154 [ $repaired -eq 1 ] ||
3155 error "(5) Fail to repair bad file type in name entry: $repaired"
3157 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
3159 run_test 25 "LFSCK can repair bad file type in the name entry"
# test_26a: the local name entry of a hard-linked file is removed while its
# object and linkEA are kept (fail_loc 0x1624 on $SINGLEMDS). After a
# namespace LFSCK the test expects lost_dirent_repaired == 1, the name "foo"
# to be visible again, and its FID to be unchanged.
test_26a() {
	echo "The local name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing local name entry back"
	echo "to the normal namespace."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	# Remember the FID so it can be compared after the repair.
	local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)

	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# A successful 'ls' here means the name entry was not removed.
	# Without the "error" keyword the message itself would be executed
	# as a command and the check could never fail the test.
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(5) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		$SHOW_NAMESPACE
		error "(7) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $DIR/$tdir/d0/foo ||
		error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"

	local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
	[ "$foofid" == "$foofid2" ] ||
		error "(10) foo's FID changed: $foofid, $foofid2"
}
run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# test_26b: like 26a, but for a directory "foo" created on MDT index 0
# underneath a parent created on MDT index 1. The name entry is removed
# under fail_loc 0x1624; after a namespace LFSCK the test expects
# lost_dirent_repaired == 1, "foo" to be visible again, and its FID to be
# unchanged. Skipped when fewer than 2 MDSes are configured.
test_26b() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "We need at least 2 MDSes for this test" && return

	echo "The remote name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing remote name entry back"
	echo "to the normal namespace."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
	# Remember the FID so it can be compared after the repair.
	local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# A successful 'ls' here means the name entry was not removed.
	# Without the "error" keyword the message itself would be executed
	# as a command and the check could never fail the test.
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(4) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		$SHOW_NAMESPACE
		error "(6) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair lost dirent: $repaired"

	ls -ail $DIR/$tdir/d0/foo ||
		error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"

	local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
	[ "$foofid" == "$foofid2" ] ||
		error "(9) foo's FID changed: $foofid, $foofid2"
}
run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# test_27a: both name entries of a hard-linked file are removed under
# fail_loc 0x1624 on $SINGLEMDS and the parent directory d0 is then deleted.
# After a namespace LFSCK the test expects lost_dirent_repaired == 1 and an
# entry matching "*-P-*" under $MOUNT/.lustre/lost+found/MDT0000/.
test_27a() {
	echo "The local parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "another hard link and the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo ||
		error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	rm -f $DIR/$tdir/d0/dummy ||
		error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
	# A successful 'ls' here means d0 was not removed. Without the
	# "error" keyword the message itself would be executed as a command
	# and the check could never fail the test.
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 &&
		error "(6) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the lost parent"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		$SHOW_NAMESPACE
		error "(7) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	# The pattern is quoted so that find, not the shell, expands it; an
	# unquoted pattern would be replaced by any matching file name in the
	# current working directory.
	local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
	[ -n "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# test_27b: directory "foo" (MDT index 0) lives under parent d0 (MDT index
# 1). foo's name entry is removed under fail_loc 0x1624 on $SINGLEMDS and d0
# is then deleted. After a namespace LFSCK the test expects
# lost_dirent_repaired == 1 and an entry matching "*-P-*" under
# $MOUNT/.lustre/lost+found/MDT0001/. Skipped with fewer than 2 MDSes.
test_27b() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "We need at least 2 MDSes for this test" && return

	echo "The remote parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"

	# Print the parent FID into the test log.
	$LFS path2fid $DIR/$tdir/d0

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
	# A successful 'ls' here means d0 was not removed. Without the
	# "error" keyword the message itself would be executed as a command
	# and the check could never fail the test.
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 &&
		error "(5) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		$SHOW_NAMESPACE
		error "(7) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "There should be an orphan under .lustre/lost+found/MDT0001/"
	[ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0001/

	# The pattern is quoted so that find, not the shell, expands it.
	local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
	[ -n "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0001/ should not be empty"
}
run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# test_28: two cross-MDT name entries (d1/a1 and d2/a2) are removed under
# fail_loc 0x1624 on mds1 and mds2. A first namespace LFSCK runs while
# fail_loc 0x161c is set on mds2; the test then expects status "partial" on
# mds1 with 0 lost dirents repaired, and "completed" on mds2 with 1
# repaired. A second run with the fail_loc cleared must complete on every
# MDS, with 1 repaired on mds1 and 0 on mds2.
# Skipped with fewer than 2 MDTs.
test_28() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "The target name entry is lost. The LFSCK should insert the"
	echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
	echo "the MDT (on which the orphan MDT-object resides) has ever"
	echo "failed to respond some name entry verification during the"
	echo "first stage-scanning, then the LFSCK should skip to handle"
	echo "orphan MDT-object on this MDT. But other MDTs should not"
	echo "be affected."

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/d1
	$LFS mkdir -i 1 $DIR/$tdir/d1/a1
	$LFS mkdir -i 1 $DIR/$tdir/d1/a2

	$LFS mkdir -i 1 $DIR/$tdir/d2
	$LFS mkdir -i 0 $DIR/$tdir/d2/a1
	$LFS mkdir -i 0 $DIR/$tdir/d2/a2

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "d1/a1's name entry will be removed, but the d1/a1's object"
	echo "and its linkEA are kept in the system. And the case that"
	echo "d2/a2's name entry will be removed, but the d2/a2's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet mds1 $LCTL set_param fail_loc=0x1624
	do_facet mds2 $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
	rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
	do_facet mds1 $LCTL set_param fail_loc=0
	do_facet mds2 $LCTL set_param fail_loc=0

	cancel_lru_locks mdc
	cancel_lru_locks osc

	echo "Inject failure, to simulate the MDT0 fail to handle"
	echo "MDT1 LFSCK request during the first-stage scanning."
	#define OBD_FAIL_LFSCK_BAD_NETWORK	0x161c
	do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0

	echo "Trigger namespace LFSCK on all devices to find out orphan object"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_update_facet mds1 "$LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "partial" 32 || {
		error "(4) mds1 is not the expected 'partial'"
	}

	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(5) mds2 is not the expected 'completed'"
	}

	do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_namespace |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(6) Expect 0 fixed on mds1, but got: $repaired"

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Expect 1 fixed on mds2, but got: $repaired"

	echo "Trigger namespace LFSCK on all devices again to cleanup"
	$START_NAMESPACE -r -A ||
		error "(8) Fail to start LFSCK for namespace"

	# Keep the loop counter local so it cannot clobber a variable of the
	# same name in the calling test framework.
	local k
	for ((k = 1; k <= MDSCOUNT; k++)); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_namespace |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(9) MDS${k} is not the expected 'completed'"
	done

	# "repaired" is already local to this function; plain assignment.
	repaired=$(do_facet mds1 $LCTL get_param -n \
		   mdd.$(facet_svc mds1).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(10) Expect 1 fixed on mds1, but got: $repaired"

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(11) Expect 0 fixed on mds2, but got: $repaired"
}
run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a (registered by the run_test call at the end of this block).
# A hard link h1 to foo is created while fail_loc=0x1625
# (OBD_FAIL_LFSCK_MORE_NLINK) is set on $SINGLEMDS; the test first verifies
# that stat then reports 3 links. After a namespace LFSCK ("-r -A") it
# expects nlinks_repaired == 1 and stat to report 2 links.
# NOTE(review): no function header or closing braces are visible in this
# listing -- confirm against the complete file before editing.
3486 echo "The object's nlink attribute is larger than the object's known"
3487 echo "name entries count. The LFSCK will repair the object's nlink"
3488 echo "attribute to match the known name entries count"
3491 check_mount_and_prep
3493 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3494 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3496 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3497 echo "nlink attribute is larger than its name entries count."
3499 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
# The fail_loc is active only while the hard link is being created.
3500 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
3501 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3502 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3503 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Sanity-check the injection: two names exist but 3 links must be reported.
3505 cancel_lru_locks mdc
3506 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3507 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
3509 echo "Trigger namespace LFSCK to repair the nlink count"
3510 $START_NAMESPACE -r -A ||
3511 error "(5) Fail to start LFSCK for namespace"
# Poll the lfsck_namespace "status" field until it reads "completed".
3513 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3514 mdd.${MDT_DEV}.lfsck_namespace |
3515 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3517 error "(6) unexpected status"
3520 local repaired=$($SHOW_NAMESPACE |
3521 awk '/^nlinks_repaired/ { print $2 }')
3522 [ $repaired -eq 1 ] ||
3523 error "(7) Fail to repair nlink count: $repaired"
# After the repair the link count must match the two existing names.
3525 cancel_lru_locks mdc
3526 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3527 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3529 run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b (registered by the run_test call at the end of this block).
# A hard link h1 to foo is created while fail_loc=0x1626
# (OBD_FAIL_LFSCK_LESS_NLINK) is set on $SINGLEMDS; the test first verifies
# that stat then reports only 1 link. After a namespace LFSCK ("-r -A") it
# expects nlinks_repaired == 1 and stat to report 2 links.
# NOTE(review): no function header or closing braces are visible in this
# listing -- confirm against the complete file before editing.
3533 echo "The object's nlink attribute is smaller than the object's known"
3534 echo "name entries count. The LFSCK will repair the object's nlink"
3535 echo "attribute to match the known name entries count"
3538 check_mount_and_prep
3540 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3541 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3543 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3544 echo "nlink attribute is smaller than its name entries count."
3546 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
# The fail_loc is active only while the hard link is being created.
3547 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
3548 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3549 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3550 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Sanity-check the injection: two names exist but 1 link must be reported.
3552 cancel_lru_locks mdc
3553 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3554 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
3556 echo "Trigger namespace LFSCK to repair the nlink count"
3557 $START_NAMESPACE -r -A ||
3558 error "(5) Fail to start LFSCK for namespace"
# Poll the lfsck_namespace "status" field until it reads "completed".
3560 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3561 mdd.${MDT_DEV}.lfsck_namespace |
3562 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3564 error "(6) unexpected status"
3567 local repaired=$($SHOW_NAMESPACE |
3568 awk '/^nlinks_repaired/ { print $2 }')
3569 [ $repaired -eq 1 ] ||
3570 error "(7) Fail to repair nlink count: $repaired"
# After the repair the link count must match the two existing names.
3572 cancel_lru_locks mdc
3573 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3574 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3576 run_test 29b "LFSCK can repair bad nlink count (2)"
# test_29c: foo gets a second hard link (h2) while fail_loc 0x1627
# (OBD_FAIL_LFSCK_LINKEA_OVERFLOW) is set on $SINGLEMDS, so that stat
# reports 3 links while "lfs fid2path" lists only 2 paths. The fail_loc
# stays set while the namespace LFSCK runs; the test expects
# nlinks_repaired == 0 and both counts to be unchanged afterwards.
test_29c() {
	echo "There are too much hard links to the object, and exceeds the"
	echo "object's linkEA limitation, as to NOT all the known name entries"
	echo "will be recorded in the linkEA. Under such case, LFSCK should"
	echo "skip the nlink verification for this object."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's hard links exceed the object's linkEA limitation."

	#define OBD_FAIL_LFSCK_LINKEA_OVERFLOW	0x1627
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1627
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h2 ||
		error "(4) Fail to hard link to $DIR/$tdir/d0/foo"

	cancel_lru_locks mdc

	local count1=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ "$count1" -eq 3 ] || error "(5) Stat failure: $count1"

	local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
	# Print the known paths into the test log, then count them.
	$LFS fid2path $DIR $foofid
	local count2=$($LFS fid2path $DIR $foofid | wc -l)
	# Without the "error" keyword the message itself would be executed
	# as a command and a failed injection would go unnoticed.
	[ "$count2" -eq 2 ] || error "(6) Fail to inject error: $count2"

	echo "Trigger namespace LFSCK to repair the nlink count"
	$START_NAMESPACE -r -A ||
		error "(7) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		$SHOW_NAMESPACE
		error "(8) unexpected status"
	}

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0
	local repaired=$($SHOW_NAMESPACE |
			 awk '/^nlinks_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(9) Repair nlink count unexpcetedly: $repaired"

	cancel_lru_locks mdc

	count1=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ "$count1" -eq 3 ] || error "(10) Stat failure: $count1"

	count2=$($LFS fid2path $DIR $foofid | wc -l)
	[ "$count2" -eq 2 ] ||
		error "(11) Repaired something unexpectedly: $count2"
}
run_test 29c "Not verify nlink attr if hark links exceed linkEA limitation"
# test_30: several name entries (foo/d0/d1, foo/d0/f1, foo/d0, foo/f0) are
# removed under fail_loc 0x1624 on $SINGLEMDS, with d0 itself created under
# fail_loc 0x161d. The client is unmounted, the MDT is stopped and checked
# with e2fsck -y, then restarted. After a namespace LFSCK the test expects
# local_lost_found_moved >= 4, foo/f0 to be visible again, and a "*-*-D-*"
# entry under $MOUNT/.lustre/lost+found/MDT0000/ that contains d1 and f1.
# Skipped unless the backing filesystem of $SINGLEMDS is ldiskfs.
test_30() {
	[ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
		skip "Only support backend /lost+found for ldiskfs" && return

	echo "The namespace LFSCK will move the orphans from backend"
	echo "/lost+found directory to normal client visible namespace"
	echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
	touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f1"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "directory d0 has no linkEA entry, then the LFSCK will"
	echo "move it into .lustre/lost+found/MDTxxxx/ later."

	#define OBD_FAIL_LFSCK_NO_LINKEA	0x161d
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
	mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
	mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "object's name entry will be removed, but not destroy the"
	echo "object. Then backend e2fsck will handle it as orphan and"
	echo "add them into the backend /lost+found directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
	rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
	rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
	rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	umount_client $MOUNT || error "(10) Fail to stop client!"

	stop $SINGLEMDS || error "(11) Fail to stop MDT0"

	run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
		error "(12) Fail to run e2fsck"

	start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
		error "(13) Fail to start MDT0"

	echo "Trigger namespace LFSCK to recover backend orphans"
	$START_NAMESPACE -r -A ||
		error "(14) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		$SHOW_NAMESPACE
		error "(15) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^local_lost_found_moved/ { print $2 }')
	[ "$repaired" -ge 4 ] ||
		error "(16) Fail to recover backend orphans: $repaired"

	mount_client $MOUNT || error "(17) Fail to start client!"

	# Without the "error" keyword the message itself would be executed
	# as a command and a missing f0 would go unnoticed.
	stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	# The pattern is quoted so that find, not the shell, expands it.
	local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-*-D-*')
	[ -n "$cname" ] || error "(20) d0 is not recovered"

	stat ${cname}/d1 || error "(21) d0 is not recovered"
	stat ${cname}/f1 || error "(22) f1 is not recovered"
}
run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test_31a: $MDSCOUNT sub-directories are created under a striped directory
# while fail_loc 0x1628 (OBD_FAIL_LFSCK_BAD_NAME_HASH) with fail_val=0 is set
# on the client. After a namespace LFSCK the test expects
# name_hash_repaired >= 1 on $SINGLEMDS and, after remounting the client,
# every sub-directory and the striped directory itself to be stat-able and
# removable. Skipped with fewer than 2 MDTs.
test_31a() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-first"
	echo "shard, but inserted into the first shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=0
	createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		$SHOW_NAMESPACE
		error "(4) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^name_hash_repaired/ { print $2 }')
	[ "$repaired" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	# Remount so the checks below do not rely on client-side caches.
	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	# Keep the loop counter local so it cannot clobber a variable of the
	# same name in the calling test framework.
	local i
	for ((i = 0; i < MDSCOUNT; i++)); do
		stat $DIR/$tdir/striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $DIR/$tdir/striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $DIR/$tdir/striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test_31b: same scenario as 31a but with fail_val=1 for fail_loc 0x1628
# (OBD_FAIL_LFSCK_BAD_NAME_HASH); LFSCK status and the name_hash_repaired
# counter are therefore read from mds2. After remounting the client every
# sub-directory and the striped directory itself must be stat-able and
# removable. Skipped with fewer than 2 MDTs.
test_31b() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-second"
	echo "shard, but inserted into the second shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=1
	createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(4) unexpected status"

	local repaired=$(do_facet mds2 $LCTL get_param -n \
			 mdd.$(facet_svc mds2).lfsck_namespace |
			 awk '/^name_hash_repaired/ { print $2 }')
	[ "$repaired" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	# Remount so the checks below do not rely on client-side caches.
	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	# Keep the loop counter local so it cannot clobber a variable of the
	# same name in the calling test framework.
	local i
	for ((i = 0; i < MDSCOUNT; i++)); do
		stat $DIR/$tdir/striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $DIR/$tdir/striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $DIR/$tdir/striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test_31c: the master object of a striped directory lost its LMV EA and
# nothing was created under it afterwards.
#
# fail_loc 0x1629 is set on $SINGLEMDS while the striped directory is
# created.  A namespace LFSCK (-r -A) is then expected to report exactly one
# striped_dirs_repaired; after a client remount the directory must list as
# empty and be removable.
#
# Needs at least two MDTs; otherwise the test is skipped.
test_31c() {
	# Return explicitly so the test body never runs on a single-MDT
	# setup, whatever status skip() happens to return.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If nobody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should re-generate the master LMV EA."

	check_mount_and_prep

	local striped_dir="$DIR/$tdir/striped_dir"

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to re-generate master LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	# \\\$2 leaves a literal \$2 in the string so that awk's field
	# reference survives the extra shell evaluation on the remote node.
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state first so the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_dirs_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate master LMV EA: $repaired"

	# Remount the client before looking at the directory again.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	local empty
	empty=$(ls "$striped_dir/")
	[ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"

	rmdir "$striped_dir" ||
		error "(8) Fail to remove the striped directory after LFSCK"
}
run_test 31c "Re-generate the lost master LMV EA for striped directory"
# test_31d: the master object of a striped directory lost its LMV EA and a
# file was created under it afterwards.
#
# fail_loc 0x1629 is set on $SINGLEMDS while the striped directory is
# created; the client is remounted and a file is touched in the directory.
# A namespace LFSCK (-r -A) must then report zero striped_dirs_repaired.
# The existing file must still be stat-able, creating a new file must fail,
# and after "chattr -i" the directory must be removable.
#
# Needs at least two MDTs; otherwise the test is skipped.
test_31d() {
	# Return explicitly so the test body never runs on a single-MDT
	# setup, whatever status skip() happens to return.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If somebody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should NOT re-generate the master LMV EA, instead, it should"
	echo "change the broken striped dirctory as read-only to prevent"
	echo "further damage"

	check_mount_and_prep

	local striped_dir="$DIR/$tdir/striped_dir"

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0

	# Remount the client before modifying the broken directory.
	umount_client $MOUNT || error "(2) umount failed"
	mount_client $MOUNT || error "(3) mount failed"

	touch "$striped_dir/dummy" ||
		error "(4) Fail to touch under broken striped directory"

	echo "Trigger namespace LFSCK to find out the inconsistency"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	# \\\$2 leaves a literal \$2 in the string so that awk's field
	# reference survives the extra shell evaluation on the remote node.
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state first so the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(6) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_dirs_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(7) Re-generate master LMV EA unexpected: $repaired"

	stat "$striped_dir/dummy" ||
		error "(8) Fail to stat $striped_dir/dummy"

	# Creating a new entry is expected to fail here.
	touch "$striped_dir/foo" &&
		error "(9) The broken striped directory should be read-only"

	# Clear the immutable flag so the directory can be cleaned up.
	chattr -i "$striped_dir" ||
		error "(10) Fail to chattr on the broken striped directory"

	rmdir "$striped_dir" ||
		error "(11) Fail to remove the striped directory after LFSCK"
}
run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test_31e: a shard of a striped directory lost its slave LMV EA, where the
# shard is on the same MDT as the master object.
#
# fail_loc 0x162a / fail_val=0 is set on $SINGLEMDS while the striped
# directory is created.  A namespace LFSCK (-r -A) is then expected to
# report exactly one striped_shards_repaired on $SINGLEMDS, after which the
# directory must be removable.
#
# Needs at least two MDTs; otherwise the test is skipped.
test_31e() {
	# Return explicitly so the test body never runs on a single-MDT
	# setup, whatever status skip() happens to return.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	local striped_dir="$DIR/$tdir/striped_dir"

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on the same MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	# \\\$2 leaves a literal \$2 in the string so that awk's field
	# reference survives the extra shell evaluation on the remote node.
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state first so the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir "$striped_dir" ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test_31f: a shard of a striped directory lost its slave LMV EA, where the
# shard is on a different MDT from the master object.
#
# fail_loc 0x162a / fail_val=1 is set on $SINGLEMDS while the striped
# directory is created.  A namespace LFSCK (-r -A) is then expected to
# report exactly one striped_shards_repaired on mds2, after which the
# directory must be removable.
#
# Needs at least two MDTs; otherwise the test is skipped.
test_31f() {
	# Return explicitly so the test body never runs on a single-MDT
	# setup, whatever status skip() happens to return.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	local striped_dir="$DIR/$tdir/striped_dir"

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on differnt MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	# Both the status and the repair counter are read from mds2.
	local mds2_svc
	mds2_svc=$(facet_svc mds2)
	local mds2_param="mdd.${mds2_svc}.lfsck_namespace"

	# \\\$2 leaves a literal \$2 in the string so that awk's field
	# reference survives the extra shell evaluation on the remote node.
	wait_update_facet mds2 "$LCTL get_param -n $mds2_param |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(3) unexpected status"

	local repaired
	repaired=$(do_facet mds2 $LCTL get_param -n "$mds2_param" |
		   awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir "$striped_dir" ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test_31g: the stripe index stored in a shard's slave LMV EA is corrupted.
#
# fail_loc 0x162b / fail_val=0 is set on $SINGLEMDS while the striped
# directory is created.  A namespace LFSCK (-r -A) is then expected to
# report exactly one striped_shards_repaired.  After a client remount, a
# file must be creatable and removable in the directory, and the directory
# itself must be removable.
#
# Needs at least two MDTs; otherwise the test is skipped.
test_31g() {
	# Return explicitly so the test body never runs on a single-MDT
	# setup, whatever status skip() happens to return.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the stripe index in the slave LMV EA is"
	echo "corrupted. The LFSCK should repair the slave LMV EA."

	check_mount_and_prep

	local striped_dir="$DIR/$tdir/striped_dir"

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave LMV EA on the first shard of the striped directory"
	echo "claims the same index as the second shard claims"

	#define OBD_FAIL_LFSCK_BAD_SLAVE_LMV	0x162b
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	# \\\$2 leaves a literal \$2 in the string so that awk's field
	# reference survives the extra shell evaluation on the remote node.
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state first so the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair slave LMV EA: $repaired"

	# Remount the client before using the directory again.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch "$striped_dir/foo" ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f "$striped_dir/foo" ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir "$striped_dir" ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31g "Repair the corrupted slave LMV EA"
# test_31h: a shard's name entry inside a striped directory is corrupted.
#
# fail_loc 0x162c / fail_val=0 is set on $SINGLEMDS while the striped
# directory is created.  A namespace LFSCK (-r -A) is then expected to
# report exactly one dirent_repaired.  After a client remount, a file must
# be creatable and removable in the directory, and the directory itself
# must be removable.
#
# Needs at least two MDTs; otherwise the test is skipped.
test_31h() {
	# Return explicitly so the test body never runs on a single-MDT
	# setup, whatever status skip() happens to return.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the shard's name entry in the striped"
	echo "directory may be corrupted. The LFSCK should repair the"
	echo "bad shard's name entry."

	check_mount_and_prep

	local striped_dir="$DIR/$tdir/striped_dir"

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "first shard's name entry in the striped directory claims"
	echo "the same index as the second shard's name entry claims."

	#define OBD_FAIL_LFSCK_BAD_SLAVE_NAME	0x162c
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the shard's name entry"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	# \\\$2 leaves a literal \$2 in the string so that awk's field
	# reference survives the extra shell evaluation on the remote node.
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state first so the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair shard's name entry: $repaired"

	# Remount the client before using the directory again.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch "$striped_dir/foo" ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f "$striped_dir/foo" ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir "$striped_dir" ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31h "Repair the corrupted shard's name entry"
# Drop the lfsck flag from the debug mask again; this is best-effort, so a
# failure here is deliberately ignored.
$LCTL set_param debug=-lfsck >/dev/null || true

# Put back the MDS/OST sizing that was saved in SAVED_* earlier in the script.
OSTCOUNT="${SAVED_OSTCOUNT}"
OSTSIZE="${SAVED_OSTSIZE}"
MDSSIZE="${SAVED_MDSSIZE}"
4141 # cleanup the system at last