3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
10 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
11 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
12 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
14 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
15 . $LUSTRE/tests/test-framework.sh
17 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
20 require_dsh_mds || exit 0
24 SAVED_MDSSIZE=${MDSSIZE}
25 SAVED_OSTSIZE=${OSTSIZE}
26 SAVED_OSTCOUNT=${OSTCOUNT}
27 # Use small MDS and OST sizes to speed up formatting.
28 # Do not make MDSSIZE/OSTSIZE too small, since they affect the default journal size.
31 # There is no need for many OSTs; cap the count to reduce the format/start/stop overhead.
32 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
34 # build up a clean test environment.
38 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
39 skip "Need MDS version at least 2.3.60" && check_and_cleanup_lustre &&
42 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
43 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
45 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
46 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
48 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
49 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26"
53 $LCTL set_param debug=+lfsck > /dev/null || true
# Names of the first MDT and the first OST of the filesystem under test.
55 MDT_DEV="${FSNAME}-MDT0000"
56 OST_DEV="${FSNAME}-OST0000"
# Device name reported by mdsdevname for $SINGLEMDS; every "mds" substring is
# stripped from the facet name first (presumably leaving the MDS index --
# confirm against mdsdevname in test-framework.sh).
57 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# The variables below hold command prefixes. The tests expand them unquoted,
# optionally followed by extra lctl arguments (e.g. "$START_NAMESPACE -r").
# Start a namespace LFSCK on ${MDT_DEV} through the $SINGLEMDS facet.
58 START_NAMESPACE="do_facet $SINGLEMDS \
59 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
# Start a layout LFSCK on ${MDT_DEV} through the $SINGLEMDS facet.
60 START_LAYOUT="do_facet $SINGLEMDS \
61 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
# Start a layout LFSCK on ${OST_DEV} through the ost1 facet.
62 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# Stop the LFSCK running on ${MDT_DEV}.
63 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Print the lfsck_namespace parameter of ${MDT_DEV}; the tests pipe this
# output through awk to extract fields such as "status" and "flags".
64 SHOW_NAMESPACE="do_facet $SINGLEMDS \
65 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
# Print the lfsck_layout parameter of ${MDT_DEV}.
66 SHOW_LAYOUT="do_facet $SINGLEMDS \
67 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
# Print the lfsck_layout parameter of ${OST_DEV}, read on the ost1 facet.
68 SHOW_LAYOUT_ON_OST="do_facet ost1 \
69 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount options passed to "start $SINGLEMDS $MDT_DEVNAME ...". The NOSCRUB
# variant adds "noscrub"; the tests use it where they announce starting the
# MDS "with disabling OI scrub".
70 MOUNT_OPTS_SCRUB="-o user_xattr"
71 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
80 echo "preparing... $nfiles * $ndirs files will be created $(date)."
81 if [ ! -z $igif ]; then
82 #define OBD_FAIL_FID_IGIF 0x1504
83 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
86 cp $LUSTRE/tests/*.sh $DIR/$tdir/
87 if [ $ndirs -gt 0 ]; then
88 createmany -d $DIR/$tdir/d $ndirs
89 createmany -m $DIR/$tdir/f $ndirs
90 if [ $nfiles -gt 0 ]; then
91 for ((i = 0; i < $ndirs; i++)); do
92 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
93 /dev/null || error "createmany $nfiles"
96 createmany -d $DIR/$tdir/e $ndirs
99 if [ ! -z $igif ]; then
100 touch $DIR/$tdir/dummy
101 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
104 echo "prepared $(date)."
110 #define OBD_FAIL_LFSCK_DELAY1 0x1600
111 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
112 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
114 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
116 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
117 [ "$STATUS" == "scanning-phase1" ] ||
118 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
120 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
122 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
123 [ "$STATUS" == "stopped" ] ||
124 error "(6) Expect 'stopped', but got '$STATUS'"
126 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
128 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
129 [ "$STATUS" == "scanning-phase1" ] ||
130 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
132 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
133 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
134 mdd.${MDT_DEV}.lfsck_namespace |
135 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
137 error "(9) unexpected status"
140 local repaired=$($SHOW_NAMESPACE |
141 awk '/^updated_phase1/ { print $2 }')
142 [ $repaired -eq 0 ] ||
143 error "(10) Expect nothing to be repaired, but got: $repaired"
145 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
146 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
147 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
148 mdd.${MDT_DEV}.lfsck_namespace |
149 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
151 error "(12) unexpected status"
154 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
155 [ $((scanned1 + 1)) -eq $scanned2 ] ||
156 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
158 echo "stopall, should NOT crash LU-3649"
159 stopall || error "(14) Fail to stopall"
161 run_test 0 "Control LFSCK manually"
164 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
165 skip "OI Scrub not implemented for ZFS" && return
169 #define OBD_FAIL_FID_INDIR 0x1501
170 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
171 touch $DIR/$tdir/dummy
173 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
175 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
176 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
177 mdd.${MDT_DEV}.lfsck_namespace |
178 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
180 error "(4) unexpected status"
183 local repaired=$($SHOW_NAMESPACE |
184 awk '/^dirent_repaired/ { print $2 }')
185 # for interop with old server
186 [ -z "$repaired" ] &&
187 repaired=$($SHOW_NAMESPACE |
188 awk '/^updated_phase1/ { print $2 }')
190 [ $repaired -eq 1 ] ||
191 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
193 mount_client $MOUNT || error "(6) Fail to start client!"
195 #define OBD_FAIL_FID_LOOKUP 0x1505
196 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
197 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
199 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
201 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
205 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
206 skip "OI Scrub not implemented for ZFS" && return
210 #define OBD_FAIL_FID_INLMA 0x1502
211 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
212 touch $DIR/$tdir/dummy
214 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
216 #define OBD_FAIL_FID_NOLMA 0x1506
217 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
218 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
219 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
220 mdd.${MDT_DEV}.lfsck_namespace |
221 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
223 error "(4) unexpected status"
226 local repaired=$($SHOW_NAMESPACE |
227 awk '/^dirent_repaired/ { print $2 }')
228 # for interop with old server
229 [ -z "$repaired" ] &&
230 repaired=$($SHOW_NAMESPACE |
231 awk '/^updated_phase1/ { print $2 }')
233 [ $repaired -eq 1 ] ||
234 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
236 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
237 mount_client $MOUNT || error "(6) Fail to start client!"
239 #define OBD_FAIL_FID_LOOKUP 0x1505
240 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
241 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
243 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
245 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
250 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
251 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
252 touch $DIR/$tdir/dummy
254 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
256 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
257 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
258 mdd.${MDT_DEV}.lfsck_namespace |
259 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
261 error "(4) unexpected status"
264 local repaired=$($SHOW_NAMESPACE |
265 awk '/^linkea_repaired/ { print $2 }')
266 # for interop with old server
267 [ -z "$repaired" ] &&
268 repaired=$($SHOW_NAMESPACE |
269 awk '/^updated_phase2/ { print $2 }')
271 [ $repaired -eq 1 ] ||
272 error "(5) Fail to repair crashed linkEA: $repaired"
274 mount_client $MOUNT || error "(6) Fail to start client!"
276 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
277 error "(7) Fail to stat $DIR/$tdir/dummy"
279 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
280 local dummyname=$($LFS fid2path $DIR $dummyfid)
281 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
282 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
284 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
290 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
291 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
292 touch $DIR/$tdir/dummy
294 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
296 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
297 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
298 mdd.${MDT_DEV}.lfsck_namespace |
299 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
301 error "(4) unexpected status"
304 local repaired=$($SHOW_NAMESPACE |
305 awk '/^updated_phase2/ { print $2 }')
306 [ $repaired -eq 1 ] ||
307 error "(5) Fail to repair crashed linkEA: $repaired"
309 mount_client $MOUNT || error "(6) Fail to start client!"
311 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
312 error "(7) Fail to stat $DIR/$tdir/dummy"
314 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
315 local dummyname=$($LFS fid2path $DIR $dummyfid)
316 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
317 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
319 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
325 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
326 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
327 touch $DIR/$tdir/dummy
329 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
331 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
332 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
333 mdd.${MDT_DEV}.lfsck_namespace |
334 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
336 error "(4) unexpected status"
339 local repaired=$($SHOW_NAMESPACE |
340 awk '/^updated_phase2/ { print $2 }')
341 [ $repaired -eq 1 ] ||
342 error "(5) Fail to repair crashed linkEA: $repaired"
344 mount_client $MOUNT || error "(6) Fail to start client!"
346 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
347 error "(7) Fail to stat $DIR/$tdir/dummy"
349 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
350 local dummyname=$($LFS fid2path $DIR $dummyfid)
351 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
352 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
354 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
360 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
361 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
362 touch $DIR/$tdir/dummy
364 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
366 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
367 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
368 mdd.${MDT_DEV}.lfsck_namespace |
369 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
371 error "(4) unexpected status"
374 local repaired=$($SHOW_NAMESPACE |
375 awk '/^linkea_repaired/ { print $2 }')
376 [ $repaired -eq 1 ] ||
377 error "(5) Fail to repair crashed linkEA: $repaired"
379 mount_client $MOUNT || error "(6) Fail to start client!"
381 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
382 error "(7) Fail to stat $DIR/$tdir/dummy"
384 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
385 local dummyname=$($LFS fid2path $DIR $dummyfid)
386 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
387 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
389 run_test 2d "LFSCK can recover the missing linkEA entry"
393 [ $MDSCOUNT -lt 2 ] &&
394 skip "We need at least 2 MDSes for this test" && return
398 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
400 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
401 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
402 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
403 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
405 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
406 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
407 mdd.${MDT_DEV}.lfsck_namespace |
408 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
410 error "(4) unexpected status"
413 local repaired=$($SHOW_NAMESPACE |
414 awk '/^linkea_repaired/ { print $2 }')
415 [ $repaired -eq 1 ] ||
416 error "(5) Fail to repair crashed linkEA: $repaired"
418 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
419 local name=$($LFS fid2path $DIR $fid)
420 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
421 error "(6) Fail to repair linkEA: $fid $name"
423 run_test 2e "namespace LFSCK can verify remote object linkEA"
429 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
430 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
431 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
433 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
434 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
435 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
437 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
438 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
439 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
441 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
442 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
443 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
445 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
447 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
448 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
449 mdd.${MDT_DEV}.lfsck_namespace |
450 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
452 error "(10) unexpected status"
455 local checked=$($SHOW_NAMESPACE |
456 awk '/^checked_phase2/ { print $2 }')
457 [ $checked -ge 4 ] ||
458 error "(11) Fail to check multiple-linked object: $checked"
460 local repaired=$($SHOW_NAMESPACE |
461 awk '/^multiple_linked_repaired/ { print $2 }')
462 [ $repaired -ge 2 ] ||
463 error "(12) Fail to repair multiple-linked object: $repaired"
465 run_test 3 "LFSCK can verify multiple-linked objects"
469 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
470 skip "OI Scrub not implemented for ZFS" && return
473 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
474 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
476 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
477 echo "start $SINGLEMDS with disabling OI scrub"
478 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
479 error "(2) Fail to start MDS!"
481 #define OBD_FAIL_LFSCK_DELAY2 0x1601
482 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
483 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
484 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
485 mdd.${MDT_DEV}.lfsck_namespace |
486 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
488 error "(5) unexpected status"
491 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
492 [ "$STATUS" == "scanning-phase1" ] ||
493 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
495 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
496 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
497 mdd.${MDT_DEV}.lfsck_namespace |
498 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
500 error "(7) unexpected status"
503 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
504 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
506 local repaired=$($SHOW_NAMESPACE |
507 awk '/^dirent_repaired/ { print $2 }')
508 # for interop with old server
509 [ -z "$repaired" ] &&
510 repaired=$($SHOW_NAMESPACE |
511 awk '/^updated_phase1/ { print $2 }')
513 [ $repaired -ge 9 ] ||
514 error "(9) Fail to re-generate FID-in-dirent: $repaired"
516 mount_client $MOUNT || error "(10) Fail to start client!"
518 #define OBD_FAIL_FID_LOOKUP 0x1505
519 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
520 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
521 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
523 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
527 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
528 skip "OI Scrub not implemented for ZFS" && return
531 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
532 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
534 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
535 echo "start $SINGLEMDS with disabling OI scrub"
536 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
537 error "(2) Fail to start MDS!"
539 #define OBD_FAIL_LFSCK_DELAY2 0x1601
540 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
541 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
542 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
543 mdd.${MDT_DEV}.lfsck_namespace |
544 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
546 error "(5) unexpected status"
549 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
550 [ "$STATUS" == "scanning-phase1" ] ||
551 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
553 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
554 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
555 mdd.${MDT_DEV}.lfsck_namespace |
556 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
558 error "(7) unexpected status"
561 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
562 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
564 local repaired=$($SHOW_NAMESPACE |
565 awk '/^dirent_repaired/ { print $2 }')
566 # for interop with old server
567 [ -z "$repaired" ] &&
568 repaired=$($SHOW_NAMESPACE |
569 awk '/^updated_phase1/ { print $2 }')
571 [ $repaired -ge 2 ] ||
572 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
574 mount_client $MOUNT || error "(10) Fail to start client!"
576 #define OBD_FAIL_FID_LOOKUP 0x1505
577 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
578 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
580 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
582 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
583 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
584 local dummyname=$($LFS fid2path $DIR $dummyfid)
585 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
586 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
588 run_test 5 "LFSCK can handle IGIF object upgrading"
593 #define OBD_FAIL_LFSCK_DELAY1 0x1600
594 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
595 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
597 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
598 [ "$STATUS" == "scanning-phase1" ] ||
599 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
601 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
603 # Fail the LFSCK to guarantee there is at least one checkpoint
604 #define OBD_FAIL_LFSCK_FATAL1 0x1608
605 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
606 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
607 mdd.${MDT_DEV}.lfsck_namespace |
608 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
610 error "(4) unexpected status"
613 local POS0=$($SHOW_NAMESPACE |
614 awk '/^last_checkpoint_position/ { print $2 }' |
617 #define OBD_FAIL_LFSCK_DELAY1 0x1600
618 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
619 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
621 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
622 [ "$STATUS" == "scanning-phase1" ] ||
623 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
625 local POS1=$($SHOW_NAMESPACE |
626 awk '/^latest_start_position/ { print $2 }' |
628 [[ $POS0 -lt $POS1 ]] ||
629 error "(7) Expect larger than: $POS0, but got $POS1"
631 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
632 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
633 mdd.${MDT_DEV}.lfsck_namespace |
634 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
636 error "(8) unexpected status"
639 run_test 6a "LFSCK resumes from last checkpoint (1)"
644 #define OBD_FAIL_LFSCK_DELAY2 0x1601
645 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
646 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
648 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
649 [ "$STATUS" == "scanning-phase1" ] ||
650 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
652 # Sleep 5 sec to guarantee that we are in the directory scanning
654 # Fail the LFSCK to guarantee there is at least one checkpoint
655 #define OBD_FAIL_LFSCK_FATAL2 0x1609
656 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
657 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
658 mdd.${MDT_DEV}.lfsck_namespace |
659 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
661 error "(4) unexpected status"
664 local O_POS0=$($SHOW_NAMESPACE |
665 awk '/^last_checkpoint_position/ { print $2 }' |
668 local D_POS0=$($SHOW_NAMESPACE |
669 awk '/^last_checkpoint_position/ { print $4 }')
671 #define OBD_FAIL_LFSCK_DELAY2 0x1601
672 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
673 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
675 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
676 [ "$STATUS" == "scanning-phase1" ] ||
677 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
679 local O_POS1=$($SHOW_NAMESPACE |
680 awk '/^latest_start_position/ { print $2 }' |
682 local D_POS1=$($SHOW_NAMESPACE |
683 awk '/^latest_start_position/ { print $4 }')
685 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
686 [[ $O_POS0 -lt $O_POS1 ]] ||
687 error "(7.1) $O_POS1 is not larger than $O_POS0"
689 [[ $D_POS0 -lt $D_POS1 ]] ||
690 error "(7.2) $D_POS1 is not larger than $D_POS0"
693 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
694 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
695 mdd.${MDT_DEV}.lfsck_namespace |
696 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
698 error "(8) unexpected status"
701 run_test 6b "LFSCK resumes from last checkpoint (2)"
708 #define OBD_FAIL_LFSCK_DELAY2 0x1601
709 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
710 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
712 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
713 [ "$STATUS" == "scanning-phase1" ] ||
714 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
716 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
718 echo "stop $SINGLEMDS"
719 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
721 echo "start $SINGLEMDS"
722 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
723 error "(5) Fail to start MDS!"
725 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
726 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
727 mdd.${MDT_DEV}.lfsck_namespace |
728 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
730 error "(6) unexpected status"
733 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
739 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
740 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
741 for ((i = 0; i < 20; i++)); do
742 touch $DIR/$tdir/dummy${i}
745 #define OBD_FAIL_LFSCK_DELAY3 0x1602
746 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
747 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
748 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
749 mdd.${MDT_DEV}.lfsck_namespace |
750 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
752 error "(4) unexpected status"
755 echo "stop $SINGLEMDS"
756 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
758 echo "start $SINGLEMDS"
759 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
760 error "(6) Fail to start MDS!"
762 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
763 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
764 mdd.${MDT_DEV}.lfsck_namespace |
765 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
767 error "(7) unexpected status"
770 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
775 formatall > /dev/null
781 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
782 [ "$STATUS" == "init" ] ||
783 error "(2) Expect 'init', but got '$STATUS'"
785 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
786 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
787 mkdir $DIR/$tdir/crashed
789 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
790 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
791 for ((i = 0; i < 5; i++)); do
792 touch $DIR/$tdir/dummy${i}
795 umount_client $MOUNT || error "(3) Fail to stop client!"
797 #define OBD_FAIL_LFSCK_DELAY2 0x1601
798 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
799 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
801 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
802 [ "$STATUS" == "scanning-phase1" ] ||
803 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
805 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
807 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
808 [ "$STATUS" == "stopped" ] ||
809 error "(7) Expect 'stopped', but got '$STATUS'"
811 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
813 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
814 [ "$STATUS" == "scanning-phase1" ] ||
815 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
817 #define OBD_FAIL_LFSCK_FATAL2 0x1609
818 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
819 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
820 mdd.${MDT_DEV}.lfsck_namespace |
821 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
823 error "(10) unexpected status"
826 #define OBD_FAIL_LFSCK_DELAY1 0x1600
827 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
828 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
830 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
831 [ "$STATUS" == "scanning-phase1" ] ||
832 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
834 #define OBD_FAIL_LFSCK_CRASH 0x160a
835 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
838 echo "stop $SINGLEMDS"
839 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
841 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
842 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
844 echo "start $SINGLEMDS"
845 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
846 error "(14) Fail to start MDS!"
848 local timeout=$(max_recovery_time)
851 while [ $timer -lt $timeout ]; do
852 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
853 mdt.${MDT_DEV}.recovery_status |
854 awk '/^status/ { print \\\$2 }'")
855 [ "$STATUS" != "RECOVERING" ] && break;
860 [ $timer != $timeout ] ||
861 error "(14.1) recovery timeout"
863 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
864 [ "$STATUS" == "crashed" ] ||
865 error "(15) Expect 'crashed', but got '$STATUS'"
867 #define OBD_FAIL_LFSCK_DELAY2 0x1601
868 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
869 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
871 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
872 [ "$STATUS" == "scanning-phase1" ] ||
873 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
875 echo "stop $SINGLEMDS"
876 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
878 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
879 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
881 echo "start $SINGLEMDS"
882 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
883 error "(19) Fail to start MDS!"
886 while [ $timer -lt $timeout ]; do
887 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
888 mdt.${MDT_DEV}.recovery_status |
889 awk '/^status/ { print \\\$2 }'")
890 [ "$STATUS" != "RECOVERING" ] && break;
895 [ $timer != $timeout ] ||
896 error "(19.1) recovery timeout"
898 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
899 [ "$STATUS" == "paused" ] ||
900 error "(20) Expect 'paused', but got '$STATUS'"
902 #define OBD_FAIL_LFSCK_DELAY3 0x1602
903 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
905 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
906 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
907 mdd.${MDT_DEV}.lfsck_namespace |
908 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
910 error "(22) unexpected status"
913 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
914 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
915 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
917 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
918 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
919 mdd.${MDT_DEV}.lfsck_namespace |
920 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
922 error "(24) unexpected status"
925 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
926 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
928 run_test 8 "LFSCK state machine"
931 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
932 skip "Testing on UP system, the speed may be inaccurate."
938 local BASE_SPEED1=100
940 $START_NAMESPACE -r -s $BASE_SPEED1 || error "(3) Fail to start LFSCK!"
943 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
944 [ "$STATUS" == "scanning-phase1" ] ||
945 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
947 local SPEED=$($SHOW_NAMESPACE |
948 awk '/^average_speed_phase1/ { print $2 }')
950 # The measured run time may be inaccurate; normally the error is less than 2 seconds.
951 # On top of that, we allow another 20% for scheduling error.
953 # MAX_MARGIN = 1.2 = 12 / 10
954 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
955 RUN_TIME1 * 12 / 10))
956 [ $SPEED -lt $MAX_SPEED ] ||
957 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
960 local BASE_SPEED2=300
962 do_facet $SINGLEMDS \
963 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
966 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase1/ { print $2 }')
967 # MIN_MARGIN = 0.8 = 8 / 10
968 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
969 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
970 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
971 # Account for slow ZFS performance - LU-4934
# The fstype must be compared as a string: "-eq" is an integer operator, so
# "[ zfs -eq zfs ]" fails with "integer expression expected" and the ZFS
# exemption would never take effect. The substitution is quoted so that an
# empty result cannot break the test expression.
972 [ $SPEED -gt $MIN_SPEED ] || [ "$(facet_fstype $SINGLEMDS)" = zfs ] ||
973 error "(5) Got speed $SPEED, expected more than $MIN_SPEED"
975 # MAX_MARGIN = 1.2 = 12 / 10
976 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
977 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
978 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
979 [ $SPEED -lt $MAX_SPEED ] ||
980 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
982 do_facet $SINGLEMDS \
983 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
985 wait_update_facet $SINGLEMDS \
986 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace|\
987 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
988 error "(7) Failed to get expected 'completed'"
990 run_test 9a "LFSCK speed control (1)"
993 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
994 skip "Testing on UP system, the speed may be inaccurate."
1000 echo "Preparing another 50 * 50 files (with error) at $(date)."
1001 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1002 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1003 createmany -d $DIR/$tdir/d 50
1004 createmany -m $DIR/$tdir/f 50
1005 for ((i = 0; i < 50; i++)); do
1006 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
1009 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1010 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1011 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1012 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1013 mdd.${MDT_DEV}.lfsck_namespace |
1014 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1016 error "(5) unexpected status"
1019 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1020 echo "Prepared at $(date)."
1022 local BASE_SPEED1=50
1024 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1027 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1028 [ "$STATUS" == "scanning-phase2" ] ||
1029 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1031 local SPEED=$($SHOW_NAMESPACE |
1032 awk '/^average_speed_phase2/ { print $2 }')
1033 # There may be time error, normally it should be less than 2 seconds.
1034 # We allow another 20% schedule error.
1036 # MAX_MARGIN = 1.2 = 12 / 10
1037 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1038 RUN_TIME1 * 12 / 10))
1039 [ $SPEED -lt $MAX_SPEED ] ||
1040 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1042 # adjust speed limit
1043 local BASE_SPEED2=150
1045 do_facet $SINGLEMDS \
1046 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1049 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1050 # MIN_MARGIN = 0.8 = 8 / 10; the lower bound is not enforced when fstype is zfs
1051 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1052 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1053 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
1054 [ $SPEED -gt $MIN_SPEED ] || [ "$(facet_fstype $SINGLEMDS)" = zfs ] ||
1055 error "(9) Got speed $SPEED, expected more than $MIN_SPEED"
1057 # MAX_MARGIN = 1.2 = 12 / 10
1058 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1059 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1060 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1061 [ $SPEED -lt $MAX_SPEED ] ||
1062 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
1064 do_facet $SINGLEMDS \
1065 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1066 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1067 mdd.${MDT_DEV}.lfsck_namespace |
1068 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1070 error "(11) unexpected status"
1073 run_test 9b "LFSCK speed control (2)"
1077 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1078 skip "lookup(..)/linkea on ZFS issue" && return
1082 echo "Preparing more files with error at $(date)."
1083 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1084 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1086 for ((i = 0; i < 1000; i = $((i+2)))); do
1087 mkdir -p $DIR/$tdir/d${i}
1088 touch $DIR/$tdir/f${i}
1089 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1092 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1093 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1095 for ((i = 1; i < 1000; i = $((i+2)))); do
1096 mkdir -p $DIR/$tdir/d${i}
1097 touch $DIR/$tdir/f${i}
1098 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1101 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1102 echo "Prepared at $(date)."
1104 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
1106 umount_client $MOUNT
1107 mount_client $MOUNT || error "(3) Fail to start client!"
1109 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1112 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1113 [ "$STATUS" == "scanning-phase1" ] ||
1114 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
1116 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1118 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1120 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1122 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1124 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1126 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1128 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1130 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1131 error "(14) Fail to softlink!"
1133 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1134 [ "$STATUS" == "scanning-phase1" ] ||
1135 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
1137 do_facet $SINGLEMDS \
1138 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1139 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1140 mdd.${MDT_DEV}.lfsck_namespace |
1141 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1143 error "(16) unexpected status"
1146 run_test 10 "System is available during LFSCK scanning"
1149 ost_remove_lastid() {
1152 local rcmd="do_facet ost${ost}"
1154 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1156 # step 1: local mount
1157 mount_fstype ost${ost} || return 1
1158 # step 2: remove the specified LAST_ID
1159 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1161 unmount_fstype ost${ost} || return 2
1165 check_mount_and_prep
1166 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1167 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1172 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1174 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1175 error "(2) Fail to start ost1"
1177 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1178 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1180 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1181 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1183 wait_update_facet ost1 "$LCTL get_param -n \
1184 obdfilter.${OST_DEV}.lfsck_layout |
1185 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1187 error "(5) unexpected status"
1190 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1192 wait_update_facet ost1 "$LCTL get_param -n \
1193 obdfilter.${OST_DEV}.lfsck_layout |
1194 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1196 error "(6) unexpected status"
1199 echo "the LAST_ID(s) should have been rebuilt"
1200 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1201 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1203 run_test 11a "LFSCK can rebuild lost last_id"
1206 check_mount_and_prep
1207 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1209 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1210 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1211 do_facet ost1 $LCTL set_param fail_loc=0x160d
1212 createmany -o $DIR/$tdir/f 64
1213 local lastid1=$(do_facet ost1 "lctl get_param -n \
1214 obdfilter.${ost1_svc}.last_id" | grep 0x100000000 |
1215 awk -F: '{ print $2 }')
1217 umount_client $MOUNT
1218 stop ost1 || error "(1) Fail to stop ost1"
1220 #define OBD_FAIL_OST_ENOSPC 0x215
1221 do_facet ost1 $LCTL set_param fail_loc=0x215
1223 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1224 error "(2) Fail to start ost1"
1226 for ((i = 0; i < 60; i++)); do
1227 lastid2=$(do_facet ost1 "lctl get_param -n \
1228 obdfilter.${ost1_svc}.last_id" | grep 0x100000000 |
1229 awk -F: '{ print $2 }')
1230 [ ! -z $lastid2 ] && break;
1234 echo "the on-disk LAST_ID should be smaller than the expected one"
1235 [ $lastid1 -gt $lastid2 ] ||
1236 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1238 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1239 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1241 wait_update_facet ost1 "$LCTL get_param -n \
1242 obdfilter.${OST_DEV}.lfsck_layout |
1243 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1245 error "(6) unexpected status"
1248 stop ost1 || error "(7) Fail to stop ost1"
1250 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1251 error "(8) Fail to start ost1"
1253 echo "the on-disk LAST_ID should have been rebuilt"
1254 wait_update_facet ost1 "$LCTL get_param -n \
1255 obdfilter.${ost1_svc}.last_id | grep 0x100000000 |
1256 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1257 $LCTL get_param -n obdfilter.${ost1_svc}.last_id
1258 error "(9) expect lastid1 0x100000000:$lastid1"
1261 do_facet ost1 $LCTL set_param fail_loc=0
1262 stopall || error "(10) Fail to stopall"
1264 run_test 11b "LFSCK can rebuild crashed last_id"
1267 [ $MDSCOUNT -lt 2 ] &&
1268 skip "We need at least 2 MDSes for test_12" && return
1270 check_mount_and_prep
1271 for k in $(seq $MDSCOUNT); do
1272 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1273 createmany -o $DIR/$tdir/${k}/f 100 ||
1274 error "(0) Fail to create 100 files."
1277 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1278 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1279 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1281 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1282 for k in $(seq $MDSCOUNT); do
1283 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1284 mdd.$(facet_svc mds${k}).lfsck_namespace |
1285 awk '/^status/ { print $2 }')
1286 [ "$STATUS" == "scanning-phase1" ] ||
1287 error "(3) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1290 echo "Stop namespace LFSCK on all targets by single lctl command."
1291 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1292 error "(4) Fail to stop LFSCK on all devices!"
1294 echo "All the LFSCK targets should be in 'stopped' status."
1295 for k in $(seq $MDSCOUNT); do
1296 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1297 mdd.$(facet_svc mds${k}).lfsck_namespace |
1298 awk '/^status/ { print $2 }')
1299 [ "$STATUS" == "stopped" ] ||
1300 error "(5) MDS${k} Expect 'stopped', but got '$STATUS'"
1303 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1304 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1305 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1307 echo "All the LFSCK targets should be in 'completed' status."
1308 for k in $(seq $MDSCOUNT); do
1309 wait_update_facet mds${k} "$LCTL get_param -n \
1310 mdd.$(facet_svc mds${k}).lfsck_namespace |
1311 awk '/^status/ { print \\\$2 }'" "completed" 8 ||
1312 error "(7) MDS${k} is not the expected 'completed'"
1315 echo "Start layout LFSCK on all targets by single command (-s 1)."
1316 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1317 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1319 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1320 for k in $(seq $MDSCOUNT); do
1321 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1322 mdd.$(facet_svc mds${k}).lfsck_layout |
1323 awk '/^status/ { print $2 }')
1324 [ "$STATUS" == "scanning-phase1" ] ||
1325 error "(9) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1328 echo "Stop layout LFSCK on all targets by single lctl command."
1329 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1330 error "(10) Fail to stop LFSCK on all devices!"
1332 echo "All the LFSCK targets should be in 'stopped' status."
1333 for k in $(seq $MDSCOUNT); do
1334 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1335 mdd.$(facet_svc mds${k}).lfsck_layout |
1336 awk '/^status/ { print $2 }')
1337 [ "$STATUS" == "stopped" ] ||
1338 error "(11) MDS${k} Expect 'stopped', but got '$STATUS'"
1341 for k in $(seq $OSTCOUNT); do
1342 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1343 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1344 awk '/^status/ { print $2 }')
1345 [ "$STATUS" == "stopped" ] ||
1346 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1349 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1350 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1351 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1353 echo "All the LFSCK targets should be in 'completed' status."
1354 for k in $(seq $MDSCOUNT); do
1355 # The LFSCK status query internal is 30 seconds. For the case
1356 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1357 # time to guarantee the status sync up.
1358 wait_update_facet mds${k} "$LCTL get_param -n \
1359 mdd.$(facet_svc mds${k}).lfsck_layout |
1360 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1361 error "(14) MDS${k} is not the expected 'completed'"
1364 run_test 12 "single command to trigger LFSCK on all devices"
1368 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1369 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1370 echo "MDT-object FID."
1373 check_mount_and_prep
1375 echo "Inject failure stub to simulate bad lmm_oi"
1376 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1377 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1378 createmany -o $DIR/$tdir/f 32
1379 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1381 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1382 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1384 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1385 mdd.${MDT_DEV}.lfsck_layout |
1386 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1388 error "(2) unexpected status"
1391 local repaired=$($SHOW_LAYOUT |
1392 awk '/^repaired_others/ { print $2 }')
1393 [ $repaired -eq 32 ] ||
1394 error "(3) Fail to repair crashed lmm_oi: $repaired"
1396 run_test 13 "LFSCK can repair crashed lmm_oi"
1400 echo "The OST-object referenced by the MDT-object should be there;"
1401 echo "otherwise, the LFSCK should re-create the missing OST-object."
1404 check_mount_and_prep
1405 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1407 local count=$(precreated_ost_obj_count 0 0)
1409 echo "Inject failure stub to simulate dangling referenced MDT-object"
1410 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1411 do_facet ost1 $LCTL set_param fail_loc=0x1610
1412 createmany -o $DIR/$tdir/f $((count + 31))
1413 touch $DIR/$tdir/guard
1414 do_facet ost1 $LCTL set_param fail_loc=0
1416 start_full_debug_logging
1418 # exhaust other pre-created dangling cases
1419 count=$(precreated_ost_obj_count 0 0)
1420 createmany -o $DIR/$tdir/a $count ||
1421 error "(0) Fail to create $count files."
1423 echo "'ls' should fail because of dangling referenced MDT-object"
1424 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1426 echo "Trigger layout LFSCK to find out dangling reference"
1427 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1429 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1430 mdd.${MDT_DEV}.lfsck_layout |
1431 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1433 error "(3) unexpected status"
1436 local repaired=$($SHOW_LAYOUT |
1437 awk '/^repaired_dangling/ { print $2 }')
1438 [ $repaired -ge 32 ] ||
1439 error "(4) Fail to repair dangling reference: $repaired"
1441 echo "'stat' should fail because of not repair dangling by default"
1442 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
1444 echo "Trigger layout LFSCK to repair dangling reference"
1445 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1447 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1448 mdd.${MDT_DEV}.lfsck_layout |
1449 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1451 error "(7) unexpected status"
1454 # There may be some async LFSCK updates in processing, wait for
1455 # a while until the target reparation has been done. LU-4970.
1457 echo "'stat' should success after layout LFSCK repairing"
1458 wait_update_facet client "stat $DIR/$tdir/guard |
1459 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1460 stat $DIR/$tdir/guard
1462 error "(8) unexpected size"
1465 repaired=$($SHOW_LAYOUT |
1466 awk '/^repaired_dangling/ { print $2 }')
1467 [ $repaired -ge 32 ] ||
1468 error "(9) Fail to repair dangling reference: $repaired"
1470 stop_full_debug_logging
1472 run_test 14 "LFSCK can repair MDT-object with dangling reference"
1476 echo "If the OST-object referenced by the MDT-object back points"
1477 echo "to some non-exist MDT-object, then the LFSCK should repair"
1478 echo "the OST-object to back point to the right MDT-object."
1481 check_mount_and_prep
1482 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1484 echo "Inject failure stub to make the OST-object to back point to"
1485 echo "non-exist MDT-object."
1486 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1488 do_facet ost1 $LCTL set_param fail_loc=0x1611
1489 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1490 cancel_lru_locks osc
1491 do_facet ost1 $LCTL set_param fail_loc=0
1493 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1494 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1496 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1497 mdd.${MDT_DEV}.lfsck_layout |
1498 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1500 error "(2) unexpected status"
1503 local repaired=$($SHOW_LAYOUT |
1504 awk '/^repaired_unmatched_pair/ { print $2 }')
1505 [ $repaired -eq 1 ] ||
1506 error "(3) Fail to repair unmatched pair: $repaired"
1508 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
1512 echo "If the OST-object referenced by the MDT-object back points"
1513 echo "to other MDT-object that doesn't recognize the OST-object,"
1514 echo "then the LFSCK should repair it to back point to the right"
1515 echo "MDT-object (the first one)."
1518 check_mount_and_prep
1519 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1520 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1521 cancel_lru_locks osc
1523 echo "Inject failure stub to make the OST-object to back point to"
1524 echo "other MDT-object"
1526 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1527 do_facet ost1 $LCTL set_param fail_loc=0x1612
1528 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1529 cancel_lru_locks osc
1530 do_facet ost1 $LCTL set_param fail_loc=0
1532 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1533 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1535 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1536 mdd.${MDT_DEV}.lfsck_layout |
1537 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1539 error "(2) unexpected status"
1542 local repaired=$($SHOW_LAYOUT |
1543 awk '/^repaired_unmatched_pair/ { print $2 }')
1544 [ $repaired -eq 1 ] ||
1545 error "(3) Fail to repair unmatched pair: $repaired"
1547 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
1551 echo "If the OST-object's owner information does not match the owner"
1552 echo "information stored in the MDT-object, then the LFSCK trust the"
1553 echo "MDT-object and update the OST-object's owner information."
1556 check_mount_and_prep
1557 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1558 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1559 cancel_lru_locks osc
1561 echo "Inject failure stub to skip OST-object owner changing"
1562 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1563 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1564 chown 1:1 $DIR/$tdir/f0 # owner:group with ':'; the '.' separator is obsolescent
1565 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1567 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1570 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1572 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1573 mdd.${MDT_DEV}.lfsck_layout |
1574 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1576 error "(2) unexpected status"
1579 local repaired=$($SHOW_LAYOUT |
1580 awk '/^repaired_inconsistent_owner/ { print $2 }')
1581 [ $repaired -eq 1 ] ||
1582 error "(3) Fail to repair inconsistent owner: $repaired"
1584 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
1588 echo "If more than one MDT-objects reference the same OST-object,"
1589 echo "and the OST-object only recognizes one MDT-object, then the"
1590 echo "LFSCK should create new OST-objects for such non-recognized"
1594 check_mount_and_prep
1595 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1597 echo "Inject failure stub to make two MDT-objects to refernce"
1598 echo "the OST-object"
1600 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1601 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1603 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1604 cancel_lru_locks osc
1606 createmany -o $DIR/$tdir/f 1
1608 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1610 cancel_lru_locks mdc
1611 cancel_lru_locks osc
1613 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1614 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1615 [ $size -eq 1048576 ] ||
1616 error "(1) f0 (wrong) size should be 1048576, but got $size"
1618 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1621 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1623 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1624 mdd.${MDT_DEV}.lfsck_layout |
1625 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1627 error "(3) unexpected status"
1630 local repaired=$($SHOW_LAYOUT |
1631 awk '/^repaired_multiple_referenced/ { print $2 }')
1632 [ $repaired -eq 1 ] ||
1633 error "(4) Fail to repair multiple references: $repaired"
1635 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1636 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1637 error "(5) Fail to write f0."
1638 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1639 [ $size -eq 1048576 ] ||
1640 error "(6) guard size should be 1048576, but got $size"
1642 run_test 17 "LFSCK can repair multiple references"
1646 echo "The target MDT-object is there, but related stripe information"
1647 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1648 echo "layout EA entries."
1651 check_mount_and_prep
1652 $LFS mkdir -i 0 $DIR/$tdir/a1
1653 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
1654 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1656 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1658 $LFS path2fid $DIR/$tdir/a1/f1
1659 $LFS getstripe $DIR/$tdir/a1/f1
1661 if [ $MDSCOUNT -ge 2 ]; then
1662 $LFS mkdir -i 1 $DIR/$tdir/a2
1663 $LFS setstripe -c 2 -i 1 -s 1M $DIR/$tdir/a2
1664 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1665 $LFS path2fid $DIR/$tdir/a2/f2
1666 $LFS getstripe $DIR/$tdir/a2/f2
1669 cancel_lru_locks osc
1671 echo "Inject failure, to make the MDT-object lost its layout EA"
1672 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1673 do_facet mds1 $LCTL set_param fail_loc=0x1615
1674 chown 1:1 $DIR/$tdir/a1/f1 # owner:group with ':'; the '.' separator is obsolescent
1676 if [ $MDSCOUNT -ge 2 ]; then
1677 do_facet mds2 $LCTL set_param fail_loc=0x1615
1678 chown 1:1 $DIR/$tdir/a2/f2 # owner:group with ':'; the '.' separator is obsolescent
1684 do_facet mds1 $LCTL set_param fail_loc=0
1685 if [ $MDSCOUNT -ge 2 ]; then
1686 do_facet mds2 $LCTL set_param fail_loc=0
1689 cancel_lru_locks mdc
1690 cancel_lru_locks osc
1692 echo "The file size should be incorrect since layout EA is lost"
1693 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1694 [ "$cur_size" != "$saved_size" ] ||
1695 error "(1) Expect incorrect file1 size"
1697 if [ $MDSCOUNT -ge 2 ]; then
1698 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1699 [ "$cur_size" != "$saved_size" ] ||
1700 error "(2) Expect incorrect file2 size"
1703 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1704 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
1706 for k in $(seq $MDSCOUNT); do
1707 # The LFSCK status query internal is 30 seconds. For the case
1708 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1709 # time to guarantee the status sync up.
1710 wait_update_facet mds${k} "$LCTL get_param -n \
1711 mdd.$(facet_svc mds${k}).lfsck_layout |
1712 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1713 error "(4) MDS${k} is not the expected 'completed'"
1716 for k in $(seq $OSTCOUNT); do
1717 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1718 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1719 awk '/^status/ { print $2 }')
1720 [ "$cur_status" == "completed" ] ||
1721 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
1724 local repaired=$(do_facet mds1 $LCTL get_param -n \
1725 mdd.$(facet_svc mds1).lfsck_layout |
1726 awk '/^repaired_orphan/ { print $2 }')
1727 [ $repaired -eq 1 ] ||
1728 error "(6.1) Expect 1 fixed on mds1, but got: $repaired"
1730 if [ $MDSCOUNT -ge 2 ]; then
1731 repaired=$(do_facet mds2 $LCTL get_param -n \
1732 mdd.$(facet_svc mds2).lfsck_layout |
1733 awk '/^repaired_orphan/ { print $2 }')
1734 [ $repaired -eq 2 ] ||
1735 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
1738 $LFS path2fid $DIR/$tdir/a1/f1
1739 $LFS getstripe $DIR/$tdir/a1/f1
1741 if [ $MDSCOUNT -ge 2 ]; then
1742 $LFS path2fid $DIR/$tdir/a2/f2
1743 $LFS getstripe $DIR/$tdir/a2/f2
1746 echo "The file size should be correct after layout LFSCK scanning"
1747 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1748 [ "$cur_size" == "$saved_size" ] ||
1749 error "(7) Expect file1 size $saved_size, but got $cur_size"
1751 if [ $MDSCOUNT -ge 2 ]; then
1752 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1753 [ "$cur_size" == "$saved_size" ] ||
1754 error "(8) Expect file2 size $saved_size, but got $cur_size"
1757 run_test 18a "Find out orphan OST-object and repair it (1)"
1761 echo "The target MDT-object is lost. The LFSCK should re-create the"
1762 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
1763 echo "can move it back to normal namespace manually."
1766 check_mount_and_prep
1767 $LFS mkdir -i 0 $DIR/$tdir/a1
1768 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
1769 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1770 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1771 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
1773 $LFS getstripe $DIR/$tdir/a1/f1
1775 if [ $MDSCOUNT -ge 2 ]; then
1776 $LFS mkdir -i 1 $DIR/$tdir/a2
1777 $LFS setstripe -c 2 -i 1 -s 1M $DIR/$tdir/a2
1778 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1779 local fid2=$($LFS path2fid $DIR/$tdir/a2/f2) # keep function-local, like fid1
1781 $LFS getstripe $DIR/$tdir/a2/f2
1784 cancel_lru_locks osc
1786 echo "Inject failure, to simulate the case of missing the MDT-object"
1787 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1788 do_facet mds1 $LCTL set_param fail_loc=0x1616
1789 rm -f $DIR/$tdir/a1/f1
1791 if [ $MDSCOUNT -ge 2 ]; then
1792 do_facet mds2 $LCTL set_param fail_loc=0x1616
1793 rm -f $DIR/$tdir/a2/f2
1799 do_facet mds1 $LCTL set_param fail_loc=0
1800 if [ $MDSCOUNT -ge 2 ]; then
1801 do_facet mds2 $LCTL set_param fail_loc=0
1804 cancel_lru_locks mdc
1805 cancel_lru_locks osc
1807 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1808 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1810 for k in $(seq $MDSCOUNT); do
1811 # The LFSCK status query internal is 30 seconds. For the case
1812 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1813 # time to guarantee the status sync up.
1814 wait_update_facet mds${k} "$LCTL get_param -n \
1815 mdd.$(facet_svc mds${k}).lfsck_layout |
1816 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1817 error "(2) MDS${k} is not the expected 'completed'"
1820 for k in $(seq $OSTCOUNT); do
1821 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1822 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1823 awk '/^status/ { print $2 }')
1824 [ "$cur_status" == "completed" ] ||
1825 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
1828 local repaired=$(do_facet mds1 $LCTL get_param -n \
1829 mdd.$(facet_svc mds1).lfsck_layout |
1830 awk '/^repaired_orphan/ { print $2 }')
1831 [ $repaired -eq 1 ] ||
1832 error "(4.1) Expect 1 fixed on mds1, but got: $repaired"
1834 if [ $MDSCOUNT -ge 2 ]; then
1835 repaired=$(do_facet mds2 $LCTL get_param -n \
1836 mdd.$(facet_svc mds2).lfsck_layout |
1837 awk '/^repaired_orphan/ { print $2 }')
1838 [ $repaired -eq 2 ] ||
1839 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
1842 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
1843 mv "$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0" $DIR/$tdir/a1/f1 || # FID path quoted: no globbing
1844 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
1846 if [ $MDSCOUNT -ge 2 ]; then
1847 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0 # embeds the FID; quote on use
1848 mv "$name" $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
1851 $LFS path2fid $DIR/$tdir/a1/f1
1852 $LFS getstripe $DIR/$tdir/a1/f1
1854 if [ $MDSCOUNT -ge 2 ]; then
1855 $LFS path2fid $DIR/$tdir/a2/f2
1856 $LFS getstripe $DIR/$tdir/a2/f2
1859 echo "The file size should be correct after layout LFSCK scanning"
1860 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1861 [ "$cur_size" == "$saved_size" ] ||
1862 error "(7) Expect file1 size $saved_size, but got $cur_size"
1864 if [ $MDSCOUNT -ge 2 ]; then
1865 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1866 [ "$cur_size" == "$saved_size" ] ||
1867 error "(8) Expect file2 size $saved_size, but got $cur_size"
1870 run_test 18b "Find out orphan OST-object and repair it (2)"
1874 echo "The target MDT-object is lost, and the OST-object FID is missing."
1875 echo "The LFSCK should re-create the MDT-object with new FID under the "
1876 echo "directory .lustre/lost+found/MDTxxxx."
1879 check_mount_and_prep
1880 $LFS mkdir -i 0 $DIR/$tdir/a1
1881 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
1883 echo "Inject failure, to simulate the case of missing parent FID"
1884 #define OBD_FAIL_LFSCK_NOPFID 0x1617
1885 do_facet ost1 $LCTL set_param fail_loc=0x1617
1887 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1888 $LFS getstripe $DIR/$tdir/a1/f1
1890 if [ $MDSCOUNT -ge 2 ]; then
1891 $LFS mkdir -i 1 $DIR/$tdir/a2
1892 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a2
1893 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1894 $LFS getstripe $DIR/$tdir/a2/f2
1897 cancel_lru_locks osc
1899 echo "Inject failure, to simulate the case of missing the MDT-object"
1900 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1901 do_facet mds1 $LCTL set_param fail_loc=0x1616
1902 rm -f $DIR/$tdir/a1/f1
1904 if [ $MDSCOUNT -ge 2 ]; then
1905 do_facet mds2 $LCTL set_param fail_loc=0x1616
1906 rm -f $DIR/$tdir/a2/f2
1912 do_facet mds1 $LCTL set_param fail_loc=0
1913 if [ $MDSCOUNT -ge 2 ]; then
1914 do_facet mds2 $LCTL set_param fail_loc=0
1917 cancel_lru_locks mdc
1918 cancel_lru_locks osc
1920 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1921 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1923 for k in $(seq $MDSCOUNT); do
1924 # The LFSCK status query internal is 30 seconds. For the case
1925 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1926 # time to guarantee the status sync up.
1927 wait_update_facet mds${k} "$LCTL get_param -n \
1928 mdd.$(facet_svc mds${k}).lfsck_layout |
1929 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1930 error "(2) MDS${k} is not the expected 'completed'"
1933 for k in $(seq $OSTCOUNT); do
1934 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1935 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1936 awk '/^status/ { print $2 }')
1937 [ "$cur_status" == "completed" ] ||
1938 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
1941 if [ $MDSCOUNT -ge 2 ]; then
1947 local repaired=$(do_facet mds1 $LCTL get_param -n \
1948 mdd.$(facet_svc mds1).lfsck_layout |
1949 awk '/^repaired_orphan/ { print $2 }')
1950 [ $repaired -eq $expected ] ||
1951 error "(4) Expect $expected fixed on mds1, but got: $repaired"
1953 if [ $MDSCOUNT -ge 2 ]; then
1954 repaired=$(do_facet mds2 $LCTL get_param -n \
1955 mdd.$(facet_svc mds2).lfsck_layout |
1956 awk '/^repaired_orphan/ { print $2 }')
1957 [ $repaired -eq 0 ] ||
1958 error "(5) Expect 0 fixed on mds2, but got: $repaired"
1961 ls -ail $MOUNT/.lustre/lost+found/
1963 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
1964 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
1965 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*') # quoted: find, not the shell, matches
1967 error "(6) .lustre/lost+found/MDT0001/ should be empty"
1970 echo "There should be some stub under .lustre/lost+found/MDT0000/"
1971 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
1972 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
1974 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*') # quoted: find, not the shell, matches
1975 [ -n "$cname" ] ||
1976 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
1978 run_test 18c "Find out orphan OST-object and repair it (3)"
# Body of the test registered below as 18d (presumably test_18d; the function
# header and closers are not part of this excerpt). Scenario: f2 is made to
# reference f1's OST-object, f1 is removed so f2 dangles, and layout LFSCK is
# expected to put the orphan OST-object back so that f2 regains its size.
# NOTE(review): this listing carries embedded numbers and omits short lines;
# everything visible is kept as found except the chown separator fix below.
1982 echo "The target MDT-object layout EA slot is occpuied by some new"
1983 echo "created OST-object when repair dangling reference case. Such"
1984 echo "conflict OST-object has never been modified. Then when found"
1985 echo "the orphan OST-object, LFSCK will replace it with the orphan"
# Fixture: directory a1 with single-stripe files f1 ("guard") and f2 ("foo").
1989 check_mount_and_prep
1991 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
1992 echo "guard" > $DIR/$tdir/a1/f1
1993 echo "foo" > $DIR/$tdir/a1/f2
# Remember f2's size (6th column of 'ls -il') for the before/after checks.
1994 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
# Print FIDs and layouts of both files for the test log.
1995 $LFS path2fid $DIR/$tdir/a1/f1
1996 $LFS getstripe $DIR/$tdir/a1/f1
1997 $LFS path2fid $DIR/$tdir/a1/f2
1998 $LFS getstripe $DIR/$tdir/a1/f2
1999 cancel_lru_locks osc
2001 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2002 echo "to reference the same OST-object (which is f1's OST-obejct)."
2003 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2004 echo "dangling reference case, but f2's old OST-object is there."
2007 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2008 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
# The chown runs while fail_loc 0x1618 is armed. Use the ':' owner:group
# separator; the old '.' form is a deprecated compatibility syntax.
2009 chown 1:1 $DIR/$tdir/a1/f2
2010 rm -f $DIR/$tdir/a1/f1
2013 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2015 echo "stopall to cleanup object cache"
2018 setupall > /dev/null
2020 echo "The file size should be incorrect since dangling referenced"
2021 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2022 [ "$cur_size" != "$saved_size" ] ||
2023 error "(1) Expect incorrect file2 size"
2025 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2026 do_facet $SINGLEMDS $LCTL set_param fail_val=5 fail_loc=0x1602
2028 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2029 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait until mds1 reports 'scanning-phase2' before clearing the delay.
2031 wait_update_facet mds1 "$LCTL get_param -n \
2032 mdd.$(facet_svc mds1).lfsck_layout |
2033 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2034 error "(3.0) MDS1 is not the expected 'scanning-phase2'"
2036 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Every MDS must reach 'completed' within $LTIME.
2038 for k in $(seq $MDSCOUNT); do
2039 # The LFSCK status query internal is 30 seconds. For the case
2040 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2041 # time to guarantee the status sync up.
2042 wait_update_facet mds${k} "$LCTL get_param -n \
2043 mdd.$(facet_svc mds${k}).lfsck_layout |
2044 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2045 error "(3) MDS${k} is not the expected 'completed'"
# Every OST is checked once (no waiting) for 'completed'.
2048 for k in $(seq $OSTCOUNT); do
2049 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2050 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2051 awk '/^status/ { print $2 }')
2052 [ "$cur_status" == "completed" ] ||
2053 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Exactly one orphan is expected to be counted as repaired on $SINGLEMDS.
2056 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2057 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2058 awk '/^repaired_orphan/ { print $2 }')
2059 [ $repaired -eq 1 ] ||
2060 error "(5) Expect 1 orphan has been fixed, but got: $repaired"
2062 echo "The file size should be correct after layout LFSCK scanning"
2063 cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2064 [ "$cur_size" == "$saved_size" ] ||
2065 error "(6) Expect file2 size $saved_size, but got $cur_size"
2067 echo "The LFSCK should find back the original data."
2068 cat $DIR/$tdir/a1/f2
2069 $LFS path2fid $DIR/$tdir/a1/f2
2070 $LFS getstripe $DIR/$tdir/a1/f2
2072 run_test 18d "Find out orphan OST-object and repair it (4)"
# Body of the test registered below as 18e (presumably test_18e; the function
# header and closers are not part of this excerpt). Scenario: as in 18d, f2 is
# left dangling, but f2 is appended to while LFSCK is held in phase 2, so the
# old orphan OST-object is expected to surface as a "-C-" stub under
# .lustre/lost+found/MDT0000/ while f2 keeps the new data.
# NOTE(review): this listing carries embedded numbers and omits short lines;
# everything visible is kept as found except the two fixes marked below.
2076 echo "The target MDT-object layout EA slot is occpuied by some new"
2077 echo "created OST-object when repair dangling reference case. Such"
2078 echo "conflict OST-object has been modified by others. To keep the"
2079 echo "new data, the LFSCK will create a new file to refernece this"
2080 echo "old orphan OST-object."
# Fixture: directory a1 with single-stripe files f1 ("guard") and f2 ("foo").
2083 check_mount_and_prep
2085 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
2086 echo "guard" > $DIR/$tdir/a1/f1
2087 echo "foo" > $DIR/$tdir/a1/f2
# Remember f2's size (6th column of 'ls -il') for the later comparisons.
2088 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2089 $LFS path2fid $DIR/$tdir/a1/f1
2090 $LFS getstripe $DIR/$tdir/a1/f1
2091 $LFS path2fid $DIR/$tdir/a1/f2
2092 $LFS getstripe $DIR/$tdir/a1/f2
2093 cancel_lru_locks osc
2095 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2096 echo "to reference the same OST-object (which is f1's OST-obejct)."
2097 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2098 echo "dangling reference case, but f2's old OST-object is there."
2101 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2102 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
# The chown runs while fail_loc 0x1618 is armed. Use the ':' owner:group
# separator; the old '.' form is a deprecated compatibility syntax.
2103 chown 1:1 $DIR/$tdir/a1/f2
2104 rm -f $DIR/$tdir/a1/f1
2107 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2109 echo "stopall to cleanup object cache"
2112 setupall > /dev/null
2114 echo "The file size should be incorrect since dangling referenced"
2115 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2116 [ "$cur_size" != "$saved_size" ] ||
2117 error "(1) Expect incorrect file2 size"
2119 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2120 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2122 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2123 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait until mds1 reports 'scanning-phase2'; f2 is modified while the delay
# (fail_loc 0x1602) is still armed.
2125 wait_update_facet mds1 "$LCTL get_param -n \
2126 mdd.$(facet_svc mds1).lfsck_layout |
2127 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2128 error "(3) MDS1 is not the expected 'scanning-phase2'"
2130 # to guarantee all updates are synced.
2134 echo "Write new data to f2 to modify the new created OST-object."
2135 echo "dummy" >> $DIR/$tdir/a1/f2
2137 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Every MDS must reach 'completed' within $LTIME.
2139 for k in $(seq $MDSCOUNT); do
2140 # The LFSCK status query internal is 30 seconds. For the case
2141 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2142 # time to guarantee the status sync up.
2143 wait_update_facet mds${k} "$LCTL get_param -n \
2144 mdd.$(facet_svc mds${k}).lfsck_layout |
2145 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2146 error "(4) MDS${k} is not the expected 'completed'"
# Every OST is checked once (no waiting) for 'completed'.
2149 for k in $(seq $OSTCOUNT); do
2150 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2151 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2152 awk '/^status/ { print $2 }')
2153 [ "$cur_status" == "completed" ] ||
2154 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Exactly one orphan is expected to be counted as repaired on $SINGLEMDS.
2157 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2158 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2159 awk '/^repaired_orphan/ { print $2 }')
2160 [ $repaired -eq 1 ] ||
2161 error "(6) Expect 1 orphan has been fixed, but got: $repaired"
2163 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2164 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2165 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The -name pattern is quoted so that find receives it verbatim; unquoted, the
# shell could expand it against the current directory before find runs.
2167 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*")
2168 [ ! -z "$cname" ] ||
2169 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2171 echo "The stub file should keep the original f2 data"
2172 cur_size=$(ls -il $cname | awk '{ print $6 }')
2173 [ "$cur_size" == "$saved_size" ] ||
2174 error "(9) Expect file2 size $saved_size, but got $cur_size"
2177 $LFS path2fid $cname
2178 $LFS getstripe $cname
2180 echo "The f2 should contains new data."
2181 cat $DIR/$tdir/a1/f2
2182 $LFS path2fid $DIR/$tdir/a1/f2
2183 $LFS getstripe $DIR/$tdir/a1/f2
2185 run_test 18e "Find out orphan OST-object and repair it (5)"
# Body of the test registered below as 18f (presumably test_18f; the function
# header and closers are not part of this excerpt). Scenario: MDT-objects are
# dropped via fail_loc 0x1616, then layout LFSCK runs once with fail_loc 0x161c
# armed on mds1 (expected statuses: MDSes 'partial', ost1 'partial', ost2
# 'completed') and once more without it (everything 'completed').
# NOTE(review): this listing carries embedded numbers and omits short lines;
# all visible code is left untouched.
# Skip unless at least two OSTs are configured.
2188 [ $OSTCOUNT -lt 2 ] &&
2189 skip "The test needs at least 2 OSTs" && return
2192 echo "The target MDT-object is lost. The LFSCK should re-create the"
2193 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2194 echo "to verify some OST-object(s) during the first stage-scanning,"
2195 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2196 echo "should not be affected."
# Fixture on MDT index 0: a1 (1 stripe: guard, f1) and a2 (2 stripes: f2).
2199 check_mount_and_prep
2200 $LFS mkdir -i 0 $DIR/$tdir/a1
2201 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
2202 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2203 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2204 $LFS mkdir -i 0 $DIR/$tdir/a2
2205 $LFS setstripe -c 2 -i 0 -s 1M $DIR/$tdir/a2
2206 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2207 $LFS getstripe $DIR/$tdir/a1/f1
2208 $LFS getstripe $DIR/$tdir/a2/f2
# With two or more MDSes, mirror the fixture on MDT index 1 (a3, a4).
2210 if [ $MDSCOUNT -ge 2 ]; then
2211 $LFS mkdir -i 1 $DIR/$tdir/a3
2212 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a3
2213 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2214 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2215 $LFS mkdir -i 1 $DIR/$tdir/a4
2216 $LFS setstripe -c 2 -i 0 -s 1M $DIR/$tdir/a4
2217 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2218 $LFS getstripe $DIR/$tdir/a3/f3
2219 $LFS getstripe $DIR/$tdir/a4/f4
2222 cancel_lru_locks osc
2224 echo "Inject failure, to simulate the case of missing the MDT-object"
2225 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2226 do_facet mds1 $LCTL set_param fail_loc=0x1616
2227 rm -f $DIR/$tdir/a1/f1
2228 rm -f $DIR/$tdir/a2/f2
2230 if [ $MDSCOUNT -ge 2 ]; then
2231 do_facet mds2 $LCTL set_param fail_loc=0x1616
2232 rm -f $DIR/$tdir/a3/f3
2233 rm -f $DIR/$tdir/a4/f4
# Disarm the fail_loc on every MDS that had it set.
2239 do_facet mds1 $LCTL set_param fail_loc=0
2240 if [ $MDSCOUNT -ge 2 ]; then
2241 do_facet mds2 $LCTL set_param fail_loc=0
2244 cancel_lru_locks mdc
2245 cancel_lru_locks osc
2247 echo "Inject failure, to simulate the OST0 fail to handle"
2248 echo "MDT0 LFSCK request during the first-stage scanning."
2249 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2250 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2252 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2253 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# First run: every MDS is expected to end up 'partial'.
2255 for k in $(seq $MDSCOUNT); do
2256 # The LFSCK status query internal is 30 seconds. For the case
2257 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2258 # time to guarantee the status sync up.
2259 wait_update_facet mds${k} "$LCTL get_param -n \
2260 mdd.$(facet_svc mds${k}).lfsck_layout |
2261 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2262 error "(2) MDS${k} is not the expected 'partial'"
# ost1 is expected to be 'partial', ost2 'completed'.
2265 wait_update_facet ost1 "$LCTL get_param -n \
2266 obdfilter.$(facet_svc ost1).lfsck_layout |
2267 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2268 error "(3) OST1 is not the expected 'partial'"
2271 wait_update_facet ost2 "$LCTL get_param -n \
2272 obdfilter.$(facet_svc ost2).lfsck_layout |
2273 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2274 error "(4) OST2 is not the expected 'completed'"
2277 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the first run each MDS should report exactly one repaired orphan.
# NOTE(review): the messages below print the literal text "mds{1}"/"mds{2}"
# (there is no '$' in front of the braces).
2279 local repaired=$(do_facet mds1 $LCTL get_param -n \
2280 mdd.$(facet_svc mds1).lfsck_layout |
2281 awk '/^repaired_orphan/ { print $2 }')
2282 [ $repaired -eq 1 ] ||
2283 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2285 if [ $MDSCOUNT -ge 2 ]; then
2286 repaired=$(do_facet mds2 $LCTL get_param -n \
2287 mdd.$(facet_svc mds2).lfsck_layout |
2288 awk '/^repaired_orphan/ { print $2 }')
2289 [ $repaired -eq 1 ] ||
2290 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
2293 echo "Trigger layout LFSCK on all devices again to cleanup"
2294 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
# Second run: every MDS must reach 'completed' within $LTIME.
2296 for k in $(seq $MDSCOUNT); do
2297 # The LFSCK status query internal is 30 seconds. For the case
2298 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2299 # time to guarantee the status sync up.
2300 wait_update_facet mds${k} "$LCTL get_param -n \
2301 mdd.$(facet_svc mds${k}).lfsck_layout |
2302 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2303 error "(8) MDS${k} is not the expected 'completed'"
# Every OST is checked once (no waiting) for 'completed'.
2306 for k in $(seq $OSTCOUNT); do
2307 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2308 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2309 awk '/^status/ { print $2 }')
2310 [ "$cur_status" == "completed" ] ||
2311 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# After the second run each MDS should report two repaired orphans.
# NOTE(review): 'repaired' is declared with 'local' a second time here.
2315 local repaired=$(do_facet mds1 $LCTL get_param -n \
2316 mdd.$(facet_svc mds1).lfsck_layout |
2317 awk '/^repaired_orphan/ { print $2 }')
2318 [ $repaired -eq 2 ] ||
2319 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2321 if [ $MDSCOUNT -ge 2 ]; then
2322 repaired=$(do_facet mds2 $LCTL get_param -n \
2323 mdd.$(facet_svc mds2).lfsck_layout |
2324 awk '/^repaired_orphan/ { print $2 }')
2325 [ $repaired -eq 2 ] ||
2326 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2329 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Body of the test registered below as 19a (presumably test_19a; the function
# header and closer are not part of this excerpt). It turns on
# lfsck_verify_pfid on OST0000, arms fail_loc 0x1619 on the client and expects
# a read of a0 to fail.
# NOTE(review): this listing carries embedded numbers and omits short lines;
# all visible code is left untouched.
2332 check_mount_and_prep
2333 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2335 echo "foo" > $DIR/$tdir/a0
2336 echo "guard" > $DIR/$tdir/a1
2337 cancel_lru_locks osc
2339 echo "Inject failure, then client will offer wrong parent FID when read"
2340 do_facet ost1 $LCTL set_param -n \
2341 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2342 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2343 $LCTL set_param fail_loc=0x1619
2345 echo "Read RPC with wrong parent FID should be denied"
# A successful cat is the failure case here.
2346 cat $DIR/$tdir/a0 && error "(3) Read should be denied!"
2347 $LCTL set_param fail_loc=0
2349 run_test 19a "OST-object inconsistency self detect"
# Body of the test registered below as 19b (presumably test_19b; the function
# header and closer are not part of this excerpt). It writes f0 while fail_loc
# 0x1611 is armed on ost1, checks that the 'repaired' counter read from
# lfsck_verify_pfid is 0, then sets lfsck_verify_pfid to 1, reads f0 and
# expects the counter to become 1.
# NOTE(review): this listing carries embedded numbers and omits short lines;
# all visible code is left untouched.
2352 check_mount_and_prep
2353 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2355 echo "Inject failure stub to make the OST-object to back point to"
2356 echo "non-exist MDT-object"
2358 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2359 do_facet ost1 $LCTL set_param fail_loc=0x1611
2360 echo "foo" > $DIR/$tdir/f0
2361 cancel_lru_locks osc
2362 do_facet ost1 $LCTL set_param fail_loc=0
2364 echo "Nothing should be fixed since self detect and repair is disabled"
2365 local repaired=$(do_facet ost1 $LCTL get_param -n \
2366 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2367 awk '/^repaired/ { print $2 }')
2368 [ $repaired -eq 0 ] ||
2369 error "(1) Expected 0 repaired, but got $repaired"
2371 echo "Read RPC with right parent FID should be accepted,"
2372 echo "and cause parent FID on OST to be fixed"
2374 do_facet ost1 $LCTL set_param -n \
2375 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2376 cat $DIR/$tdir/f0 || error "(2) Read should not be denied!"
# Re-read the counter; one repair is expected after the read above.
2378 repaired=$(do_facet ost1 $LCTL get_param -n \
2379 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2380 awk '/^repaired/ { print $2 }')
2381 [ $repaired -eq 1 ] ||
2382 error "(3) Expected 1 repaired, but got $repaired"
2384 run_test 19b "OST-object inconsistency self repair"
# Body of the test registered below as 20 (presumably test_20; the function
# header and closers are not part of this excerpt). It creates striped files
# f0..f2 (plus f3 when more than two OSTs exist), removes them under fail_loc
# 0x1616 / 0x161a with different fail_val values, runs layout LFSCK, and then
# inspects the "${fidN}-R-0" entries under .lustre/lost+found/MDT0000/:
# stripe pattern (LOV_PATTERN_F_HOLE flag), stripe count, size, and which
# read/write/chown/touch/unlink operations succeed or fail.
# NOTE(review): this listing carries embedded numbers and omits short lines
# (for example the assignment of 'bcount' and the else/fi lines of the
# OSTCOUNT branches are not visible); all visible code is left untouched.
# Skip unless at least two OSTs are configured.
2387 [ $OSTCOUNT -lt 2 ] &&
2388 skip "The test needs at least 2 OSTs" && return
2391 echo "The target MDT-object and some of its OST-object are lost."
2392 echo "The LFSCK should find out the left OST-objects and re-create"
2393 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2394 echo "with the partial OST-objects (LOV EA hole)."
2396 echo "New client can access the file with LOV EA hole via normal"
2397 echo "system tools or commands without crash the system."
2399 echo "For old client, even though it cannot access the file with"
2400 echo "LOV EA hole, it should not cause the system crash."
# Fixture: a1 on MDT index 0, striped over 3 OSTs when available, else 2.
2403 check_mount_and_prep
2404 $LFS mkdir -i 0 $DIR/$tdir/a1
2405 if [ $OSTCOUNT -gt 2 ]; then
2406 $LFS setstripe -c 3 -i 0 -s 1M $DIR/$tdir/a1
2409 $LFS setstripe -c 2 -i 0 -s 1M $DIR/$tdir/a1
2413 # 256 blocks on the stripe0.
2414 # 1 block on the stripe1 for 2 OSTs case.
2415 # 256 blocks on the stripe1 for other cases.
2416 # 1 block on the stripe2 if OSTs > 2
# Each file is $bcount blocks of 4096 bytes (bcount is set outside this view).
2417 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2418 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2419 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# The FIDs are recorded now because the lost+found names below embed them.
2421 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2422 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2423 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2426 $LFS getstripe $DIR/$tdir/a1/f0
2428 $LFS getstripe $DIR/$tdir/a1/f1
2430 $LFS getstripe $DIR/$tdir/a1/f2
2432 if [ $OSTCOUNT -gt 2 ]; then
2433 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
2434 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2436 $LFS getstripe $DIR/$tdir/a1/f3
2439 cancel_lru_locks osc
2441 echo "Inject failure..."
2442 echo "To simulate f0 lost MDT-object"
2443 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2444 do_facet mds1 $LCTL set_param fail_loc=0x1616
2445 rm -f $DIR/$tdir/a1/f0
2447 echo "To simulate f1 lost MDT-object and OST-object0"
2448 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
2449 do_facet mds1 $LCTL set_param fail_loc=0x161a
2450 rm -f $DIR/$tdir/a1/f1
2452 echo "To simulate f2 lost MDT-object and OST-object1"
2453 do_facet mds1 $LCTL set_param fail_val=1
2454 rm -f $DIR/$tdir/a1/f2
2456 if [ $OSTCOUNT -gt 2 ]; then
2457 echo "To simulate f3 lost MDT-object and OST-object2"
2458 do_facet mds1 $LCTL set_param fail_val=2
2459 rm -f $DIR/$tdir/a1/f3
# The client stays unmounted during the LFSCK run and is remounted afterwards.
2462 umount_client $MOUNT
2465 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2467 echo "Inject failure to slow down the LFSCK on OST0"
2468 #define OBD_FAIL_LFSCK_DELAY5 0x161b
2469 do_facet ost1 $LCTL set_param fail_loc=0x161b
2471 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2472 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2475 do_facet ost1 $LCTL set_param fail_loc=0
# Every MDS must reach 'completed' within 32 seconds.
2477 for k in $(seq $MDSCOUNT); do
2478 # The LFSCK status query internal is 30 seconds. For the case
2479 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2480 # time to guarantee the status sync up.
2481 wait_update_facet mds${k} "$LCTL get_param -n \
2482 mdd.$(facet_svc mds${k}).lfsck_layout |
2483 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2484 error "(2) MDS${k} is not the expected 'completed'"
# Every OST is checked once (no waiting) for 'completed'.
2487 for k in $(seq $OSTCOUNT); do
2488 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2489 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2490 awk '/^status/ { print $2 }')
2491 [ "$cur_status" == "completed" ] ||
2492 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan on mds1: 9 with more than two OSTs, otherwise 4.
2495 local repaired=$(do_facet mds1 $LCTL get_param -n \
2496 mdd.$(facet_svc mds1).lfsck_layout |
2497 awk '/^repaired_orphan/ { print $2 }')
2498 if [ $OSTCOUNT -gt 2 ]; then
2499 [ $repaired -eq 9 ] ||
2500 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
2502 [ $repaired -eq 4 ] ||
2503 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
2506 mount_client $MOUNT || error "(5.0) Fail to start client!"
# Bit tested against the value printed by 'lfs getstripe -L'; note that it is
# assigned without 'local'.
2508 LOV_PATTERN_F_HOLE=0x40000000
2511 # ${fid0}-R-0 is the old f0
# Checks 5.x: no hole flag, full stripe count and size, and all ordinary
# operations on the entry must succeed.
2513 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
2514 echo "Check $name, which is the old f0"
2516 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
2518 local pattern=0x$($LFS getstripe -L $name)
2519 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2520 error "(5.2) NOT expect pattern flag hole, but got $pattern"
2522 local stripes=$($LFS getstripe -c $name)
2523 if [ $OSTCOUNT -gt 2 ]; then
2524 [ $stripes -eq 3 ] ||
2525 error "(5.3.1) expect the stripe count is 3, but got $stripes"
2527 [ $stripes -eq 2 ] ||
2528 error "(5.3.2) expect the stripe count is 2, but got $stripes"
2531 local size=$(stat $name | awk '/Size:/ { print $2 }')
2532 [ $size -eq $((4096 * $bcount)) ] ||
2533 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
2535 cat $name > /dev/null || error "(5.5) cannot read $name"
2537 echo "dummy" >> $name || error "(5.6) cannot write $name"
2539 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
2541 touch $name || error "(5.8) cannot touch $name"
2543 rm -f $name || error "(5.9) cannot unlink $name"
2546 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
# Checks 6.x: the hole flag must be set; a plain read, a write at offset 0 and
# an append must fail, while a write at block 300, chown, touch and unlink
# must succeed.
2548 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2549 if [ $OSTCOUNT -gt 2 ]; then
2550 echo "Check $name, it contains the old f1's stripe1 and stripe2"
2552 echo "Check $name, it contains the old f1's stripe1"
2555 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
2557 pattern=0x$($LFS getstripe -L $name)
2558 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2559 error "(6.2) expect pattern flag hole, but got $pattern"
2561 stripes=$($LFS getstripe -c $name)
2562 if [ $OSTCOUNT -gt 2 ]; then
2563 [ $stripes -eq 3 ] ||
2564 error "(6.3.1) expect the stripe count is 3, but got $stripes"
2566 [ $stripes -eq 2 ] ||
2567 error "(6.3.2) expect the stripe count is 2, but got $stripes"
2570 size=$(stat $name | awk '/Size:/ { print $2 }')
2571 [ $size -eq $((4096 * $bcount)) ] ||
2572 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
2574 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# Copy with conv=sync,noerror and count the "Input/output error" lines that dd
# prints; 256 are expected.
2576 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2577 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2580 [ $failures -eq 256 ] ||
2581 error "(6.6) expect 256 IO failures, but get $failures"
2583 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2584 [ $size -eq $((4096 * $bcount)) ] ||
2585 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
2587 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
2588 error "(6.8) write to the LOV EA hole should fail"
2590 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
2591 error "(6.9) write to normal stripe should NOT fail"
2593 echo "foo" >> $name && error "(6.10) append write $name should fail"
2595 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
2597 touch $name || error "(6.12) cannot touch $name"
2599 rm -f $name || error "(6.13) cannot unlink $name"
2602 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
# Checks 7.x: with more than two OSTs the entry is expected to carry the hole
# flag (x.1 checks); the x.2 checks expect no hole, one stripe and 256 blocks.
2604 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
2605 if [ $OSTCOUNT -gt 2 ]; then
2606 echo "Check $name, it contains the old f2's stripe0 and stripe2"
2608 echo "Check $name, it contains the old f2's stripe0"
2611 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
2613 pattern=0x$($LFS getstripe -L $name)
2614 stripes=$($LFS getstripe -c $name)
2615 size=$(stat $name | awk '/Size:/ { print $2 }')
2616 if [ $OSTCOUNT -gt 2 ]; then
2617 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2618 error "(7.2.1) expect pattern flag hole, but got $pattern"
2620 [ $stripes -eq 3 ] ||
2621 error "(7.3.1) expect the stripe count is 3, but got $stripes"
2623 [ $size -eq $((4096 * $bcount)) ] ||
2624 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
2626 cat $name > /dev/null &&
2627 error "(7.5.1) normal read $name should fail"
2629 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2630 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2632 [ $failures -eq 256 ] ||
2633 error "(7.6) expect 256 IO failures, but get $failures"
2635 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2636 [ $size -eq $((4096 * $bcount)) ] ||
2637 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
2639 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
2640 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
2642 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
2643 error "(7.8.1) write to normal stripe should NOT fail"
2645 echo "foo" >> $name &&
2646 error "(7.8.3) append write $name should fail"
2648 chown $RUNAS_ID:$RUNAS_GID $name ||
2649 error "(7.9.1) cannot chown on $name"
2651 touch $name || error "(7.10.1) cannot touch $name"
2653 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2654 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
2656 [ $stripes -eq 1 ] ||
2657 error "(7.3.2) expect the stripe count is 1, but got $stripes"
2660 [ $size -eq $((4096 * (256 + 0))) ] ||
2661 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
2663 cat $name > /dev/null || error "(7.5.2) cannot read $name"
2665 echo "dummy" >> $name || error "(7.8.2) cannot write $name"
2667 chown $RUNAS_ID:$RUNAS_GID $name ||
2668 error "(7.9.2) cannot chown on $name"
2670 touch $name || error "(7.10.2) cannot touch $name"
2673 rm -f $name || error "(7.11) cannot unlink $name"
# The f3 checks below only apply when more than two OSTs are configured.
2675 [ $OSTCOUNT -le 2 ] && return
2678 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
# Checks 8.x: no hole flag, two stripes, 512 blocks, and all ordinary
# operations must succeed.
2680 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2681 echo "Check $name, which contains the old f3's stripe0 and stripe1"
2683 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
2685 pattern=0x$($LFS getstripe -L $name)
2686 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2687 error "(8.2) NOT expect pattern flag hole, but got $pattern"
2689 stripes=$($LFS getstripe -c $name)
2690 # LFSCK does not know the old f3 had 3 stripes.
2691 # It only tries to find as much as possible.
2692 # The stripe count depends on the last stripe's offset.
2693 [ $stripes -eq 2 ] ||
2694 error "(8.3) expect the stripe count is 2, but got $stripes"
2696 size=$(stat $name | awk '/Size:/ { print $2 }')
2698 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
2699 error "(8.4) expect the size $((4096 * 512)), but got $size"
2701 cat $name > /dev/null || error "(8.5) cannot read $name"
2703 echo "dummy" >> $name || error "(8.6) cannot write $name"
2705 chown $RUNAS_ID:$RUNAS_GID $name ||
2706 error "(8.7) cannot chown on $name"
2708 touch $name || error "(8.8) cannot touch $name"
2710 rm -f $name || error "(8.9) cannot unlink $name"
2712 run_test 20 "Handle the orphan with dummy LOV EA slot properly"
# Body of the test registered below as 21 (presumably test_21; the function
# header and closer are not part of this excerpt). It starts LFSCK on MDT0000
# without a -t type option and checks that both the namespace and the layout
# component report 'scanning-phase1', then stops LFSCK.
# NOTE(review): this listing carries embedded numbers and omits short lines;
# all visible code is left untouched.
# Skip when the MDS version is below 2.5.59.
2715 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
2716 skip "ignore the test if MDS is older than 2.5.59" && return
2718 check_mount_and_prep
2719 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
2721 echo "Start all LFSCK components by default (-s 1)"
2722 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
2723 error "Fail to start LFSCK"
# The status is sampled once, right after the start command returns.
2725 echo "namespace LFSCK should be in 'scanning-phase1' status"
2726 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
2727 [ "$STATUS" == "scanning-phase1" ] ||
2728 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
2730 echo "layout LFSCK should be in 'scanning-phase1' status"
2731 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
2732 [ "$STATUS" == "scanning-phase1" ] ||
2733 error "Expect layout 'scanning-phase1', but got '$STATUS'"
2735 echo "Stop all LFSCK components by default"
2736 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
2737 error "Fail to stop LFSCK"
2739 run_test 21 "run all LFSCK components by default"
# Body of the test registered below as 22a (presumably test_22a; the function
# header and closers are not part of this excerpt). It creates foo/dummy on
# MDT index 0 while fail_loc 0x161e is armed, removes the guard directory,
# runs namespace LFSCK and expects unmatched_pairs_repaired to be 1 and 'ls'
# on foo/dummy to work.
# NOTE(review): this listing carries embedded numbers and omits short lines;
# everything visible is kept as found except the quoting fix marked below.
# Skip unless at least two MDSes are configured.
2742 [ $MDSCOUNT -lt 2 ] &&
2743 skip "We need at least 2 MDSes for this test" && return
2746 echo "The parent_A references the child directory via some name entry,"
2747 echo "but the child directory back references another parent_B via its"
# The double quotes around .. are escaped so they are printed; written as
# "".." the shell closed and reopened the string and dropped them.
2748 echo "\"..\" name entry. The parent_A does not exist. Then the namesapce"
2749 echo "LFSCK will repair the child directory's \"..\" name entry."
2752 check_mount_and_prep
2754 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2755 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2757 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2758 echo "The dummy's dotdot name entry references the guard."
2759 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2760 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2761 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2762 error "(3) Fail to mkdir on MDT0"
2763 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2765 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
2767 echo "Trigger namespace LFSCK to repair unmatched pairs"
2768 $START_NAMESPACE -A -r ||
2769 error "(5) Fail to start LFSCK for namespace"
# Wait up to 32 seconds for the namespace LFSCK to report 'completed'.
2771 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2772 mdd.${MDT_DEV}.lfsck_namespace |
2773 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2775 error "(6) unexpected status"
2778 local repaired=$($SHOW_NAMESPACE |
2779 awk '/^unmatched_pairs_repaired/ { print $2 }')
2780 [ $repaired -eq 1 ] ||
2781 error "(7) Fail to repair unmatched pairs: $repaired"
2783 echo "'ls' should success after namespace LFSCK repairing"
2784 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
2785 error "(8) ls should success."
2787 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Body of the test registered below as 22b (presumably test_22b; the function
# header and closers are not part of this excerpt). It creates foo/dummy on
# MDT index 0 while fail_loc 0x161f is armed, checks that fid2path does not
# return the real path, runs namespace LFSCK, and expects
# unmatched_pairs_repaired to be 1 and fid2path to return the real path.
# NOTE(review): this listing carries embedded numbers and omits short lines;
# everything visible is kept as found except the fixes marked below.
# Skip unless at least two MDSes are configured.
2790 [ $MDSCOUNT -lt 2 ] &&
2791 skip "We need at least 2 MDSes for this test" && return
2794 echo "The parent_A references the child directory via the name entry_B,"
2795 echo "but the child directory back references another parent_C via its"
# The double quotes around .. are escaped so they are printed; written as
# "".." the shell closed and reopened the string and dropped them.
2796 echo "\"..\" name entry. The parent_C exists, but there is no the name"
2797 echo "entry_B under the parent_B. Then the namesapce LFSCK will repair"
2798 echo "the child directory's \"..\" name entry and its linkEA."
2801 check_mount_and_prep
2803 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2804 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2806 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2807 echo "and bad linkEA. The dummy's dotdot name entry references the"
2808 echo "guard. The dummy's linkEA references n non-exist name entry."
2809 #define OBD_FAIL_LFSCK_BAD_PARENT2 0x161f
2810 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161f
2811 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2812 error "(3) Fail to mkdir on MDT0"
2813 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Before the repair, fid2path must not resolve to the real path.
2815 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
2816 echo "fid2path should NOT work on the dummy's FID $dummyfid"
2817 local dummyname=$($LFS fid2path $DIR $dummyfid)
2818 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
2819 error "(4) fid2path works unexpectedly."
2821 echo "Trigger namespace LFSCK to repair unmatched pairs"
2822 $START_NAMESPACE -A -r ||
2823 error "(5) Fail to start LFSCK for namespace"
# Wait up to 32 seconds for the namespace LFSCK to report 'completed'.
2825 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2826 mdd.${MDT_DEV}.lfsck_namespace |
2827 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2829 error "(6) unexpected status"
2832 local repaired=$($SHOW_NAMESPACE |
2833 awk '/^unmatched_pairs_repaired/ { print $2 }')
2834 [ $repaired -eq 1 ] ||
2835 error "(7) Fail to repair unmatched pairs: $repaired"
2837 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
# dummyname was already declared local above, so it is only reassigned here.
2838 dummyname=$($LFS fid2path $DIR $dummyfid)
2839 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
2840 error "(8) fid2path does not work"
2842 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Body of the test registered below as 23a (presumably test_23a; the function
# header and closers are not part of this excerpt). It removes d0/d1 while
# fail_loc 0x1620 is armed on mds2, runs namespace LFSCK once with "-A -r"
# (ls on d0/d1 is still expected to fail) and once more with "-A -r -C"
# (ls is expected to succeed); dangling_repaired must be 1 after each run.
# NOTE(review): this listing carries embedded numbers and omits short lines;
# all visible code is left untouched.
# Skip unless at least two MDSes are configured.
2845 [ $MDSCOUNT -lt 2 ] &&
2846 skip "We need at least 2 MDSes for this test" && return
2849 echo "The name entry is there, but the MDT-object for such name "
2850 echo "entry does not exist. The namespace LFSCK should find out "
2851 echo "and repair the inconsistency as required."
2854 check_mount_and_prep
2856 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
2857 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
2859 echo "Inject failure stub on MDT1 to simulate dangling name entry"
2860 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
2861 do_facet mds2 $LCTL set_param fail_loc=0x1620
2862 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
2863 do_facet mds2 $LCTL set_param fail_loc=0
2865 echo "'ls' should fail because of dangling name entry"
2866 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
2868 echo "Trigger namespace LFSCK to find out dangling name entry"
2869 $START_NAMESPACE -A -r ||
2870 error "(5) Fail to start LFSCK for namespace"
# Wait up to 32 seconds for the namespace LFSCK to report 'completed'.
2872 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2873 mdd.${MDT_DEV}.lfsck_namespace |
2874 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2876 error "(6) unexpected status"
2879 local repaired=$($SHOW_NAMESPACE |
2880 awk '/^dangling_repaired/ { print $2 }')
2881 [ $repaired -eq 1 ] ||
2882 error "(7) Fail to repair dangling name entry: $repaired"
2884 echo "'ls' should fail because not re-create MDT-object by default"
2885 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
# Second run adds -C to the start command.
2887 echo "Trigger namespace LFSCK again to repair dangling name entry"
2888 $START_NAMESPACE -A -r -C ||
2889 error "(9) Fail to start LFSCK for namespace"
2891 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2892 mdd.${MDT_DEV}.lfsck_namespace |
2893 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2895 error "(10) unexpected status"
2898 repaired=$($SHOW_NAMESPACE |
2899 awk '/^dangling_repaired/ { print $2 }')
2900 [ $repaired -eq 1 ] ||
2901 error "(11) Fail to repair dangling name entry: $repaired"
2903 echo "'ls' should success after namespace LFSCK repairing"
2904 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
2906 run_test 23a "LFSCK can repair dangling name entry (1)"
# Body of the test registered below as 23b (presumably test_23b; the function
# header and closers are not part of this excerpt). It hard-links f0 as foo
# while fail_loc 0x1621 is armed, removes f1, runs namespace LFSCK with
# "-r -C", and expects dangling_repaired and multiple_linked_repaired to be 1
# and foo to contain f0's data ("dummy").
# NOTE(review): this listing carries embedded numbers and omits short lines;
# all visible code is left untouched.
2910 echo "The objectA has multiple hard links, one of them corresponding"
2911 echo "to the name entry_B. But there is something wrong for the name"
2912 echo "entry_B and cause entry_B to references non-exist object_C."
2913 echo "In the first-stage scanning, the LFSCK will think the entry_B"
2914 echo "as dangling, and re-create the lost object_C. When the LFSCK"
2915 echo "comes to the second-stage scanning, it will find that the"
2916 echo "former re-creating object_C is not proper, and will try to"
2917 echo "replace the object_C with the real object_A."
2920 check_mount_and_prep
2922 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
2923 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
2924 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
2926 echo "Inject failure stub on MDT0 to simulate dangling name entry"
2927 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
2928 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
2929 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
2930 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2932 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
2934 echo "'ls' should fail because of dangling name entry"
2935 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
2936 error "(6) ls should fail."
2938 echo "Trigger namespace LFSCK to find out dangling name entry"
2939 $START_NAMESPACE -r -C ||
2940 error "(7) Fail to start LFSCK for namespace"
# Wait up to 32 seconds for the namespace LFSCK to report 'completed'.
2942 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2943 mdd.${MDT_DEV}.lfsck_namespace |
2944 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2946 error "(8) unexpected status"
2949 local repaired=$($SHOW_NAMESPACE |
2950 awk '/^dangling_repaired/ { print $2 }')
2951 [ $repaired -eq 1 ] ||
2952 error "(9) Fail to repair dangling name entry: $repaired"
2954 repaired=$($SHOW_NAMESPACE |
2955 awk '/^multiple_linked_repaired/ { print $2 }')
2956 [ $repaired -eq 1 ] ||
2957 error "(10) Fail to drop the former created object: $repaired"
# foo must now read back the content written to f0.
2959 local data=$(cat $DIR/$tdir/d0/foo)
2960 [ "$data" == "dummy" ] ||
2961 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
2963 run_test 23b "LFSCK can repair dangling name entry (2)"
# test_23c: dangling name entry whose re-created object is modified before
# LFSCK finishes.
#
# Scenario (also printed at run time): "foo" is created as a hard link of f0
# while fail_loc 0x1621 is set, and is then expected to be unreadable.
# fail_loc 0x1602 with fail_val=10 is set before namespace LFSCK is started
# with "-r -C".  The test waits until "foo" is visible with size 0, appends
# "data" to it, clears the fail_loc and waits for LFSCK to complete.  It then
# requires dangling_repaired == 1 and that "foo" does NOT read back "dummy".
#
# Relies on helpers and variables defined outside this block
# (check_mount_and_prep, do_facet, wait_update_facet, cancel_lru_locks, error,
# $LFS, $LCTL, $SINGLEMDS, $MDT_DEV, $START_NAMESPACE, $SHOW_NAMESPACE, $DIR,
# $tdir).
# NOTE(review): the function name follows the test_<id> convention that the
# run_test call below is expected to dispatch on -- confirm against the
# test framework.
test_23c() {
	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. And then others"
	echo "modified the re-created object_C. When the LFSCK comes to the"
	echo "second-stage scanning, it will find that the former re-creating"
	echo "object_C maybe wrong and try to replace the object_C with the"
	echo "real object_A. But because object_C has been modified, so the"
	echo "LFSCK cannot replace it."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
	echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
	echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
	ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"

	echo "'ls' should fail because of dangling name entry"
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(6) ls should fail."

	#define OBD_FAIL_LFSCK_DELAY3		0x1602
	do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -r -C ||
		error "(7) Fail to start LFSCK for namespace"

	# Wait until the name entry resolves to a zero-length object.
	wait_update_facet client "stat $DIR/$tdir/d0/foo |
		awk '/Size/ { print \\\$2 }'" "0" 32 || {
		# Dump the file that was being polled; this test never creates
		# a "guard" entry, so stat-ing that path gave no information.
		stat $DIR/$tdir/d0/foo
		error "(8) unexpected size"
	}

	# Modify the object while LFSCK is still held back by the fail_loc.
	echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
	cancel_lru_locks osc

	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(10) unexpected status"
	}

	# Quoted so that an empty counter fails the numeric test cleanly.
	local repaired=$($SHOW_NAMESPACE |
			 awk '/^dangling_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(11) Fail to repair dangling name entry: $repaired"

	# The modified object must not have been replaced by f0's object.
	local data=$(cat $DIR/$tdir/d0/foo)
	[ "$data" != "dummy" ] ||
		error "(12) The $DIR/$tdir/d0/foo should not be recovered"
}
run_test 23c "LFSCK can repair dangling name entry (3)"
# test_24: two MDT-objects claim the same name entry through their linkEA.
#
# Skipped (via skip + return) when $MDSCOUNT is below 2.
#
# Scenario (also printed at run time): with fail_loc 0x1622 set on $SINGLEMDS,
# d0/dummy/foo is created and its name entry removed again; a stat of that
# path must then fail.  Namespace LFSCK is started with "-A -r" and must
# report multiple_referenced_repaired == 1.  Finally an entry named
# "<child fid>-<parent fid>-D-0" must exist under
# $MOUNT/.lustre/lost+found/MDT0000/, where the child FID is that of
# d0/dummy/foo and the parent FID that of d0/dummy, both captured earlier
# with "$LFS path2fid".
#
# Relies on helpers and variables defined outside this block (skip,
# check_mount_and_prep, do_facet, wait_update_facet, error, $LFS, $LCTL,
# $SINGLEMDS, $MDT_DEV, $MDSCOUNT, $MOUNT, $START_NAMESPACE, $SHOW_NAMESPACE,
# $DIR, $tdir).
# NOTE(review): the function name follows the test_<id> convention that the
# run_test call below is expected to dispatch on -- confirm against the
# test framework.
test_24() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "We need at least 2 MDSes for this test" && return

	echo "Two MDT-objects back reference the same name entry via their"
	echo "each own linkEA entry, but the name entry only references one"
	echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
	echo "for the MDT-object that is not recognized. If such MDT-object"
	echo "has no other linkEA entry after the removing, then the LFSCK"
	echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"

	# The bare path2fid calls only print the FIDs into the test log.
	mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
	$LFS path2fid $DIR/$tdir/d0/guard

	mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
	$LFS path2fid $DIR/$tdir/d0/dummy
	local pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)

	touch $DIR/$tdir/d0/guard/foo ||
		error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
	echo "that references $DIR/$tdir/d0/guard/foo."
	echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
	echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
	echo "there with the same linkEA entry as another MDT-object"
	echo "$DIR/$tdir/d0/guard/foo has"

	#define OBD_FAIL_LFSCK_MUL_REF		0x1622
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
	$LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
		error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
	# Remember the child's FID before its name entry goes away.
	local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
	rmdir $DIR/$tdir/d0/dummy/foo ||
		error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "stat $DIR/$tdir/d0/dummy/foo should fail"
	stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
		error "(6) stat successfully unexpectedly"

	echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
	$START_NAMESPACE -A -r ||
		error "(7) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(8) unexpected status"
	}

	# Quoted so that an empty counter fails the numeric test cleanly.
	local repaired=$($SHOW_NAMESPACE |
			 awk '/^multiple_referenced_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
	error "(9) Fail to repair multiple referenced name entry: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	# Quoted: the name is assembled from path2fid output and must be
	# looked up literally, never expanded as a glob pattern.
	local cname="$cfid-$pfid-D-0"
	ls -ail "$MOUNT/.lustre/lost+found/MDT0000/$cname" ||
		error "(11) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 24 "LFSCK can repair multiple-referenced name entry"
# test_25: name entry carrying a file type that disagrees with its object.
#
# Skipped (via skip + return) unless facet_fstype reports "ldiskfs" for
# $SINGLEMDS.
#
# Scenario (also printed at run time): d0/foo is created while fail_loc
# 0x1623 is set on $SINGLEMDS.  Namespace LFSCK is started with "-r" and must
# report bad_file_type_repaired == 1; listing d0 must then succeed.
#
# Relies on helpers and variables defined outside this block (facet_fstype,
# skip, check_mount_and_prep, do_facet, wait_update_facet, error, $LFS, $LCTL,
# $SINGLEMDS, $MDT_DEV, $START_NAMESPACE, $SHOW_NAMESPACE, $DIR, $tdir).
# NOTE(review): the function name follows the test_<id> convention that the
# run_test call below is expected to dispatch on -- confirm against the
# test framework.
test_25() {
	# Quoted: if facet_fstype prints nothing, the unquoted form collapses
	# to "[ != ldiskfs ]", which is an error, and the test would run on a
	# backend that was never confirmed to be ldiskfs.
	[ "$(facet_fstype $SINGLEMDS)" != ldiskfs ] &&
		skip "Only support to inject failure on ldiskfs" && return

	echo "The file type in the name entry does not match the file type"
	echo "claimed by the referenced object. Then the LFSCK will update"
	echo "the file type in the name entry."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "the file type stored in the name entry is wrong."

	#define OBD_FAIL_LFSCK_BAD_TYPE		0x1623
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
	touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to repair bad file type in the name entry"
	$START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(4) unexpected status"
	}

	# Quoted so that an empty counter fails the numeric test cleanly.
	local repaired=$($SHOW_NAMESPACE |
			 awk '/^bad_file_type_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
	error "(5) Fail to repair bad file type in name entry: $repaired"

	ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
}
run_test 25 "LFSCK can repair bad file type in the name entry"
# test_26a: a local name entry is lost while its object and linkEA survive.
#
# Scenario (also printed at run time): d0/foo is created and hard-linked as
# d0/dummy, then foo is unlinked while fail_loc 0x1624 is set on $SINGLEMDS.
# Looking up foo must fail at that point.  Namespace LFSCK is started with
# "-r -A" and must report lost_dirent_repaired == 1; afterwards foo must be
# listable again and "$LFS path2fid" must return the FID recorded before the
# entry was removed.
#
# Relies on helpers and variables defined outside this block
# (check_mount_and_prep, do_facet, wait_update_facet, error, $LFS, $LCTL,
# $SINGLEMDS, $MDT_DEV, $START_NAMESPACE, $SHOW_NAMESPACE, $DIR, $tdir).
# NOTE(review): the function name follows the test_<id> convention that the
# run_test call below is expected to dispatch on -- confirm against the
# test framework.
test_26a() {
	echo "The local name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing local name entry back"
	echo "to the normal namespace."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)

	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# The message must be handed to error; without it the shell would try
	# to execute the message text as a command and the check never fired.
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(5) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing local name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(7) unexpected status"
	}

	# Quoted so that an empty counter fails the numeric test cleanly.
	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $DIR/$tdir/d0/foo ||
		error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"

	# The restored entry must point at the very same object as before.
	local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
	[ "$foofid" == "$foofid2" ] ||
		error "(10) foo's FID changed: $foofid, $foofid2"
}
run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# test_26b: a remote name entry is lost while its object and linkEA survive.
#
# Skipped (via skip + return) when $MDSCOUNT is below 2.
#
# Scenario (also printed at run time): d0 is created with "$LFS mkdir -i 1"
# and d0/foo with "$LFS mkdir -i 0"; foo is then removed with rmdir while
# fail_loc 0x1624 is set on $SINGLEMDS.  Looking up foo must fail at that
# point.  Namespace LFSCK is started with "-r -A" and must report
# lost_dirent_repaired == 1; afterwards foo must be listable again and
# "$LFS path2fid" must return the FID recorded before the entry was removed.
#
# Relies on helpers and variables defined outside this block (skip,
# check_mount_and_prep, do_facet, wait_update_facet, error, $LFS, $LCTL,
# $SINGLEMDS, $MDT_DEV, $MDSCOUNT, $START_NAMESPACE, $SHOW_NAMESPACE, $DIR,
# $tdir).
# NOTE(review): the function name follows the test_<id> convention that the
# run_test call below is expected to dispatch on -- confirm against the
# test framework.
test_26b() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "We need at least 2 MDSes for this test" && return

	echo "The remote name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing remote name entry back"
	echo "to the normal namespace."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
	local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# The message must be handed to error; without it the shell would try
	# to execute the message text as a command and the check never fired.
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(4) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(6) unexpected status"
	}

	# Quoted so that an empty counter fails the numeric test cleanly.
	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair lost dirent: $repaired"

	ls -ail $DIR/$tdir/d0/foo ||
		error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"

	# The restored entry must point at the very same object as before.
	local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
	[ "$foofid" == "$foofid2" ] ||
		error "(9) foo's FID changed: $foofid, $foofid2"
}
run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Drop the "lfsck" flag from the debug mask again; this is best effort, so a
# failing set_param is deliberately ignored.
$LCTL set_param debug=-lfsck > /dev/null || :

# Put back the MDS/OST sizing that was saved in the SAVED_* variables.
OSTCOUNT=$SAVED_OSTCOUNT
OSTSIZE=$SAVED_OSTSIZE
MDSSIZE=$SAVED_MDSSIZE

# Finally, clean up the whole system.