3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
# Suite setup: exclusion lists, framework loading, version gating and the
# shared lctl command strings used by the tests.
# ALWAYS_EXCEPT starts from SANITY_LFSCK_EXCEPT taken from the environment.
10 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
# EXCEPT_SLOW is cleared when SLOW is "no".
11 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
12 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# LUSTRE defaults to the parent of this script's directory; the test
# framework and the configuration file are then sourced (CONFIG defaults to
# $LUSTRE/tests/cfg/$NAME.sh).
14 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
15 . $LUSTRE/tests/test-framework.sh
17 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Leave with status 0 when require_dsh_mds fails.
20 require_dsh_mds || exit 0
# Keep the incoming values.
# NOTE(review): presumably restored later in the script, outside this view.
24 SAVED_MDSSIZE=${MDSSIZE}
25 SAVED_OSTSIZE=${OSTSIZE}
26 SAVED_OSTCOUNT=${OSTCOUNT}
27 # use small MDS + OST size to speed formatting time
28 # do not use too small an MDSSIZE/OSTSIZE, which affects the default journal size
31 # no need for too many OSTs; this reduces the format/start/stop overhead
32 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
34 # build up a clean test environment.
# Server version gates: skip is called for an MDS older than 2.3.60, and
# tests are appended to ALWAYS_EXCEPT for servers older than the versions
# compared below.
38 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
39 skip "Need MDS version at least 2.3.60" && check_and_cleanup_lustre &&
42 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
43 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
45 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
46 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
48 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
49 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
51 # DNE does not support striped directory on zfs-based backend yet.
52 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
53 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Add "lfsck" to the debug mask; a failure here is deliberately ignored.
57 $LCTL set_param debug=+lfsck > /dev/null || true
# Device names and the command strings shared by the tests. The strings are
# expanded unquoted at the call sites so that extra options (for example -r)
# can be appended.
59 MDT_DEV="${FSNAME}-MDT0000"
60 OST_DEV="${FSNAME}-OST0000"
61 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
62 START_NAMESPACE="do_facet $SINGLEMDS \
63 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
64 START_LAYOUT="do_facet $SINGLEMDS \
65 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
66 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
67 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
68 SHOW_NAMESPACE="do_facet $SINGLEMDS \
69 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
70 SHOW_LAYOUT="do_facet $SINGLEMDS \
71 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
72 SHOW_LAYOUT_ON_OST="do_facet ost1 \
73 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount options used when (re)starting the MDT; the NOSCRUB variant adds
# "noscrub".
74 MOUNT_OPTS_SCRUB="-o user_xattr"
75 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
# Populate $DIR/$tdir with test data. Reads nfiles, ndirs and igif, which
# are set outside this view. When igif is non-empty, fail_loc 0x1504
# (OBD_FAIL_FID_IGIF) is set on the MDS before the files are created and
# fail_loc is set back to 0 afterwards.
84 echo "preparing... $nfiles * $ndirs files will be created $(date)."
85 if [ ! -z $igif ]; then
86 #define OBD_FAIL_FID_IGIF 0x1504
87 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
# Copy the test scripts in, then create $ndirs "d" and "f" entries and
# $nfiles "f" entries under each d<i>.
90 cp $LUSTRE/tests/*.sh $DIR/$tdir/
91 if [ $ndirs -gt 0 ]; then
92 createmany -d $DIR/$tdir/d $ndirs
93 createmany -m $DIR/$tdir/f $ndirs
94 if [ $nfiles -gt 0 ]; then
95 for ((i = 0; i < $ndirs; i++)); do
96 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
97 /dev/null || error "createmany $nfiles"
# A further $ndirs "e" directories.
100 createmany -d $DIR/$tdir/e $ndirs
103 if [ ! -z $igif ]; then
104 touch $DIR/$tdir/dummy
105 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
108 echo "prepared $(date)."
# test 0: drive the namespace LFSCK by hand (start, stop, restart, reset)
# and check the "status" field it reports after each step.
114 #define OBD_FAIL_LFSCK_DELAY1 0x1600
115 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
116 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
118 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
# The second field of the "status" line is taken as the LFSCK state.
120 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
121 [ "$STATUS" == "scanning-phase1" ] ||
122 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
124 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
126 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
127 [ "$STATUS" == "stopped" ] ||
128 error "(6) Expect 'stopped', but got '$STATUS'"
# Start again without -r and expect the scan to be running.
130 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
132 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
133 [ "$STATUS" == "scanning-phase1" ] ||
134 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fault injection and wait for the status to become "completed".
136 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
137 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
138 mdd.${MDT_DEV}.lfsck_namespace |
139 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
141 error "(9) unexpected status"
# This run is expected to repair nothing: updated_phase1 must be 0.
144 local repaired=$($SHOW_NAMESPACE |
145 awk '/^updated_phase1/ { print $2 }')
146 [ $repaired -eq 0 ] ||
147 error "(10) Expect nothing to be repaired, but got: $repaired"
# A run restarted with -r must raise success_count by exactly one.
149 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
150 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
151 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
152 mdd.${MDT_DEV}.lfsck_namespace |
153 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
155 error "(12) unexpected status"
158 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
159 [ $((scanned1 + 1)) -eq $scanned2 ] ||
160 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
162 echo "stopall, should NOT crash LU-3649"
163 stopall || error "(14) Fail to stopall"
165 run_test 0 "Control LFSCK manually"
# test 1a: create a file while fail_loc 0x1501 (OBD_FAIL_FID_INDIR) is set,
# run the namespace LFSCK and expect exactly one repair to be reported, then
# check that listing the directory works with fail_loc 0x1505
# (OBD_FAIL_FID_LOOKUP) set. Skipped unless the MDS backend is ldiskfs.
168 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
169 skip "OI Scrub not implemented for ZFS" && return
173 #define OBD_FAIL_FID_INDIR 0x1501
174 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
175 touch $DIR/$tdir/dummy
177 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
179 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
180 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
181 mdd.${MDT_DEV}.lfsck_namespace |
182 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
184 error "(4) unexpected status"
# Read the repair counter from dirent_repaired; if that field is absent the
# value falls back to updated_phase1.
187 local repaired=$($SHOW_NAMESPACE |
188 awk '/^dirent_repaired/ { print $2 }')
189 # for interop with old server
190 [ -z "$repaired" ] &&
191 repaired=$($SHOW_NAMESPACE |
192 awk '/^updated_phase1/ { print $2 }')
194 [ $repaired -eq 1 ] ||
195 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
197 mount_client $MOUNT || error "(6) Fail to start client!"
199 #define OBD_FAIL_FID_LOOKUP 0x1505
200 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
201 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
203 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
205 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# test 1b: create a file while fail_loc 0x1502 (OBD_FAIL_FID_INLMA) is set,
# run the namespace LFSCK with fail_loc 0x1506 (OBD_FAIL_FID_NOLMA) set and
# expect exactly one repair, then stat the file with fail_loc 0x1505
# (OBD_FAIL_FID_LOOKUP) set. Skipped unless the MDS backend is ldiskfs.
209 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
210 skip "OI Scrub not implemented for ZFS" && return
214 #define OBD_FAIL_FID_INLMA 0x1502
215 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
216 touch $DIR/$tdir/dummy
218 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
220 #define OBD_FAIL_FID_NOLMA 0x1506
221 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
222 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
223 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
224 mdd.${MDT_DEV}.lfsck_namespace |
225 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
227 error "(4) unexpected status"
# Read the repair counter from dirent_repaired; if that field is absent the
# value falls back to updated_phase1.
230 local repaired=$($SHOW_NAMESPACE |
231 awk '/^dirent_repaired/ { print $2 }')
232 # for interop with old server
233 [ -z "$repaired" ] &&
234 repaired=$($SHOW_NAMESPACE |
235 awk '/^updated_phase1/ { print $2 }')
237 [ $repaired -eq 1 ] ||
238 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
240 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
241 mount_client $MOUNT || error "(6) Fail to start client!"
243 #define OBD_FAIL_FID_LOOKUP 0x1505
244 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
245 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
247 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
249 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# test 2a: create a file while fail_loc 0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH)
# is set, run the namespace LFSCK and expect exactly one linkEA repair, then
# check the link count and the path2fid/fid2path round trip from a client.
254 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
255 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
256 touch $DIR/$tdir/dummy
258 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
260 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
261 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
262 mdd.${MDT_DEV}.lfsck_namespace |
263 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
265 error "(4) unexpected status"
# Read the repair counter from linkea_repaired; if that field is absent the
# value falls back to updated_phase2.
268 local repaired=$($SHOW_NAMESPACE |
269 awk '/^linkea_repaired/ { print $2 }')
270 # for interop with old server
271 [ -z "$repaired" ] &&
272 repaired=$($SHOW_NAMESPACE |
273 awk '/^updated_phase2/ { print $2 }')
275 [ $repaired -eq 1 ] ||
276 error "(5) Fail to repair crashed linkEA: $repaired"
278 mount_client $MOUNT || error "(6) Fail to start client!"
280 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
281 error "(7) Fail to stat $DIR/$tdir/dummy"
# The FID obtained from the path must map back to the same path.
283 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
284 local dummyname=$($LFS fid2path $DIR $dummyfid)
285 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
286 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
288 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# test 2b: create a file while fail_loc 0x1604 (OBD_FAIL_LFSCK_LINKEA_MORE)
# is set, run the namespace LFSCK and expect updated_phase2 to be 1, then
# check the link count and the path2fid/fid2path round trip from a client.
294 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
295 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
296 touch $DIR/$tdir/dummy
298 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
300 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
301 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
302 mdd.${MDT_DEV}.lfsck_namespace |
303 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
305 error "(4) unexpected status"
308 local repaired=$($SHOW_NAMESPACE |
309 awk '/^updated_phase2/ { print $2 }')
310 [ $repaired -eq 1 ] ||
311 error "(5) Fail to repair crashed linkEA: $repaired"
313 mount_client $MOUNT || error "(6) Fail to start client!"
315 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
316 error "(7) Fail to stat $DIR/$tdir/dummy"
# The FID obtained from the path must map back to the same path.
318 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
319 local dummyname=$($LFS fid2path $DIR $dummyfid)
320 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
321 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
323 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# test 2c: create a file while fail_loc 0x1605 (OBD_FAIL_LFSCK_LINKEA_MORE2)
# is set, run the namespace LFSCK and expect updated_phase2 to be 1, then
# check the link count and the path2fid/fid2path round trip from a client.
329 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
330 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
331 touch $DIR/$tdir/dummy
333 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
335 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
336 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
337 mdd.${MDT_DEV}.lfsck_namespace |
338 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
340 error "(4) unexpected status"
343 local repaired=$($SHOW_NAMESPACE |
344 awk '/^updated_phase2/ { print $2 }')
345 [ $repaired -eq 1 ] ||
346 error "(5) Fail to repair crashed linkEA: $repaired"
348 mount_client $MOUNT || error "(6) Fail to start client!"
350 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
351 error "(7) Fail to stat $DIR/$tdir/dummy"
# The FID obtained from the path must map back to the same path.
353 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
354 local dummyname=$($LFS fid2path $DIR $dummyfid)
355 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
356 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
358 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# test 2d: create a file while fail_loc 0x161d (OBD_FAIL_LFSCK_NO_LINKEA)
# is set, run the namespace LFSCK and expect linkea_repaired to be 1, then
# check the link count and the path2fid/fid2path round trip from a client.
364 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
365 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
366 touch $DIR/$tdir/dummy
368 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
370 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
371 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
372 mdd.${MDT_DEV}.lfsck_namespace |
373 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
375 error "(4) unexpected status"
378 local repaired=$($SHOW_NAMESPACE |
379 awk '/^linkea_repaired/ { print $2 }')
380 [ $repaired -eq 1 ] ||
381 error "(5) Fail to repair crashed linkEA: $repaired"
383 mount_client $MOUNT || error "(6) Fail to start client!"
385 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
386 error "(7) Fail to stat $DIR/$tdir/dummy"
# The FID obtained from the path must map back to the same path.
388 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
389 local dummyname=$($LFS fid2path $DIR $dummyfid)
390 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
391 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
393 run_test 2d "LFSCK can recover the missing linkEA entry"
# test 2e: with at least two MDSes, create d0 with "-i 1" and, while
# fail_loc 0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH) is set, d0/d1 with "-i 0".
# The namespace LFSCK is then started with "-r -A" and is expected to report
# one linkEA repair, after which fid2path must resolve d1's FID to its path.
397 [ $MDSCOUNT -lt 2 ] &&
398 skip "We need at least 2 MDSes for this test" && return
402 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
404 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
405 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
406 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
407 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
409 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
410 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
411 mdd.${MDT_DEV}.lfsck_namespace |
412 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
414 error "(4) unexpected status"
417 local repaired=$($SHOW_NAMESPACE |
418 awk '/^linkea_repaired/ { print $2 }')
419 [ $repaired -eq 1 ] ||
420 error "(5) Fail to repair crashed linkEA: $repaired"
# The FID obtained from the path must map back to the same path.
422 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
423 local name=$($LFS fid2path $DIR $fid)
424 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
425 error "(6) Fail to repair linkEA: $fid $name"
427 run_test 2e "namespace LFSCK can verify remote object linkEA"
# test 3: build hard-linked files, two of them created while fail_loc
# 0x1603 and 0x1604 are set, then run the namespace LFSCK and require
# checked_phase2 >= 4 and multiple_linked_repaired >= 2.
433 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
434 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
435 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
437 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
438 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
439 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
# Each of the next two hard links is made under a different fail_loc.
441 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
442 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
443 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
445 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
446 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
447 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
449 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
451 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
452 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
453 mdd.${MDT_DEV}.lfsck_namespace |
454 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
456 error "(10) unexpected status"
459 local checked=$($SHOW_NAMESPACE |
460 awk '/^checked_phase2/ { print $2 }')
461 [ $checked -ge 4 ] ||
462 error "(11) Fail to check multiple-linked object: $checked"
464 local repaired=$($SHOW_NAMESPACE |
465 awk '/^multiple_linked_repaired/ { print $2 }')
466 [ $repaired -ge 2 ] ||
467 error "(12) Fail to repair multiple-linked object: $repaired"
469 run_test 3 "LFSCK can verify multiple-linked objects"
# test 4: stop the client and the MDS, run mds_backup_restore, restart the
# MDS with the noscrub mount options and run the namespace LFSCK. The run
# must show the "inconsistent" flag while scanning, finish with empty flags
# and report at least 9 repairs; listing the directory must then work with
# fail_loc 0x1505 set. Skipped unless the MDS backend is ldiskfs.
473 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
474 skip "OI Scrub not implemented for ZFS" && return
477 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
478 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
480 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
481 echo "start $SINGLEMDS with disabling OI scrub"
482 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
483 error "(2) Fail to start MDS!"
485 #define OBD_FAIL_LFSCK_DELAY2 0x1601
486 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
487 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
488 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
489 mdd.${MDT_DEV}.lfsck_namespace |
490 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
492 error "(5) unexpected status"
495 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
496 [ "$STATUS" == "scanning-phase1" ] ||
497 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fault injection and wait for the status to become "completed".
499 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
500 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
501 mdd.${MDT_DEV}.lfsck_namespace |
502 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
504 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without "local" here.
507 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
508 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Read the repair counter from dirent_repaired; if that field is absent the
# value falls back to updated_phase1.
510 local repaired=$($SHOW_NAMESPACE |
511 awk '/^dirent_repaired/ { print $2 }')
512 # for interop with old server
513 [ -z "$repaired" ] &&
514 repaired=$($SHOW_NAMESPACE |
515 awk '/^updated_phase1/ { print $2 }')
517 [ $repaired -ge 9 ] ||
518 error "(9) Fail to re-generate FID-in-dirent: $repaired"
520 mount_client $MOUNT || error "(10) Fail to start client!"
522 #define OBD_FAIL_FID_LOOKUP 0x1505
523 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
524 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
525 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
527 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# test 5: same sequence as test 4 but mds_backup_restore is called with an
# extra argument "1" and the flags expected while scanning are
# "inconsistent,upgrade". The run must report at least 2 repairs; stat and
# ls must then work with fail_loc 0x1505 set, and fid2path must resolve the
# dummy file's FID to its path. Skipped unless the MDS backend is ldiskfs.
531 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
532 skip "OI Scrub not implemented for ZFS" && return
535 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
536 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
538 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
539 echo "start $SINGLEMDS with disabling OI scrub"
540 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
541 error "(2) Fail to start MDS!"
543 #define OBD_FAIL_LFSCK_DELAY2 0x1601
544 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
545 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
546 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
547 mdd.${MDT_DEV}.lfsck_namespace |
548 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
550 error "(5) unexpected status"
553 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
554 [ "$STATUS" == "scanning-phase1" ] ||
555 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fault injection and wait for the status to become "completed".
557 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
558 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
559 mdd.${MDT_DEV}.lfsck_namespace |
560 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
562 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without "local" here.
565 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
566 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Read the repair counter from dirent_repaired; if that field is absent the
# value falls back to updated_phase1.
568 local repaired=$($SHOW_NAMESPACE |
569 awk '/^dirent_repaired/ { print $2 }')
570 # for interop with old server
571 [ -z "$repaired" ] &&
572 repaired=$($SHOW_NAMESPACE |
573 awk '/^updated_phase1/ { print $2 }')
575 [ $repaired -ge 2 ] ||
576 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
578 mount_client $MOUNT || error "(10) Fail to start client!"
580 #define OBD_FAIL_FID_LOOKUP 0x1505
581 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
582 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
584 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
586 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The FID obtained from the path must map back to the same path.
587 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
588 local dummyname=$($LFS fid2path $DIR $dummyfid)
589 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
590 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
592 run_test 5 "LFSCK can handle IGIF object upgrading"
# test 6a: start the namespace LFSCK under fail_loc 0x1600, force it into
# the "failed" state with fail_loc 0x80001608, record the
# last_checkpoint_position, start it again without -r and require the
# latest_start_position to be greater than the recorded checkpoint.
597 #define OBD_FAIL_LFSCK_DELAY1 0x1600
598 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
599 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
601 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
602 [ "$STATUS" == "scanning-phase1" ] ||
603 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
605 # Sleep 3 sec to guarantee at least one object processed by LFSCK
607 # Fail the LFSCK to guarantee there is at least one checkpoint
608 #define OBD_FAIL_LFSCK_FATAL1 0x1608
609 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
610 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
611 mdd.${MDT_DEV}.lfsck_namespace |
612 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
614 error "(4) unexpected status"
617 local POS0=$($SHOW_NAMESPACE |
618 awk '/^last_checkpoint_position/ { print $2 }' |
621 #define OBD_FAIL_LFSCK_DELAY1 0x1600
622 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
623 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
625 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
626 [ "$STATUS" == "scanning-phase1" ] ||
627 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
629 local POS1=$($SHOW_NAMESPACE |
630 awk '/^latest_start_position/ { print $2 }' |
632 [[ $POS0 -lt $POS1 ]] ||
633 error "(7) Expect larger than: $POS0, but got $POS1"
# Clear the fault injection and wait for the status to become "completed".
635 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
636 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
637 mdd.${MDT_DEV}.lfsck_namespace |
638 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
640 error "(8) unexpected status"
643 run_test 6a "LFSCK resumes from last checkpoint (1)"
# test 6b: like test 6a but under fail_loc 0x1601 and failed with
# 0x80001609. Two values are taken from the position lines: field 2 (O_POS*)
# and field 4 (D_POS*). If either D_POS value is "N/A" the O_POS values are
# compared, otherwise the D_POS values are; the later one must be greater.
648 #define OBD_FAIL_LFSCK_DELAY2 0x1601
649 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
650 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
652 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
653 [ "$STATUS" == "scanning-phase1" ] ||
654 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
656 # Sleep 5 sec to guarantee that we are in the directory scanning
658 # Fail the LFSCK to guarantee there is at least one checkpoint
659 #define OBD_FAIL_LFSCK_FATAL2 0x1609
660 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
661 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
662 mdd.${MDT_DEV}.lfsck_namespace |
663 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
665 error "(4) unexpected status"
668 local O_POS0=$($SHOW_NAMESPACE |
669 awk '/^last_checkpoint_position/ { print $2 }' |
672 local D_POS0=$($SHOW_NAMESPACE |
673 awk '/^last_checkpoint_position/ { print $4 }')
675 #define OBD_FAIL_LFSCK_DELAY2 0x1601
676 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
677 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
679 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
680 [ "$STATUS" == "scanning-phase1" ] ||
681 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
683 local O_POS1=$($SHOW_NAMESPACE |
684 awk '/^latest_start_position/ { print $2 }' |
686 local D_POS1=$($SHOW_NAMESPACE |
687 awk '/^latest_start_position/ { print $4 }')
689 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
690 [[ $O_POS0 -lt $O_POS1 ]] ||
691 error "(7.1) $O_POS1 is not larger than $O_POS0"
693 [[ $D_POS0 -lt $D_POS1 ]] ||
694 error "(7.2) $D_POS1 is not larger than $D_POS0"
# Clear the fault injection and wait for the status to become "completed".
697 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
698 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
699 mdd.${MDT_DEV}.lfsck_namespace |
700 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
702 error "(8) unexpected status"
705 run_test 6b "LFSCK resumes from last checkpoint (2)"
# test 7a: start the namespace LFSCK under fail_loc 0x1601, stop and
# restart the MDS while the scan is in scanning-phase1, then clear the
# fault injection and expect the status to reach "completed" without
# another lfsck_start being issued.
712 #define OBD_FAIL_LFSCK_DELAY2 0x1601
713 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
714 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
716 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
717 [ "$STATUS" == "scanning-phase1" ] ||
718 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
720 # Sleep 3 sec to guarantee at least one object processed by LFSCK
722 echo "stop $SINGLEMDS"
723 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
725 echo "start $SINGLEMDS"
726 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
727 error "(5) Fail to start MDS!"
729 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
730 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
731 mdd.${MDT_DEV}.lfsck_namespace |
732 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
734 error "(6) unexpected status"
737 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# test 7b: create 20 files under fail_loc 0x1604, start the namespace LFSCK
# under fail_loc 0x1602 and wait for scanning-phase2, stop and restart the
# MDS, then clear the fault injection and expect the status to reach
# "completed" without another lfsck_start being issued.
743 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
744 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
745 for ((i = 0; i < 20; i++)); do
746 touch $DIR/$tdir/dummy${i}
749 #define OBD_FAIL_LFSCK_DELAY3 0x1602
750 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
751 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
752 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
753 mdd.${MDT_DEV}.lfsck_namespace |
754 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
756 error "(4) unexpected status"
759 echo "stop $SINGLEMDS"
760 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
762 echo "start $SINGLEMDS"
763 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
764 error "(6) Fail to start MDS!"
766 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
767 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
768 mdd.${MDT_DEV}.lfsck_namespace |
769 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
771 error "(7) unexpected status"
774 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# test 8: walk the namespace LFSCK through the states checked below (init,
# scanning-phase1, stopped, failed, crashed, paused, scanning-phase2,
# completed), verifying the reported status and flags at each step.
779 formatall > /dev/null
785 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
786 [ "$STATUS" == "init" ] ||
787 error "(2) Expect 'init', but got '$STATUS'"
# Create one directory and five files under two different fail_loc values.
789 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
790 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
791 mkdir $DIR/$tdir/crashed
793 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
794 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
795 for ((i = 0; i < 5; i++)); do
796 touch $DIR/$tdir/dummy${i}
799 umount_client $MOUNT || error "(3) Fail to stop client!"
# start -> scanning-phase1, stop -> stopped, start -> scanning-phase1.
801 #define OBD_FAIL_LFSCK_DELAY2 0x1601
802 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
803 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
805 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
806 [ "$STATUS" == "scanning-phase1" ] ||
807 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
809 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
811 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
812 [ "$STATUS" == "stopped" ] ||
813 error "(7) Expect 'stopped', but got '$STATUS'"
815 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
817 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
818 [ "$STATUS" == "scanning-phase1" ] ||
819 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
# fail_loc 0x80001609 is expected to drive the status to "failed".
821 #define OBD_FAIL_LFSCK_FATAL2 0x1609
822 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
823 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
824 mdd.${MDT_DEV}.lfsck_namespace |
825 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
827 error "(10) unexpected status"
830 #define OBD_FAIL_LFSCK_DELAY1 0x1600
831 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
832 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
834 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
835 [ "$STATUS" == "scanning-phase1" ] ||
836 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
# Restart the MDS with fail_loc 0x160a set before the stop and 0x160b set
# before the start; the status after recovery is expected to be "crashed".
838 #define OBD_FAIL_LFSCK_CRASH 0x160a
839 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
842 echo "stop $SINGLEMDS"
843 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
845 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
846 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
848 echo "start $SINGLEMDS"
849 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
850 error "(14) Fail to start MDS!"
# Poll recovery_status until it is no longer RECOVERING, bounded by
# max_recovery_time. NOTE(review): the timer initialisation and increment
# are not visible in this view.
852 local timeout=$(max_recovery_time)
855 while [ $timer -lt $timeout ]; do
856 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
857 mdt.${MDT_DEV}.recovery_status |
858 awk '/^status/ { print \\\$2 }'")
859 [ "$STATUS" != "RECOVERING" ] && break;
864 [ $timer != $timeout ] ||
865 error "(14.1) recovery timeout"
867 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
868 [ "$STATUS" == "crashed" ] ||
869 error "(15) Expect 'crashed', but got '$STATUS'"
# Start again, then restart the MDS with only fail_loc 0x160b set before the
# start; the status after recovery is expected to be "paused".
871 #define OBD_FAIL_LFSCK_DELAY2 0x1601
872 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
873 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
875 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
876 [ "$STATUS" == "scanning-phase1" ] ||
877 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
879 echo "stop $SINGLEMDS"
880 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
882 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
883 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
885 echo "start $SINGLEMDS"
886 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
887 error "(19) Fail to start MDS!"
890 while [ $timer -lt $timeout ]; do
891 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
892 mdt.${MDT_DEV}.recovery_status |
893 awk '/^status/ { print \\\$2 }'")
894 [ "$STATUS" != "RECOVERING" ] && break;
899 [ $timer != $timeout ] ||
900 error "(19.1) recovery timeout"
902 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
903 [ "$STATUS" == "paused" ] ||
904 error "(20) Expect 'paused', but got '$STATUS'"
# Final run: wait for scanning-phase2 with the expected flags, then clear
# the fault injection and require "completed" with empty flags.
906 #define OBD_FAIL_LFSCK_DELAY3 0x1602
907 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
909 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
910 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
911 mdd.${MDT_DEV}.lfsck_namespace |
912 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
914 error "(22) unexpected status"
917 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
918 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
919 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
921 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
922 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
923 mdd.${MDT_DEV}.lfsck_namespace |
924 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
926 error "(24) unexpected status"
929 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
930 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
932 run_test 8 "LFSCK state machine"
# test 9a: start the namespace LFSCK with "-s $BASE_SPEED1", then change
# lfsck_speed_limit to BASE_SPEED2, and check that average_speed_phase1
# stays inside bounds computed from those values. RUN_TIME1, RUN_TIME2 and
# TIME_DIFF are set outside this view.
# skip is called when /proc/cpuinfo has no "processor ... : 1" line.
935 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
936 skip "Testing on UP system, the speed may be inaccurate."
942 local BASE_SPEED1=100
944 $START_NAMESPACE -r -s $BASE_SPEED1 || error "(3) Fail to start LFSCK!"
947 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
948 [ "$STATUS" == "scanning-phase1" ] ||
949 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
951 local SPEED=$($SHOW_NAMESPACE |
952 awk '/^average_speed_phase1/ { print $2 }')
954 # There may be time error, normally it should be less than 2 seconds.
955 # We allow another 20% schedule error.
957 # MAX_MARGIN = 1.2 = 12 / 10
# Integer arithmetic: the margin is applied as "* 12 / 10".
958 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
959 RUN_TIME1 * 12 / 10))
960 [ $SPEED -lt $MAX_SPEED ] ||
961 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
# Second stage: change the speed limit while the scan is running.
964 local BASE_SPEED2=300
966 do_facet $SINGLEMDS \
967 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
970 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase1/ { print $2 }')
971 # MIN_MARGIN = 0.8 = 8 / 10
972 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
973 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
974 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# On a non-ldiskfs MDS a too-low speed is reported through error_ignore
# (LU-5624).
975 [ $SPEED -gt $MIN_SPEED ] || {
976 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
977 error_ignore LU-5624 \
978 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
981 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
985 # MAX_MARGIN = 1.2 = 12 / 10
986 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
987 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
988 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
989 [ $SPEED -lt $MAX_SPEED ] ||
990 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
# Set the speed limit to 0 and wait for the status to become "completed".
992 do_facet $SINGLEMDS \
993 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
995 wait_update_facet $SINGLEMDS \
996 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace|\
997 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
998 error "(7) Failed to get expected 'completed'"
1000 run_test 9a "LFSCK speed control (1)"
1003 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1004 skip "Testing on UP system, the speed may be inaccurate."
1010 echo "Preparing another 50 * 50 files (with error) at $(date)."
1011 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1012 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1013 createmany -d $DIR/$tdir/d 50
1014 createmany -m $DIR/$tdir/f 50
1015 for ((i = 0; i < 50; i++)); do
1016 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
1019 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1020 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1021 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1022 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1023 mdd.${MDT_DEV}.lfsck_namespace |
1024 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1026 error "(5) unexpected status"
1029 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1030 echo "Prepared at $(date)."
1032 local BASE_SPEED1=50
1034 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1037 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1038 [ "$STATUS" == "scanning-phase2" ] ||
1039 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1041 local SPEED=$($SHOW_NAMESPACE |
1042 awk '/^average_speed_phase2/ { print $2 }')
1043 # The measured time may be inaccurate, normally by less than 2 seconds.
1044 # We allow another 20% for scheduling error.
1046 # MAX_MARGIN = 1.2 = 12 / 10
1047 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1048 RUN_TIME1 * 12 / 10))
1049 [ $SPEED -lt $MAX_SPEED ] ||
1050 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1052 # adjust speed limit
1053 local BASE_SPEED2=150
1055 do_facet $SINGLEMDS \
1056 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1059 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1060 # MIN_MARGIN = 0.8 = 8 / 10
1061 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1062 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1063 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
1064 [ $SPEED -gt $MIN_SPEED ] || {
1065 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1066 error_ignore LU-5624 \
1067 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1070 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1074 # MAX_MARGIN = 1.2 = 12 / 10
1075 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1076 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1077 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1078 [ $SPEED -lt $MAX_SPEED ] ||
1079 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
1081 do_facet $SINGLEMDS \
1082 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1083 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1084 mdd.${MDT_DEV}.lfsck_namespace |
1085 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1087 error "(11) unexpected status"
1090 run_test 9b "LFSCK speed control (2)"
1094 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1095 skip "lookup(..)/linkea on ZFS issue" && return
1099 echo "Preparing more files with error at $(date)."
1100 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1101 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1103 for ((i = 0; i < 1000; i = $((i+2)))); do
1104 mkdir -p $DIR/$tdir/d${i}
1105 touch $DIR/$tdir/f${i}
1106 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1109 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1110 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1112 for ((i = 1; i < 1000; i = $((i+2)))); do
1113 mkdir -p $DIR/$tdir/d${i}
1114 touch $DIR/$tdir/f${i}
1115 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1118 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1119 echo "Prepared at $(date)."
1121 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
1123 umount_client $MOUNT
1124 mount_client $MOUNT || error "(3) Fail to start client!"
1126 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1129 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1130 [ "$STATUS" == "scanning-phase1" ] ||
1131 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
1133 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1135 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1137 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1139 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1141 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1143 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1145 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1147 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1148 error "(14) Fail to softlink!"
1150 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1151 [ "$STATUS" == "scanning-phase1" ] ||
1152 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
1154 do_facet $SINGLEMDS \
1155 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1156 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1157 mdd.${MDT_DEV}.lfsck_namespace |
1158 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1160 error "(16) unexpected status"
1163 run_test 10 "System is available during LFSCK scanning"
1166 ost_remove_lastid() {
1169 local rcmd="do_facet ost${ost}"
1171 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1173 # step 1: local mount
1174 mount_fstype ost${ost} || return 1
1175 # step 2: remove the specified LAST_ID
1176 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1178 unmount_fstype ost${ost} || return 2
1182 check_mount_and_prep
1183 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1184 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1189 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1191 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1192 error "(2) Fail to start ost1"
1194 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1195 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1197 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1198 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1200 wait_update_facet ost1 "$LCTL get_param -n \
1201 obdfilter.${OST_DEV}.lfsck_layout |
1202 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1204 error "(5) unexpected status"
1207 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1209 wait_update_facet ost1 "$LCTL get_param -n \
1210 obdfilter.${OST_DEV}.lfsck_layout |
1211 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1213 error "(6) unexpected status"
1216 echo "the LAST_ID(s) should have been rebuilt"
1217 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1218 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1220 run_test 11a "LFSCK can rebuild lost last_id"
1223 check_mount_and_prep
1224 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1226 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1227 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1228 do_facet ost1 $LCTL set_param fail_loc=0x160d
1229 createmany -o $DIR/$tdir/f 64
1230 local lastid1=$(do_facet ost1 "lctl get_param -n \
1231 obdfilter.${ost1_svc}.last_id" | grep 0x100000000 |
1232 awk -F: '{ print $2 }')
1234 umount_client $MOUNT
1235 stop ost1 || error "(1) Fail to stop ost1"
1237 #define OBD_FAIL_OST_ENOSPC 0x215
1238 do_facet ost1 $LCTL set_param fail_loc=0x215
1240 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1241 error "(2) Fail to start ost1"
1243 for ((i = 0; i < 60; i++)); do
1244 lastid2=$(do_facet ost1 "lctl get_param -n \
1245 obdfilter.${ost1_svc}.last_id" | grep 0x100000000 |
1246 awk -F: '{ print $2 }')
1247 [ ! -z $lastid2 ] && break;
1251 echo "the on-disk LAST_ID should be smaller than the expected one"
1252 [ $lastid1 -gt $lastid2 ] ||
1253 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1255 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1256 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1258 wait_update_facet ost1 "$LCTL get_param -n \
1259 obdfilter.${OST_DEV}.lfsck_layout |
1260 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1262 error "(6) unexpected status"
1265 stop ost1 || error "(7) Fail to stop ost1"
1267 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1268 error "(8) Fail to start ost1"
1270 echo "the on-disk LAST_ID should have been rebuilt"
1271 wait_update_facet ost1 "$LCTL get_param -n \
1272 obdfilter.${ost1_svc}.last_id | grep 0x100000000 |
1273 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1274 $LCTL get_param -n obdfilter.${ost1_svc}.last_id
1275 error "(9) expect lastid1 0x100000000:$lastid1"
1278 do_facet ost1 $LCTL set_param fail_loc=0
1279 stopall || error "(10) Fail to stopall"
1281 run_test 11b "LFSCK can rebuild crashed last_id"
1284 [ $MDSCOUNT -lt 2 ] &&
1285 skip "We need at least 2 MDSes for test_12" && return
1287 check_mount_and_prep
1288 for k in $(seq $MDSCOUNT); do
1289 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1290 createmany -o $DIR/$tdir/${k}/f 100 ||
1291 error "(0) Fail to create 100 files."
1294 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1295 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1296 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1298 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1299 for k in $(seq $MDSCOUNT); do
1300 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1301 mdd.$(facet_svc mds${k}).lfsck_namespace |
1302 awk '/^status/ { print $2 }')
1303 [ "$STATUS" == "scanning-phase1" ] ||
1304 error "(3) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1307 echo "Stop namespace LFSCK on all targets by single lctl command."
1308 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1309 error "(4) Fail to stop LFSCK on all devices!"
1311 echo "All the LFSCK targets should be in 'stopped' status."
1312 for k in $(seq $MDSCOUNT); do
1313 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1314 mdd.$(facet_svc mds${k}).lfsck_namespace |
1315 awk '/^status/ { print $2 }')
1316 [ "$STATUS" == "stopped" ] ||
1317 error "(5) MDS${k} Expect 'stopped', but got '$STATUS'"
1320 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1321 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1322 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1324 echo "All the LFSCK targets should be in 'completed' status."
1325 for k in $(seq $MDSCOUNT); do
1326 wait_update_facet mds${k} "$LCTL get_param -n \
1327 mdd.$(facet_svc mds${k}).lfsck_namespace |
1328 awk '/^status/ { print \\\$2 }'" "completed" 8 ||
1329 error "(7) MDS${k} is not the expected 'completed'"
1332 echo "Start layout LFSCK on all targets by single command (-s 1)."
1333 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1334 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1336 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1337 for k in $(seq $MDSCOUNT); do
1338 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1339 mdd.$(facet_svc mds${k}).lfsck_layout |
1340 awk '/^status/ { print $2 }')
1341 [ "$STATUS" == "scanning-phase1" ] ||
1342 error "(9) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1345 echo "Stop layout LFSCK on all targets by single lctl command."
1346 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1347 error "(10) Fail to stop LFSCK on all devices!"
1349 echo "All the LFSCK targets should be in 'stopped' status."
1350 for k in $(seq $MDSCOUNT); do
1351 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1352 mdd.$(facet_svc mds${k}).lfsck_layout |
1353 awk '/^status/ { print $2 }')
1354 [ "$STATUS" == "stopped" ] ||
1355 error "(11) MDS${k} Expect 'stopped', but got '$STATUS'"
1358 for k in $(seq $OSTCOUNT); do
1359 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1360 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1361 awk '/^status/ { print $2 }')
1362 [ "$STATUS" == "stopped" ] ||
1363 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1366 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1367 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1368 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1370 echo "All the LFSCK targets should be in 'completed' status."
1371 for k in $(seq $MDSCOUNT); do
1372 # The LFSCK status query interval is 30 seconds. In case some
1373 # LFSCK_NOTIFY RPCs fail or are lost, we wait long enough
1374 # to guarantee that the status has been synchronized.
1375 wait_update_facet mds${k} "$LCTL get_param -n \
1376 mdd.$(facet_svc mds${k}).lfsck_layout |
1377 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1378 error "(14) MDS${k} is not the expected 'completed'"
1381 run_test 12 "single command to trigger LFSCK on all devices"
1385 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1386 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1387 echo "MDT-object FID."
1390 check_mount_and_prep
1392 echo "Inject failure stub to simulate bad lmm_oi"
1393 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1394 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1395 createmany -o $DIR/$tdir/f 32
1396 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1398 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1399 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1401 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1402 mdd.${MDT_DEV}.lfsck_layout |
1403 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1405 error "(2) unexpected status"
1408 local repaired=$($SHOW_LAYOUT |
1409 awk '/^repaired_others/ { print $2 }')
1410 [ $repaired -eq 32 ] ||
1411 error "(3) Fail to repair crashed lmm_oi: $repaired"
1413 run_test 13 "LFSCK can repair crashed lmm_oi"
1417 echo "The OST-object referenced by the MDT-object should be there;"
1418 echo "otherwise, the LFSCK should re-create the missing OST-object."
1421 check_mount_and_prep
1422 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1424 local count=$(precreated_ost_obj_count 0 0)
1426 echo "Inject failure stub to simulate dangling referenced MDT-object"
1427 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1428 do_facet ost1 $LCTL set_param fail_loc=0x1610
1429 createmany -o $DIR/$tdir/f $((count + 31))
1430 touch $DIR/$tdir/guard
1431 do_facet ost1 $LCTL set_param fail_loc=0
1433 start_full_debug_logging
1435 # exhaust other pre-created dangling cases
1436 count=$(precreated_ost_obj_count 0 0)
1437 createmany -o $DIR/$tdir/a $count ||
1438 error "(0) Fail to create $count files."
1440 echo "'ls' should fail because of dangling referenced MDT-object"
1441 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1443 echo "Trigger layout LFSCK to find out dangling reference"
1444 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1446 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1447 mdd.${MDT_DEV}.lfsck_layout |
1448 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1450 error "(3) unexpected status"
1453 local repaired=$($SHOW_LAYOUT |
1454 awk '/^repaired_dangling/ { print $2 }')
1455 [ $repaired -ge 32 ] ||
1456 error "(4) Fail to repair dangling reference: $repaired"
1458 echo "'stat' should fail because of not repair dangling by default"
1459 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
1461 echo "Trigger layout LFSCK to repair dangling reference"
1462 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1464 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1465 mdd.${MDT_DEV}.lfsck_layout |
1466 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1468 error "(7) unexpected status"
1471 # Some async LFSCK updates may still be in progress; wait for
1472 # a while until the target repair has been done. LU-4970.
1474 echo "'stat' should success after layout LFSCK repairing"
1475 wait_update_facet client "stat $DIR/$tdir/guard |
1476 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1477 stat $DIR/$tdir/guard
1479 error "(8) unexpected size"
1482 repaired=$($SHOW_LAYOUT |
1483 awk '/^repaired_dangling/ { print $2 }')
1484 [ $repaired -ge 32 ] ||
1485 error "(9) Fail to repair dangling reference: $repaired"
1487 stop_full_debug_logging
1489 run_test 14 "LFSCK can repair MDT-object with dangling reference"
1493 echo "If the OST-object referenced by the MDT-object back points"
1494 echo "to some non-exist MDT-object, then the LFSCK should repair"
1495 echo "the OST-object to back point to the right MDT-object."
1498 check_mount_and_prep
1499 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1501 echo "Inject failure stub to make the OST-object to back point to"
1502 echo "non-exist MDT-object."
1503 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1505 do_facet ost1 $LCTL set_param fail_loc=0x1611
1506 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1507 cancel_lru_locks osc
1508 do_facet ost1 $LCTL set_param fail_loc=0
1510 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1511 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1513 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1514 mdd.${MDT_DEV}.lfsck_layout |
1515 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1517 error "(2) unexpected status"
1520 local repaired=$($SHOW_LAYOUT |
1521 awk '/^repaired_unmatched_pair/ { print $2 }')
1522 [ $repaired -eq 1 ] ||
1523 error "(3) Fail to repair unmatched pair: $repaired"
1525 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
1529 echo "If the OST-object referenced by the MDT-object back points"
1530 echo "to other MDT-object that doesn't recognize the OST-object,"
1531 echo "then the LFSCK should repair it to back point to the right"
1532 echo "MDT-object (the first one)."
1535 check_mount_and_prep
1536 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1537 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1538 cancel_lru_locks osc
1540 echo "Inject failure stub to make the OST-object to back point to"
1541 echo "other MDT-object"
1543 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1544 do_facet ost1 $LCTL set_param fail_loc=0x1612
1545 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1546 cancel_lru_locks osc
1547 do_facet ost1 $LCTL set_param fail_loc=0
1549 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1550 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1552 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1553 mdd.${MDT_DEV}.lfsck_layout |
1554 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1556 error "(2) unexpected status"
1559 local repaired=$($SHOW_LAYOUT |
1560 awk '/^repaired_unmatched_pair/ { print $2 }')
1561 [ $repaired -eq 1 ] ||
1562 error "(3) Fail to repair unmatched pair: $repaired"
1564 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
1568 echo "If the OST-object's owner information does not match the owner"
1569 echo "information stored in the MDT-object, then the LFSCK trust the"
1570 echo "MDT-object and update the OST-object's owner information."
1573 check_mount_and_prep
1574 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1575 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1576 cancel_lru_locks osc
1578 echo "Inject failure stub to skip OST-object owner changing"
1579 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1580 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1581 chown 1.1 $DIR/$tdir/f0
1582 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1584 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1587 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1589 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1590 mdd.${MDT_DEV}.lfsck_layout |
1591 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1593 error "(2) unexpected status"
1596 local repaired=$($SHOW_LAYOUT |
1597 awk '/^repaired_inconsistent_owner/ { print $2 }')
1598 [ $repaired -eq 1 ] ||
1599 error "(3) Fail to repair inconsistent owner: $repaired"
1601 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
1605 echo "If more than one MDT-objects reference the same OST-object,"
1606 echo "and the OST-object only recognizes one MDT-object, then the"
1607 echo "LFSCK should create new OST-objects for such non-recognized"
1611 check_mount_and_prep
1612 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1614 echo "Inject failure stub to make two MDT-objects to refernce"
1615 echo "the OST-object"
1617 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1618 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1620 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1621 cancel_lru_locks osc
1623 createmany -o $DIR/$tdir/f 1
1625 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1627 cancel_lru_locks mdc
1628 cancel_lru_locks osc
1630 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1631 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1632 [ $size -eq 1048576 ] ||
1633 error "(1) f0 (wrong) size should be 1048576, but got $size"
1635 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1638 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1640 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1641 mdd.${MDT_DEV}.lfsck_layout |
1642 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1644 error "(3) unexpected status"
1647 local repaired=$($SHOW_LAYOUT |
1648 awk '/^repaired_multiple_referenced/ { print $2 }')
1649 [ $repaired -eq 1 ] ||
1650 error "(4) Fail to repair multiple references: $repaired"
1652 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1653 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1654 error "(5) Fail to write f0."
1655 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1656 [ $size -eq 1048576 ] ||
1657 error "(6) guard size should be 1048576, but got $size"
1659 run_test 17 "LFSCK can repair multiple references"
1663 echo "The target MDT-object is there, but related stripe information"
1664 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1665 echo "layout EA entries."
1668 check_mount_and_prep
1669 $LFS mkdir -i 0 $DIR/$tdir/a1
1670 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1671 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1673 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1675 $LFS path2fid $DIR/$tdir/a1/f1
1676 $LFS getstripe $DIR/$tdir/a1/f1
1678 if [ $MDSCOUNT -ge 2 ]; then
1679 $LFS mkdir -i 1 $DIR/$tdir/a2
1680 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1681 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1682 $LFS path2fid $DIR/$tdir/a2/f2
1683 $LFS getstripe $DIR/$tdir/a2/f2
1686 cancel_lru_locks osc
1688 echo "Inject failure, to make the MDT-object lost its layout EA"
1689 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1690 do_facet mds1 $LCTL set_param fail_loc=0x1615
1691 chown 1.1 $DIR/$tdir/a1/f1
1693 if [ $MDSCOUNT -ge 2 ]; then
1694 do_facet mds2 $LCTL set_param fail_loc=0x1615
1695 chown 1.1 $DIR/$tdir/a2/f2
1701 do_facet mds1 $LCTL set_param fail_loc=0
1702 if [ $MDSCOUNT -ge 2 ]; then
1703 do_facet mds2 $LCTL set_param fail_loc=0
1706 cancel_lru_locks mdc
1707 cancel_lru_locks osc
1709 echo "The file size should be incorrect since layout EA is lost"
1710 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1711 [ "$cur_size" != "$saved_size" ] ||
1712 error "(1) Expect incorrect file1 size"
1714 if [ $MDSCOUNT -ge 2 ]; then
1715 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1716 [ "$cur_size" != "$saved_size" ] ||
1717 error "(2) Expect incorrect file2 size"
1720 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1721 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
1723 for k in $(seq $MDSCOUNT); do
1724 # The LFSCK status query interval is 30 seconds. In case some
1725 # LFSCK_NOTIFY RPCs fail or are lost, we wait long enough
1726 # to guarantee that the status has been synchronized.
1727 wait_update_facet mds${k} "$LCTL get_param -n \
1728 mdd.$(facet_svc mds${k}).lfsck_layout |
1729 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1730 error "(4) MDS${k} is not the expected 'completed'"
1733 for k in $(seq $OSTCOUNT); do
1734 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1735 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1736 awk '/^status/ { print $2 }')
1737 [ "$cur_status" == "completed" ] ||
1738 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
1741 local repaired=$(do_facet mds1 $LCTL get_param -n \
1742 mdd.$(facet_svc mds1).lfsck_layout |
1743 awk '/^repaired_orphan/ { print $2 }')
1744 [ $repaired -eq 1 ] ||
1745 error "(6.1) Expect 1 fixed on mds1, but got: $repaired"
1747 if [ $MDSCOUNT -ge 2 ]; then
1748 repaired=$(do_facet mds2 $LCTL get_param -n \
1749 mdd.$(facet_svc mds2).lfsck_layout |
1750 awk '/^repaired_orphan/ { print $2 }')
1751 [ $repaired -eq 2 ] ||
1752 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
1755 $LFS path2fid $DIR/$tdir/a1/f1
1756 $LFS getstripe $DIR/$tdir/a1/f1
1758 if [ $MDSCOUNT -ge 2 ]; then
1759 $LFS path2fid $DIR/$tdir/a2/f2
1760 $LFS getstripe $DIR/$tdir/a2/f2
1763 echo "The file size should be correct after layout LFSCK scanning"
1764 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1765 [ "$cur_size" == "$saved_size" ] ||
1766 error "(7) Expect file1 size $saved_size, but got $cur_size"
1768 if [ $MDSCOUNT -ge 2 ]; then
1769 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1770 [ "$cur_size" == "$saved_size" ] ||
1771 error "(8) Expect file2 size $saved_size, but got $cur_size"
1774 run_test 18a "Find out orphan OST-object and repair it (1)"
1778 echo "The target MDT-object is lost. The LFSCK should re-create the"
1779 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
1780 echo "can move it back to normal namespace manually."
1783 check_mount_and_prep
1784 $LFS mkdir -i 0 $DIR/$tdir/a1
1785 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1786 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1787 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1788 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
1790 $LFS getstripe $DIR/$tdir/a1/f1
1792 if [ $MDSCOUNT -ge 2 ]; then
1793 $LFS mkdir -i 1 $DIR/$tdir/a2
1794 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1795 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1796 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
1798 $LFS getstripe $DIR/$tdir/a2/f2
1801 cancel_lru_locks osc
1803 echo "Inject failure, to simulate the case of missing the MDT-object"
1804 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1805 do_facet mds1 $LCTL set_param fail_loc=0x1616
1806 rm -f $DIR/$tdir/a1/f1
1808 if [ $MDSCOUNT -ge 2 ]; then
1809 do_facet mds2 $LCTL set_param fail_loc=0x1616
1810 rm -f $DIR/$tdir/a2/f2
1816 do_facet mds1 $LCTL set_param fail_loc=0
1817 if [ $MDSCOUNT -ge 2 ]; then
1818 do_facet mds2 $LCTL set_param fail_loc=0
1821 cancel_lru_locks mdc
1822 cancel_lru_locks osc
1824 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1825 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1827 for k in $(seq $MDSCOUNT); do
1828 # The LFSCK status query interval is 30 seconds. In case some
1829 # LFSCK_NOTIFY RPCs fail or are lost, we wait long enough
1830 # to guarantee that the status has been synchronized.
1831 wait_update_facet mds${k} "$LCTL get_param -n \
1832 mdd.$(facet_svc mds${k}).lfsck_layout |
1833 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1834 error "(2) MDS${k} is not the expected 'completed'"
1837 for k in $(seq $OSTCOUNT); do
1838 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1839 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1840 awk '/^status/ { print $2 }')
1841 [ "$cur_status" == "completed" ] ||
1842 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
1845 local repaired=$(do_facet mds1 $LCTL get_param -n \
1846 mdd.$(facet_svc mds1).lfsck_layout |
1847 awk '/^repaired_orphan/ { print $2 }')
1848 [ $repaired -eq 1 ] ||
1849 error "(4.1) Expect 1 fixed on mds1, but got: $repaired"
1851 if [ $MDSCOUNT -ge 2 ]; then
1852 repaired=$(do_facet mds2 $LCTL get_param -n \
1853 mdd.$(facet_svc mds2).lfsck_layout |
1854 awk '/^repaired_orphan/ { print $2 }')
1855 [ $repaired -eq 2 ] ||
1856 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
1859 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
1860 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
1861 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
1863 if [ $MDSCOUNT -ge 2 ]; then
1864 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
1865 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
1868 $LFS path2fid $DIR/$tdir/a1/f1
1869 $LFS getstripe $DIR/$tdir/a1/f1
1871 if [ $MDSCOUNT -ge 2 ]; then
1872 $LFS path2fid $DIR/$tdir/a2/f2
1873 $LFS getstripe $DIR/$tdir/a2/f2
1876 echo "The file size should be correct after layout LFSCK scanning"
1877 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1878 [ "$cur_size" == "$saved_size" ] ||
1879 error "(7) Expect file1 size $saved_size, but got $cur_size"
1881 if [ $MDSCOUNT -ge 2 ]; then
1882 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1883 [ "$cur_size" == "$saved_size" ] ||
1884 error "(8) Expect file2 size $saved_size, but got $cur_size"
1887 run_test 18b "Find out orphan OST-object and repair it (2)"
1891 echo "The target MDT-object is lost, and the OST-object FID is missing."
1892 echo "The LFSCK should re-create the MDT-object with new FID under the "
1893 echo "directory .lustre/lost+found/MDTxxxx."
1896 check_mount_and_prep
1897 $LFS mkdir -i 0 $DIR/$tdir/a1
1898 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1900 echo "Inject failure, to simulate the case of missing parent FID"
1901 #define OBD_FAIL_LFSCK_NOPFID 0x1617
1902 do_facet ost1 $LCTL set_param fail_loc=0x1617
1904 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1905 $LFS getstripe $DIR/$tdir/a1/f1
1907 if [ $MDSCOUNT -ge 2 ]; then
1908 $LFS mkdir -i 1 $DIR/$tdir/a2
1909 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a2
1910 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1911 $LFS getstripe $DIR/$tdir/a2/f2
1914 cancel_lru_locks osc
1916 echo "Inject failure, to simulate the case of missing the MDT-object"
1917 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1918 do_facet mds1 $LCTL set_param fail_loc=0x1616
1919 rm -f $DIR/$tdir/a1/f1
1921 if [ $MDSCOUNT -ge 2 ]; then
1922 do_facet mds2 $LCTL set_param fail_loc=0x1616
1923 rm -f $DIR/$tdir/a2/f2
1929 do_facet mds1 $LCTL set_param fail_loc=0
1930 if [ $MDSCOUNT -ge 2 ]; then
1931 do_facet mds2 $LCTL set_param fail_loc=0
1934 cancel_lru_locks mdc
1935 cancel_lru_locks osc
1937 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1938 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1940 for k in $(seq $MDSCOUNT); do
1941 # The LFSCK status query interval is 30 seconds. In case some
1942 # LFSCK_NOTIFY RPCs fail or are lost, we wait long enough
1943 # to guarantee that the status has been synchronized.
1944 wait_update_facet mds${k} "$LCTL get_param -n \
1945 mdd.$(facet_svc mds${k}).lfsck_layout |
1946 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1947 error "(2) MDS${k} is not the expected 'completed'"
1950 for k in $(seq $OSTCOUNT); do
1951 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1952 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1953 awk '/^status/ { print $2 }')
1954 [ "$cur_status" == "completed" ] ||
1955 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
1958 if [ $MDSCOUNT -ge 2 ]; then
1964 local repaired=$(do_facet mds1 $LCTL get_param -n \
1965 mdd.$(facet_svc mds1).lfsck_layout |
1966 awk '/^repaired_orphan/ { print $2 }')
1967 [ $repaired -eq $expected ] ||
1968 error "(4) Expect $expected fixed on mds1, but got: $repaired"
1970 if [ $MDSCOUNT -ge 2 ]; then
1971 repaired=$(do_facet mds2 $LCTL get_param -n \
1972 mdd.$(facet_svc mds2).lfsck_layout |
1973 awk '/^repaired_orphan/ { print $2 }')
1974 [ $repaired -eq 0 ] ||
1975 error "(5) Expect 0 fixed on mds2, but got: $repaired"
1978 ls -ail $MOUNT/.lustre/lost+found/
1980 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
1981 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
1982 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name *-N-*)
1984 error "(6) .lustre/lost+found/MDT0001/ should be empty"
1987 echo "There should be some stub under .lustre/lost+found/MDT0000/"
1988 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
1989 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
1991 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-N-*)
1992 [ ! -z "$cname" ] ||
1993 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
1995 run_test 18c "Find out orphan OST-object and repair it (3)"
# test_18d: orphan OST-object vs. an unmodified conflict OST-object.
#
# f2's layout is made to reference f1's OST-object, then f1 is removed so
# that f2 becomes a dangling reference while f2's original OST-object still
# exists as an orphan.  The layout LFSCK is expected to report exactly one
# repaired orphan and f2 must get its original size back.
test_18d() {
	echo "#####"
	echo "The target MDT-object layout EA slot is occpuied by some new"
	echo "created OST-object when repair dangling reference case. Such"
	echo "conflict OST-object has never been modified. Then when found"
	echo "the orphan OST-object, LFSCK will replace it with the orphan"
	echo "OST-object."
	echo "#####"

	local k
	local cur_status
	local cur_size
	local repaired

	check_mount_and_prep
	mkdir $DIR/$tdir/a1
	$LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
	echo "guard" > $DIR/$tdir/a1/f1
	echo "foo" > $DIR/$tdir/a1/f2
	# Ask stat for the size directly instead of parsing 'ls' columns.
	local saved_size=$(stat -c %s $DIR/$tdir/a1/f2)
	$LFS path2fid $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a1/f1
	$LFS path2fid $DIR/$tdir/a1/f2
	$LFS getstripe $DIR/$tdir/a1/f2
	cancel_lru_locks osc

	echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
	echo "to reference the same OST-object (which is f1's OST-obejct)."
	echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
	echo "dangling reference case, but f2's old OST-object is there."
	echo "#####"

	#define OBD_FAIL_LFSCK_CHANGE_STRIPE	0x1618
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
	# ':' is the portable owner:group separator; '.' is deprecated.
	chown 1:1 $DIR/$tdir/a1/f2
	rm -f $DIR/$tdir/a1/f1
	sync
	sleep 2
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "stopall to cleanup object cache"
	stopall > /dev/null
	echo "setupall"
	setupall > /dev/null

	echo "The file size should be incorrect since dangling referenced"
	cur_size=$(stat -c %s $DIR/$tdir/a1/f2)
	[ "$cur_size" != "$saved_size" ] ||
		error "(1) Expect incorrect file2 size"

	#define OBD_FAIL_LFSCK_DELAY3		0x1602
	do_facet $SINGLEMDS $LCTL set_param fail_val=5 fail_loc=0x1602

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"

	wait_update_facet mds1 "$LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_layout |
		awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
		error "(3.0) MDS1 is not the expected 'scanning-phase2'"

	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(3) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(4) OST${k} Expect 'completed', but got '$cur_status'"
	done

	repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
			mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
			awk '/^repaired_orphan/ { print $2 }')
	# Quoted so that an empty value fails the test cleanly.
	[ "$repaired" -eq 1 ] ||
		error "(5) Expect 1 orphan has been fixed, but got: $repaired"

	echo "The file size should be correct after layout LFSCK scanning"
	cur_size=$(stat -c %s $DIR/$tdir/a1/f2)
	[ "$cur_size" == "$saved_size" ] ||
		error "(6) Expect file2 size $saved_size, but got $cur_size"

	echo "The LFSCK should find back the original data."
	cat $DIR/$tdir/a1/f2
	$LFS path2fid $DIR/$tdir/a1/f2
	$LFS getstripe $DIR/$tdir/a1/f2
}
run_test 18d "Find out orphan OST-object and repair it (4)"
# test_18e: orphan OST-object vs. a conflict OST-object that was modified.
#
# Same setup as test_18d, but new data is appended to f2 while the layout
# LFSCK is held in scanning-phase2.  The LFSCK is then expected to keep f2
# as it is and to expose the old orphan OST-object through a "*-C-*" stub
# under .lustre/lost+found/MDT0000/ that has the original f2 size.
test_18e() {
	echo "#####"
	echo "The target MDT-object layout EA slot is occpuied by some new"
	echo "created OST-object when repair dangling reference case. Such"
	echo "conflict OST-object has been modified by others. To keep the"
	echo "new data, the LFSCK will create a new file to refernece this"
	echo "old orphan OST-object."
	echo "#####"

	local k
	local cur_status
	local cur_size
	local repaired
	local cname

	check_mount_and_prep
	mkdir $DIR/$tdir/a1
	$LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
	echo "guard" > $DIR/$tdir/a1/f1
	echo "foo" > $DIR/$tdir/a1/f2
	# Ask stat for the size directly instead of parsing 'ls' columns.
	local saved_size=$(stat -c %s $DIR/$tdir/a1/f2)
	$LFS path2fid $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a1/f1
	$LFS path2fid $DIR/$tdir/a1/f2
	$LFS getstripe $DIR/$tdir/a1/f2
	cancel_lru_locks osc

	echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
	echo "to reference the same OST-object (which is f1's OST-obejct)."
	echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
	echo "dangling reference case, but f2's old OST-object is there."
	echo "#####"

	#define OBD_FAIL_LFSCK_CHANGE_STRIPE	0x1618
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
	# ':' is the portable owner:group separator; '.' is deprecated.
	chown 1:1 $DIR/$tdir/a1/f2
	rm -f $DIR/$tdir/a1/f1
	sync
	sleep 2
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "stopall to cleanup object cache"
	stopall > /dev/null
	echo "setupall"
	setupall > /dev/null

	echo "The file size should be incorrect since dangling referenced"
	cur_size=$(stat -c %s $DIR/$tdir/a1/f2)
	[ "$cur_size" != "$saved_size" ] ||
		error "(1) Expect incorrect file2 size"

	#define OBD_FAIL_LFSCK_DELAY3		0x1602
	do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"

	wait_update_facet mds1 "$LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_layout |
		awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
		error "(3) MDS1 is not the expected 'scanning-phase2'"

	# to guarantee all updates are synced.
	sync
	sleep 2

	echo "Write new data to f2 to modify the new created OST-object."
	echo "dummy" >> $DIR/$tdir/a1/f2

	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(4) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(5) OST${k} Expect 'completed', but got '$cur_status'"
	done

	repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
			mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
			awk '/^repaired_orphan/ { print $2 }')
	# Quoted so that an empty value fails the test cleanly.
	[ "$repaired" -eq 1 ] ||
		error "(6) Expect 1 orphan has been fixed, but got: $repaired"

	echo "There should be stub file under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	# The pattern is quoted so that find sees it, rather than the shell
	# expanding it against the current working directory.
	cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*')
	[ -n "$cname" ] ||
		error "(8) .lustre/lost+found/MDT0000/ should not be empty"

	echo "The stub file should keep the original f2 data"
	cur_size=$(stat -c %s "$cname")
	[ "$cur_size" == "$saved_size" ] ||
		error "(9) Expect file2 size $saved_size, but got $cur_size"

	cat "$cname"
	$LFS path2fid "$cname"
	$LFS getstripe "$cname"

	echo "The f2 should contains new data."
	cat $DIR/$tdir/a1/f2
	$LFS path2fid $DIR/$tdir/a1/f2
	$LFS getstripe $DIR/$tdir/a1/f2
}
run_test 18e "Find out orphan OST-object and repair it (5)"
# test_18f: orphan handling must skip an OST that failed phase-1 verification.
#
# MDT-objects of files striped over OST0 only (a1, a3) and over two OSTs
# (a2, a4) are dropped.  The first layout LFSCK runs with a fail_loc armed on
# mds1 and is expected to end as 'partial' on every MDT and on ost1,
# 'completed' on ost2, with one repaired orphan per MDT.  A second, clean run
# is expected to complete everywhere with two repaired orphans per MDT.
# Needs at least 2 OSTs; the mds2 parts only run when MDSCOUNT >= 2.
test_18f() {
	[ $OSTCOUNT -lt 2 ] &&
		skip "The test needs at least 2 OSTs" && return

	echo "#####"
	echo "The target MDT-object is lost. The LFSCK should re-create the"
	echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
	echo "to verify some OST-object(s) during the first stage-scanning,"
	echo "the LFSCK should skip orphan OST-objects for such OST. Others"
	echo "should not be affected."
	echo "#####"

	# Declared once; originally cur_status and k leaked to global scope
	# and repaired was declared twice.
	local k
	local cur_status
	local repaired

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/a1
	$LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
	dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
	dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
	$LFS mkdir -i 0 $DIR/$tdir/a2
	$LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
	dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
	$LFS getstripe $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a2/f2

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS mkdir -i 1 $DIR/$tdir/a3
		$LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a3
		dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
		dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
		$LFS mkdir -i 1 $DIR/$tdir/a4
		$LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
		dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
		$LFS getstripe $DIR/$tdir/a3/f3
		$LFS getstripe $DIR/$tdir/a4/f4
	fi

	cancel_lru_locks osc

	echo "Inject failure, to simulate the case of missing the MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/a1/f1
	rm -f $DIR/$tdir/a2/f2

	if [ $MDSCOUNT -ge 2 ]; then
		do_facet mds2 $LCTL set_param fail_loc=0x1616
		rm -f $DIR/$tdir/a3/f3
		rm -f $DIR/$tdir/a4/f4
	fi

	sync
	sleep 2

	do_facet mds1 $LCTL set_param fail_loc=0
	if [ $MDSCOUNT -ge 2 ]; then
		do_facet mds2 $LCTL set_param fail_loc=0
	fi

	cancel_lru_locks mdc
	cancel_lru_locks osc

	echo "Inject failure, to simulate the OST0 fail to handle"
	echo "MDT0 LFSCK request during the first-stage scanning."
	#define OBD_FAIL_LFSCK_BAD_NETWORK	0x161c
	do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
			error "(2) MDS${k} is not the expected 'partial'"
	done

	wait_update_facet ost1 "$LCTL get_param -n \
		obdfilter.$(facet_svc ost1).lfsck_layout |
		awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
		error "(3) OST1 is not the expected 'partial'"

	wait_update_facet ost2 "$LCTL get_param -n \
		obdfilter.$(facet_svc ost2).lfsck_layout |
		awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
		error "(4) OST2 is not the expected 'completed'"

	do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0

	repaired=$(do_facet mds1 $LCTL get_param -n \
			mdd.$(facet_svc mds1).lfsck_layout |
			awk '/^repaired_orphan/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(5) Expect 1 fixed on mds1, but got: $repaired"

	if [ $MDSCOUNT -ge 2 ]; then
		repaired=$(do_facet mds2 $LCTL get_param -n \
				mdd.$(facet_svc mds2).lfsck_layout |
				awk '/^repaired_orphan/ { print $2 }')
		[ "$repaired" -eq 1 ] ||
		error "(6) Expect 1 fixed on mds2, but got: $repaired"
	fi

	echo "Trigger layout LFSCK on all devices again to cleanup"
	$START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(8) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(9) OST${k} Expect 'completed', but got '$cur_status'"
	done

	repaired=$(do_facet mds1 $LCTL get_param -n \
			mdd.$(facet_svc mds1).lfsck_layout |
			awk '/^repaired_orphan/ { print $2 }')
	[ "$repaired" -eq 2 ] ||
		error "(10) Expect 2 fixed on mds1, but got: $repaired"

	if [ $MDSCOUNT -ge 2 ]; then
		repaired=$(do_facet mds2 $LCTL get_param -n \
				mdd.$(facet_svc mds2).lfsck_layout |
				awk '/^repaired_orphan/ { print $2 }')
		[ "$repaired" -eq 2 ] ||
		error "(11) Expect 2 fixed on mds2, but got: $repaired"
	fi
}
run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# test_19a: with lfsck_verify_pfid enabled on OST0000, a read that carries a
# wrong parent FID (injected on the client via fail_loc 0x1619) must fail.
test_19a() {
	local rc=0

	check_mount_and_prep
	$LFS setstripe -c 1 -i 0 $DIR/$tdir

	echo "foo" > $DIR/$tdir/a0
	echo "guard" > $DIR/$tdir/a1
	cancel_lru_locks osc

	echo "Inject failure, then client will offer wrong parent FID when read"
	do_facet ost1 $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
	#define OBD_FAIL_LFSCK_INVALID_PFID	0x1619
	$LCTL set_param fail_loc=0x1619

	echo "Read RPC with wrong parent FID should be denied"
	cat $DIR/$tdir/a0 || rc=$?
	# Disarm the injection before judging the result, so that a failed
	# assertion cannot leave the client fail_loc set.
	$LCTL set_param fail_loc=0
	[ $rc -ne 0 ] || error "(3) Read should be denied!"
}
run_test 19a "OST-object inconsistency self detect"
# test_19b: an OST-object written with a bad parent FID (fail_loc 0x1611 on
# ost1) must not be counted as repaired before lfsck_verify_pfid is switched
# on, and must be counted once after a read with it switched on.
test_19b() {
	local repaired

	check_mount_and_prep
	$LFS setstripe -c 1 -i 0 $DIR/$tdir

	echo "Inject failure stub to make the OST-object to back point to"
	echo "non-exist MDT-object"

	#define OBD_FAIL_LFSCK_UNMATCHED_PAIR1	0x1611
	do_facet ost1 $LCTL set_param fail_loc=0x1611
	echo "foo" > $DIR/$tdir/f0
	cancel_lru_locks osc
	do_facet ost1 $LCTL set_param fail_loc=0

	echo "Nothing should be fixed since self detect and repair is disabled"
	repaired=$(do_facet ost1 $LCTL get_param -n \
			obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
			awk '/^repaired/ { print $2 }')
	# Quoted so that an empty value fails the test cleanly.
	[ "$repaired" -eq 0 ] ||
		error "(1) Expected 0 repaired, but got $repaired"

	echo "Read RPC with right parent FID should be accepted,"
	echo "and cause parent FID on OST to be fixed"

	do_facet ost1 $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
	cat $DIR/$tdir/f0 || error "(2) Read should not be denied!"

	repaired=$(do_facet ost1 $LCTL get_param -n \
			obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
			awk '/^repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(3) Expected 1 repaired, but got $repaired"
}
run_test 19b "OST-object inconsistency self repair"
# test_20: orphans whose rebuilt LOV EA contains a hole.
#
# f0 loses only its MDT-object; f1, f2 (and f3 when there are more than two
# OSTs) lose the MDT-object plus the OST-object of stripe 0, 1 (and 2).  After
# the layout LFSCK the re-created files are looked up as
# .lustre/lost+found/MDT0000/<fid>-R-0 and checked for pattern flag, stripe
# count, size, and for which read/write operations succeed or fail.
# Needs at least 2 OSTs.
test_20() {
	[ $OSTCOUNT -lt 2 ] &&
		skip "The test needs at least 2 OSTs" && return

	echo "#####"
	echo "The target MDT-object and some of its OST-object are lost."
	echo "The LFSCK should find out the left OST-objects and re-create"
	echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
	echo "with the partial OST-objects (LOV EA hole)."

	echo "New client can access the file with LOV EA hole via normal"
	echo "system tools or commands without crash the system."

	echo "For old client, even though it cannot access the file with"
	echo "LOV EA hole, it should not cause the system crash."
	echo "#####"

	# bcount and fid3 used to leak into the global scope.
	local bcount
	local fid3
	local k
	local cur_status

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/a1
	if [ $OSTCOUNT -gt 2 ]; then
		$LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
		bcount=513
	else
		$LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
		bcount=257
	fi

	# 256 blocks on the stripe0.
	# 1 block on the stripe1 for 2 OSTs case.
	# 256 blocks on the stripe1 for other cases.
	# 1 block on the stripe2 if OSTs > 2
	dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount

	local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
	local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
	local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)

	# FIDs are printed quoted: they contain '[' and ']', which the shell
	# would otherwise treat as a glob pattern.
	echo "${fid0}"
	$LFS getstripe $DIR/$tdir/a1/f0
	echo "${fid1}"
	$LFS getstripe $DIR/$tdir/a1/f1
	echo "${fid2}"
	$LFS getstripe $DIR/$tdir/a1/f2

	if [ $OSTCOUNT -gt 2 ]; then
		dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
		fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
		echo "${fid3}"
		$LFS getstripe $DIR/$tdir/a1/f3
	fi

	cancel_lru_locks osc

	echo "Inject failure..."
	echo "To simulate f0 lost MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/a1/f0

	echo "To simulate f1 lost MDT-object and OST-object0"
	#define OBD_FAIL_LFSCK_LOST_SPEOBJ	0x161a
	do_facet mds1 $LCTL set_param fail_loc=0x161a
	rm -f $DIR/$tdir/a1/f1

	echo "To simulate f2 lost MDT-object and OST-object1"
	do_facet mds1 $LCTL set_param fail_val=1
	rm -f $DIR/$tdir/a1/f2

	if [ $OSTCOUNT -gt 2 ]; then
		echo "To simulate f3 lost MDT-object and OST-object2"
		do_facet mds1 $LCTL set_param fail_val=2
		rm -f $DIR/$tdir/a1/f3
	fi

	umount_client $MOUNT
	sync
	sleep 2
	do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0

	echo "Inject failure to slow down the LFSCK on OST0"
	#define OBD_FAIL_LFSCK_DELAY5		0x161b
	do_facet ost1 $LCTL set_param fail_loc=0x161b

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"

	sleep 3
	do_facet ost1 $LCTL set_param fail_loc=0

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(2) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(3) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	if [ $OSTCOUNT -gt 2 ]; then
		[ "$repaired" -eq 9 ] ||
			error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
	else
		[ "$repaired" -eq 4 ] ||
			error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
	fi

	mount_client $MOUNT || error "(5.0) Fail to start client!"

	LOV_PATTERN_F_HOLE=0x40000000

	#
	# ${fid0}-R-0 is the old f0
	#
	# $name embeds a bracketed FID, so every use below is quoted to keep
	# the shell from treating it as a glob pattern.
	local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
	echo "Check $name, which is the old f0"

	$LFS getstripe -v "$name" || error "(5.1) cannot getstripe on $name"

	local pattern=0x$($LFS getstripe -L "$name")
	[[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
		error "(5.2) NOT expect pattern flag hole, but got $pattern"

	local stripes=$($LFS getstripe -c "$name")
	if [ $OSTCOUNT -gt 2 ]; then
		[ "$stripes" -eq 3 ] ||
		error "(5.3.1) expect the stripe count is 3, but got $stripes"
	else
		[ "$stripes" -eq 2 ] ||
		error "(5.3.2) expect the stripe count is 2, but got $stripes"
	fi

	local size=$(stat -c %s "$name")
	[ "$size" -eq $((4096 * bcount)) ] ||
		error "(5.4) expect the size $((4096 * bcount)), but got $size"

	cat "$name" > /dev/null || error "(5.5) cannot read $name"

	echo "dummy" >> "$name" || error "(5.6) cannot write $name"

	chown $RUNAS_ID:$RUNAS_GID "$name" ||
		error "(5.7) cannot chown on $name"

	touch "$name" || error "(5.8) cannot touch $name"

	rm -f "$name" || error "(5.9) cannot unlink $name"

	#
	# ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
	if [ $OSTCOUNT -gt 2 ]; then
		echo "Check $name, it contains the old f1's stripe1 and stripe2"
	else
		echo "Check $name, it contains the old f1's stripe1"
	fi

	$LFS getstripe -v "$name" || error "(6.1) cannot getstripe on $name"

	pattern=0x$($LFS getstripe -L "$name")
	[[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
		error "(6.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c "$name")
	if [ $OSTCOUNT -gt 2 ]; then
		[ "$stripes" -eq 3 ] ||
		error "(6.3.1) expect the stripe count is 3, but got $stripes"
	else
		[ "$stripes" -eq 2 ] ||
		error "(6.3.2) expect the stripe count is 2, but got $stripes"
	fi

	size=$(stat -c %s "$name")
	[ "$size" -eq $((4096 * bcount)) ] ||
		error "(6.4) expect the size $((4096 * bcount)), but got $size"

	cat "$name" > /dev/null && error "(6.5) normal read $name should fail"

	local failures=$(dd if="$name" of=$DIR/$tdir/dump conv=sync,noerror \
			 bs=4096 2>&1 | grep -c "Input/output error")

	# stripe0 is dummy
	[ "$failures" -eq 256 ] ||
		error "(6.6) expect 256 IO failures, but get $failures"

	size=$(stat -c %s $DIR/$tdir/dump)
	[ "$size" -eq $((4096 * bcount)) ] ||
		error "(6.7) expect the size $((4096 * bcount)), but got $size"

	dd if=/dev/zero of="$name" conv=sync,notrunc bs=4096 count=1 &&
		error "(6.8) write to the LOV EA hole should fail"

	dd if=/dev/zero of="$name" conv=sync,notrunc bs=4096 count=1 seek=300 ||
		error "(6.9) write to normal stripe should NOT fail"

	echo "foo" >> "$name" && error "(6.10) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID "$name" ||
		error "(6.11) cannot chown on $name"

	touch "$name" || error "(6.12) cannot touch $name"

	rm -f "$name" || error "(6.13) cannot unlink $name"

	#
	# ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
	if [ $OSTCOUNT -gt 2 ]; then
		echo "Check $name, it contains the old f2's stripe0 and stripe2"
	else
		echo "Check $name, it contains the old f2's stripe0"
	fi

	$LFS getstripe -v "$name" || error "(7.1) cannot getstripe on $name"

	pattern=0x$($LFS getstripe -L "$name")
	stripes=$($LFS getstripe -c "$name")
	size=$(stat -c %s "$name")
	if [ $OSTCOUNT -gt 2 ]; then
		[[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
		error "(7.2.1) expect pattern flag hole, but got $pattern"

		[ "$stripes" -eq 3 ] ||
		error "(7.3.1) expect the stripe count is 3, but got $stripes"

		[ "$size" -eq $((4096 * bcount)) ] ||
		error "(7.4.1) expect size $((4096 * bcount)), but got $size"

		cat "$name" > /dev/null &&
			error "(7.5.1) normal read $name should fail"

		failures=$(dd if="$name" of=$DIR/$tdir/dump conv=sync,noerror \
			   bs=4096 2>&1 | grep -c "Input/output error")
		# stripe1 is dummy
		[ "$failures" -eq 256 ] ||
			error "(7.6) expect 256 IO failures, but get $failures"

		size=$(stat -c %s $DIR/$tdir/dump)
		[ "$size" -eq $((4096 * bcount)) ] ||
		error "(7.7) expect the size $((4096 * bcount)), but got $size"

		dd if=/dev/zero of="$name" conv=sync,notrunc bs=4096 count=1 \
		seek=300 && error "(7.8.0) write to the LOV EA hole should fail"

		dd if=/dev/zero of="$name" conv=sync,notrunc bs=4096 count=1 ||
		error "(7.8.1) write to normal stripe should NOT fail"

		echo "foo" >> "$name" &&
			error "(7.8.3) append write $name should fail"

		chown $RUNAS_ID:$RUNAS_GID "$name" ||
			error "(7.9.1) cannot chown on $name"

		touch "$name" || error "(7.10.1) cannot touch $name"
	else
		[[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
		error "(7.2.2) NOT expect pattern flag hole, but got $pattern"

		[ "$stripes" -eq 1 ] ||
		error "(7.3.2) expect the stripe count is 1, but got $stripes"

		# stripe1 is dummy
		[ "$size" -eq $((4096 * (256 + 0))) ] ||
		error "(7.4.2) expect the size $((4096 * 256)), but got $size"

		cat "$name" > /dev/null || error "(7.5.2) cannot read $name"

		echo "dummy" >> "$name" || error "(7.8.2) cannot write $name"

		chown $RUNAS_ID:$RUNAS_GID "$name" ||
			error "(7.9.2) cannot chown on $name"

		touch "$name" || error "(7.10.2) cannot touch $name"
	fi

	rm -f "$name" || error "(7.11) cannot unlink $name"

	[ $OSTCOUNT -le 2 ] && return

	#
	# ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
	echo "Check $name, which contains the old f3's stripe0 and stripe1"

	$LFS getstripe -v "$name" || error "(8.1) cannot getstripe on $name"

	pattern=0x$($LFS getstripe -L "$name")
	[[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
		error "(8.2) NOT expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c "$name")
	# LFSCK does not know the old f3 had 3 stripes.
	# It only tries to find as much as possible.
	# The stripe count depends on the last stripe's offset.
	[ "$stripes" -eq 2 ] ||
		error "(8.3) expect the stripe count is 2, but got $stripes"

	size=$(stat -c %s "$name")
	# stripe2 is lost
	[ "$size" -eq $((4096 * (256 + 256 + 0))) ] ||
		error "(8.4) expect the size $((4096 * 512)), but got $size"

	cat "$name" > /dev/null || error "(8.5) cannot read $name"

	echo "dummy" >> "$name" || error "(8.6) cannot write $name"

	chown $RUNAS_ID:$RUNAS_GID "$name" ||
		error "(8.7) cannot chown on $name"

	touch "$name" || error "(8.8) cannot touch $name"

	rm -f "$name" || error "(8.9) cannot unlink $name"
}
run_test 20 "Handle the orphan with dummy LOV EA slot properly"
# test_21: lfsck_start without "-t" must start every LFSCK component.
#
# Starts LFSCK on MDT0000 with a speed limit of 1 so the scan is still in
# phase 1 when the namespace and layout status are sampled, then stops it.
# Skipped when the MDS is older than 2.5.59.
test_21() {
	[[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
		skip "ignore the test if MDS is older than 2.5.59" && return

	local STATUS

	check_mount_and_prep
	createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"

	echo "Start all LFSCK components by default (-s 1)"
	# MDT_DEV is the file-wide name for ${FSNAME}-MDT0000.
	do_facet mds1 $LCTL lfsck_start -M ${MDT_DEV} -s 1 -r ||
		error "Fail to start LFSCK"

	echo "namespace LFSCK should be in 'scanning-phase1' status"
	STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
	[ "$STATUS" == "scanning-phase1" ] ||
		error "Expect namespace 'scanning-phase1', but got '$STATUS'"

	echo "layout LFSCK should be in 'scanning-phase1' status"
	STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
	[ "$STATUS" == "scanning-phase1" ] ||
		error "Expect layout 'scanning-phase1', but got '$STATUS'"

	echo "Stop all LFSCK components by default"
	do_facet mds1 $LCTL lfsck_stop -M ${MDT_DEV} ||
		error "Fail to stop LFSCK"
}
run_test 21 "run all LFSCK components by default"
# test_22a: repair a child directory whose ".." points at a missing parent.
#
# foo/dummy is created on MDT0 with fail_loc 0x161e armed so that its dotdot
# entry references "guard"; guard is then removed.  The namespace LFSCK is
# expected to report one repaired unmatched pair, after which listing
# foo/dummy must succeed.  Needs at least 2 MDSes.
test_22a() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "We need at least 2 MDSes for this test" && return

	echo "#####"
	echo "The parent_A references the child directory via some name entry,"
	echo "but the child directory back references another parent_B via its"
	# The inner quotes are escaped; unescaped they ended the string and
	# were silently dropped from the output.
	echo "\"..\" name entry. The parent_B does not exist. Then the namesapce"
	echo "LFSCK will repair the child directory's \"..\" name entry."
	echo "#####"

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "The dummy's dotdot name entry references the guard."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the namespace LFSCK state to help diagnose the failure.
		$SHOW_NAMESPACE
		error "(6) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^unmatched_pairs_repaired/ { print $2 }')
	# Quoted so that an empty value fails the test cleanly.
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "'ls' should success after namespace LFSCK repairing"
	ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
		error "(8) ls should success."
}
run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b: repair a child directory whose ".." and linkEA point at an
# existing parent that holds no matching name entry.
#
# foo/dummy is created on MDT0 with fail_loc 0x161e armed.  Before the
# namespace LFSCK, fid2path on dummy's FID must not resolve to foo/dummy;
# after one repaired unmatched pair it must.  Needs at least 2 MDSes.
test_22b() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "We need at least 2 MDSes for this test" && return

	echo "#####"
	echo "The parent_A references the child directory via the name entry_B,"
	echo "but the child directory back references another parent_C via its"
	# The inner quotes are escaped; unescaped they ended the string and
	# were silently dropped from the output.
	echo "\"..\" name entry. The parent_C exists, but there is no the name"
	echo "entry_B under the parent_C. Then the namesapce LFSCK will repair"
	echo "the child directory's \"..\" name entry and its linkEA."
	echo "#####"

	local dummyfid
	local dummyname
	local repaired

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "and bad linkEA. The dummy's dotdot name entry references the"
	echo "guard. The dummy's linkEA references n non-exist name entry."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
	echo "fid2path should NOT work on the dummy's FID $dummyfid"
	# The FID is quoted: a bracketed FID is a valid glob pattern.
	dummyname=$($LFS fid2path $DIR "$dummyfid")
	[ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
		error "(4) fid2path works unexpectedly."

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the namespace LFSCK state to help diagnose the failure.
		$SHOW_NAMESPACE
		error "(6) unexpected status"
	}

	repaired=$($SHOW_NAMESPACE |
		   awk '/^unmatched_pairs_repaired/ { print $2 }')
	# Quoted so that an empty value fails the test cleanly.
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
	dummyname=$($LFS fid2path $DIR "$dummyfid")
	[ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
		error "(8) fid2path does not work"
}
run_test 22b "LFSCK can repair unmatched pairs (2)"
# test_23a: dangling name entry whose MDT-object is gone.
#
# d0/d1 lives on MDT1 and is removed with fail_loc 0x1620 armed on mds2.
# A first namespace LFSCK (without -C) is expected to count one dangling
# entry while 'ls' keeps failing; a second run with -C is expected to count
# one again and make 'ls' succeed.  Needs at least 2 MDSes.
test_23a() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "We need at least 2 MDSes for this test" && return

	echo "#####"
	echo "The name entry is there, but the MDT-object for such name "
	echo "entry does not exist. The namespace LFSCK should find out "
	echo "and repair the inconsistency as required."
	echo "#####"

	local repaired

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
	$LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"

	echo "Inject failure stub on MDT1 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING2	0x1620
	do_facet mds2 $LCTL set_param fail_loc=0x1620
	rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
	do_facet mds2 $LCTL set_param fail_loc=0

	echo "'ls' should fail because of dangling name entry"
	ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the namespace LFSCK state to help diagnose the failure.
		$SHOW_NAMESPACE
		error "(6) unexpected status"
	}

	repaired=$($SHOW_NAMESPACE |
		   awk '/^dangling_repaired/ { print $2 }')
	# Quoted so that an empty value fails the test cleanly.
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair dangling name entry: $repaired"

	echo "'ls' should fail because not re-create MDT-object by default"
	ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."

	echo "Trigger namespace LFSCK again to repair dangling name entry"
	$START_NAMESPACE -A -r -C ||
		error "(9) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the namespace LFSCK state to help diagnose the failure.
		$SHOW_NAMESPACE
		error "(10) unexpected status"
	}

	repaired=$($SHOW_NAMESPACE |
		   awk '/^dangling_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(11) Fail to repair dangling name entry: $repaired"

	echo "'ls' should success after namespace LFSCK repairing"
	ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
}
run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b: a hard link is created while fail_loc 0x1621 is set on
# $SINGLEMDS, the namespace LFSCK is started with "-r -C", and the test then
# checks the dangling_repaired and multiple_linked_repaired counters and the
# content read back through the link.
2927 echo "The objectA has multiple hard links, one of them corresponding"
2928 echo "to the name entry_B. But there is something wrong for the name"
2929 echo "entry_B and cause entry_B to references non-exist object_C."
2930 echo "In the first-stage scanning, the LFSCK will think the entry_B"
2931 echo "as dangling, and re-create the lost object_C. When the LFSCK"
2932 echo "comes to the second-stage scanning, it will find that the"
2933 echo "former re-creating object_C is not proper, and will try to"
2934 echo "replace the object_C with the real object_A."
# Fixture: directory d0 (created with "$LFS mkdir -i 0") holding f0 with
# content "dummy" and f1 with content "dead".
2937 check_mount_and_prep
2939 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
2940 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
2941 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
2943 echo "Inject failure stub on MDT0 to simulate dangling name entry"
2944 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
# The fail_loc is active only while the hard link "foo" is created and is
# cleared immediately afterwards.
2945 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
2946 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
2947 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2949 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
2951 echo "'ls' should fail because of dangling name entry"
# Pre-condition: listing the link must fail before the LFSCK runs.
2952 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
2953 error "(6) ls should fail."
2955 echo "Trigger namespace LFSCK to find out dangling name entry"
2956 $START_NAMESPACE -r -C ||
2957 error "(7) Fail to start LFSCK for namespace"
# Compare the "status" field of lfsck_namespace with "completed"; the
# trailing 32 is presumably the wait limit -- confirm in wait_update_facet.
2959 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2960 mdd.${MDT_DEV}.lfsck_namespace |
2961 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2963 error "(8) unexpected status"
# Exactly one dangling name entry repair is expected.
2966 local repaired=$($SHOW_NAMESPACE |
2967 awk '/^dangling_repaired/ { print $2 }')
2968 [ $repaired -eq 1 ] ||
2969 error "(9) Fail to repair dangling name entry: $repaired"
# "repaired" is reused for the multiple_linked_repaired counter, which must
# also be exactly one.
2971 repaired=$($SHOW_NAMESPACE |
2972 awk '/^multiple_linked_repaired/ { print $2 }')
2973 [ $repaired -eq 1 ] ||
2974 error "(10) Fail to drop the former created object: $repaired"
# The link must read back the content that was written to f0.
2976 local data=$(cat $DIR/$tdir/d0/foo)
2977 [ "$data" == "dummy" ] ||
2978 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
2980 run_test 23b "LFSCK can repair dangling name entry (2)"
# Test 23c: same injection as 23b (fail_loc 0x1621 around the hard link),
# but the LFSCK is started while fail_loc 0x1602 with fail_val=10 is set,
# and the test appends data to the link before clearing that fail_loc.
# The final check requires that the link content is NOT "dummy".
2984 echo "The objectA has multiple hard links, one of them corresponding"
2985 echo "to the name entry_B. But there is something wrong for the name"
2986 echo "entry_B and cause entry_B to references non-exist object_C."
2987 echo "In the first-stage scanning, the LFSCK will think the entry_B"
2988 echo "as dangling, and re-create the lost object_C. And then others"
2989 echo "modified the re-created object_C. When the LFSCK comes to the"
2990 echo "second-stage scanning, it will find that the former re-creating"
2991 echo "object_C maybe wrong and try to replace the object_C with the"
2992 echo "real object_A. But because object_C has been modified, so the"
2993 echo "LFSCK cannot replace it."
# Fixture: d0 with f0 ("dummy") and f1 ("dead").
2996 check_mount_and_prep
2998 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
2999 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3000 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3002 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3003 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
# The fail_loc is active only while the hard link "foo" is created.
3004 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
3005 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3006 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3008 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3010 echo "'ls' should fail because of dangling name entry"
# Pre-condition: listing the link must fail before the LFSCK runs.
3011 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3012 error "(6) ls should fail."
3014 #define OBD_FAIL_LFSCK_DELAY3 0x1602
# This fail_loc/fail_val pair stays set until after the write below; going
# by the macro name it presumably delays the scan -- confirm in the LFSCK
# source.
3015 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3017 echo "Trigger namespace LFSCK to find out dangling name entry"
3018 $START_NAMESPACE -r -C ||
3019 error "(7) Fail to start LFSCK for namespace"
# Wait on the client until stat reports Size 0 for the link.
3021 wait_update_facet client "stat $DIR/$tdir/d0/foo |
3022 awk '/Size/ { print \\\$2 }'" "0" 32 || {
# NOTE(review): no visible line of this test creates $DIR/$tdir/guard; this
# stat looks like diagnostic output only -- confirm the intended path.
3023 stat $DIR/$tdir/guard
3025 error "(8) unexpected size"
# Modify the link while fail_loc 0x1602 is still set, then drop the osc
# locks.
3028 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
3029 cancel_lru_locks osc
# Clear the fail_loc and wait for the status field to read "completed".
3031 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3032 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3033 mdd.${MDT_DEV}.lfsck_namespace |
3034 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3036 error "(10) unexpected status"
# Exactly one dangling name entry repair is expected.
3039 local repaired=$($SHOW_NAMESPACE |
3040 awk '/^dangling_repaired/ { print $2 }')
3041 [ $repaired -eq 1 ] ||
3042 error "(11) Fail to repair dangling name entry: $repaired"
# Unlike 23b, the content must differ from f0's "dummy".
3044 local data=$(cat $DIR/$tdir/d0/foo)
3045 [ "$data" != "dummy" ] ||
3046 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
3048 run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24: a directory is created and removed while fail_loc 0x1622 is set
# on $SINGLEMDS; namespace LFSCK ("-A -r") must then report one
# multiple_referenced_repaired and an entry named "$cfid-$pfid-D-0" must be
# listable under .lustre/lost+found/MDT0000/.
# Skipped when fewer than two MDSes are configured.
3051 [ $MDSCOUNT -lt 2 ] &&
3052 skip "We need at least 2 MDSes for this test" && return
3055 echo "Two MDT-objects back reference the same name entry via their"
3056 echo "each own linkEA entry, but the name entry only references one"
3057 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
3058 echo "for the MDT-object that is not recognized. If such MDT-object"
3059 echo "has no other linkEA entry after the removing, then the LFSCK"
3060 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
# Fixture: d0 created with "$LFS mkdir -i 1", holding sub-directories guard
# and dummy; their FIDs are printed to the log.
3063 check_mount_and_prep
3065 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3067 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
3068 $LFS path2fid $DIR/$tdir/d0/guard
3070 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
3071 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid records a parent FID chosen with regard to the backend fstype; it is
# used further down to build the expected orphan name.
3074 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
3075 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
3077 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
3080 touch $DIR/$tdir/d0/guard/foo ||
3081 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
3083 echo "Inject failure stub on MDT0 to simulate the case that"
3084 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
3085 echo "that references $DIR/$tdir/d0/guard/foo."
3086 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
3087 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
3088 echo "there with the same linkEA entry as another MDT-object"
3089 echo "$DIR/$tdir/d0/guard/foo has"
3091 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
# With the fail_loc set: create dummy/foo, remember its FID in cfid, and
# remove it again; the fail_loc is cleared afterwards.
3092 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
3093 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
3094 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
3095 $LFS path2fid $DIR/$tdir/d0/dummy/foo
3096 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
3097 rmdir $DIR/$tdir/d0/dummy/foo ||
3098 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
3099 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3101 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
# Pre-condition: the removed name must not be resolvable.
3102 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
3103 error "(6) stat successfully unexpectedly"
3105 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
3106 $START_NAMESPACE -A -r ||
3107 error "(7) Fail to start LFSCK for namespace"
# Compare the "status" field of lfsck_namespace with "completed"; the
# trailing 32 is presumably the wait limit -- confirm in wait_update_facet.
3109 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3110 mdd.${MDT_DEV}.lfsck_namespace |
3111 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3113 error "(8) unexpected status"
# Exactly one multiple-referenced repair is expected.
3116 local repaired=$($SHOW_NAMESPACE |
3117 awk '/^multiple_referenced_repaired/ { print $2 }')
3118 [ $repaired -eq 1 ] ||
3119 error "(9) Fail to repair multiple referenced name entry: $repaired"
3121 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3122 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3123 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The expected orphan name is built from the child FID and the parent FID
# captured above.
3125 local cname="$cfid-$pfid-D-0"
3126 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
3127 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
3129 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25: a file is created while fail_loc 0x1623 is set on $SINGLEMDS;
# namespace LFSCK ("-r") must then report one bad_file_type_repaired and
# the directory must be listable.
# Skipped unless the $SINGLEMDS backend is ldiskfs.
3132 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3133 skip "Only support to inject failure on ldiskfs" && return
3136 echo "The file type in the name entry does not match the file type"
3137 echo "claimed by the referenced object. Then the LFSCK will update"
3138 echo "the file type in the name entry."
3141 check_mount_and_prep
3143 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3145 echo "Inject failure stub on MDT0 to simulate the case that"
3146 echo "the file type stored in the name entry is wrong."
3148 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
# The fail_loc is active only while "foo" is created.
3149 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
3150 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
3151 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3153 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
3154 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Compare the "status" field of lfsck_namespace with "completed"; the
# trailing 32 is presumably the wait limit -- confirm in wait_update_facet.
3156 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3157 mdd.${MDT_DEV}.lfsck_namespace |
3158 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3160 error "(4) unexpected status"
# Exactly one bad-file-type repair is expected.
3163 local repaired=$($SHOW_NAMESPACE |
3164 awk '/^bad_file_type_repaired/ { print $2 }')
3165 [ $repaired -eq 1 ] ||
3166 error "(5) Fail to repair bad file type in name entry: $repaired"
# Final check: the directory can be listed.
3168 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
3170 run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a: foo is unlinked while fail_loc 0x1624 is set on $SINGLEMDS;
# namespace LFSCK ("-r -A") must then report one lost_dirent_repaired, foo
# must be listable again, and its FID must equal the FID recorded before
# the injection.
3174 echo "The local name entry back referenced by the MDT-object is lost."
3175 echo "The namespace LFSCK will add the missing local name entry back"
3176 echo "to the normal namespace."
# Fixture: d0 with file foo (FID saved in foofid) and a hard link "dummy".
3179 check_mount_and_prep
3181 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3182 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3183 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3185 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3186 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3188 echo "Inject failure stub on MDT0 to simulate the case that"
3189 echo "foo's name entry will be removed, but the foo's object"
3190 echo "and its linkEA are kept in the system."
3192 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is active only while foo is unlinked.
3193 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3194 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3195 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Pre-condition: the name must be gone; a successful ls fails the test
# through error().
3197 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 && error "(5) 'ls' should fail"
3199 echo "Trigger namespace LFSCK to repair the missing local name entry"
3200 $START_NAMESPACE -r -A ||
3201 error "(6) Fail to start LFSCK for namespace"
# Compare the "status" field of lfsck_namespace with "completed"; the
# trailing 32 is presumably the wait limit -- confirm in wait_update_facet.
3203 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3204 mdd.${MDT_DEV}.lfsck_namespace |
3205 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3207 error "(7) unexpected status"
# Exactly one lost-dirent repair is expected.
3210 local repaired=$($SHOW_NAMESPACE |
3211 awk '/^lost_dirent_repaired/ { print $2 }')
3212 [ $repaired -eq 1 ] ||
3213 error "(8) Fail to repair lost dirent: $repaired"
3215 ls -ail $DIR/$tdir/d0/foo ||
3216 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must resolve to the FID recorded before the injection.
3218 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3219 [ "$foofid" == "$foofid2" ] ||
3220 error "(10) foo's FID changed: $foofid, $foofid2"
3222 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b: directory foo is removed while fail_loc 0x1624 is set on
# $SINGLEMDS; namespace LFSCK ("-r -A") must then report one
# lost_dirent_repaired, foo must be listable again, and its FID must equal
# the FID recorded before the injection.
# Skipped when fewer than two MDSes are configured.
3225 [ $MDSCOUNT -lt 2 ] &&
3226 skip "We need at least 2 MDSes for this test" && return
3229 echo "The remote name entry back referenced by the MDT-object is lost."
3230 echo "The namespace LFSCK will add the missing remote name entry back"
3231 echo "to the normal namespace."
# Fixture: d0 created with "-i 1" and its child foo with "-i 0"; foo's FID
# is saved in foofid.
3234 check_mount_and_prep
3236 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3237 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
3238 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3240 echo "Inject failure stub on MDT0 to simulate the case that"
3241 echo "foo's name entry will be removed, but the foo's object"
3242 echo "and its linkEA are kept in the system."
3244 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is active only while foo is removed.
3245 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3246 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3247 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Pre-condition: the name must be gone; a successful ls fails the test
# through error().
3249 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 && error "(4) 'ls' should fail"
3251 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3252 $START_NAMESPACE -r -A ||
3253 error "(5) Fail to start LFSCK for namespace"
# Compare the "status" field of lfsck_namespace with "completed"; the
# trailing 32 is presumably the wait limit -- confirm in wait_update_facet.
3255 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3256 mdd.${MDT_DEV}.lfsck_namespace |
3257 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3259 error "(6) unexpected status"
# Exactly one lost-dirent repair is expected.
3262 local repaired=$($SHOW_NAMESPACE |
3263 awk '/^lost_dirent_repaired/ { print $2 }')
3264 [ $repaired -eq 1 ] ||
3265 error "(7) Fail to repair lost dirent: $repaired"
3267 ls -ail $DIR/$tdir/d0/foo ||
3268 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must resolve to the FID recorded before the injection.
3270 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3271 [ "$foofid" == "$foofid2" ] ||
3272 error "(9) foo's FID changed: $foofid, $foofid2"
3274 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a: foo and its hard link are unlinked while fail_loc 0x1624 is set
# on $SINGLEMDS, then d0 itself is removed; namespace LFSCK ("-r -A") must
# report one lost_dirent_repaired and an entry matching "*-P-*" must exist
# under .lustre/lost+found/MDT0000/.
3278 echo "The local parent referenced by the MDT-object linkEA is lost."
3279 echo "The namespace LFSCK will re-create the lost parent as orphan."
# Fixture: d0 with file foo and a hard link "dummy".
3282 check_mount_and_prep
3284 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3285 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3286 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3287 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3289 echo "Inject failure stub on MDT0 to simulate the case that"
3290 echo "foo's name entry will be removed, but the foo's object"
3291 echo "and its linkEA are kept in the system. And then remove"
3292 echo "another hard link and the parent directory."
3294 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# Both names are unlinked while the fail_loc is set.
3295 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3296 rm -f $DIR/$tdir/d0/foo ||
3297 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3298 rm -f $DIR/$tdir/d0/dummy ||
3299 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
3300 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The parent directory is removed after the fail_loc has been cleared.
3302 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
# Pre-condition: d0 must be gone; a successful ls fails the test through
# error().
3303 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
3305 echo "Trigger namespace LFSCK to repair the lost parent"
3306 $START_NAMESPACE -r -A ||
3307 error "(6) Fail to start LFSCK for namespace"
# Compare the "status" field of lfsck_namespace with "completed"; the
# trailing 32 is presumably the wait limit -- confirm in wait_update_facet.
3309 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3310 mdd.${MDT_DEV}.lfsck_namespace |
3311 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3313 error "(7) unexpected status"
# Exactly one lost-dirent repair is expected.
3316 local repaired=$($SHOW_NAMESPACE |
3317 awk '/^lost_dirent_repaired/ { print $2 }')
3318 [ $repaired -eq 1 ] ||
3319 error "(8) Fail to repair lost dirent: $repaired"
3321 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3322 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3323 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3325 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so find receives it literally instead of the shell
# expanding it against the current directory first.
3327 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-P-*")
3328 [ ! -z "$cname" ] ||
3329 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
3331 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b: directory foo is removed while fail_loc 0x1624 is set on
# $SINGLEMDS, then its parent d0 is removed; namespace LFSCK ("-r -A") must
# report one lost_dirent_repaired and an entry matching "*-P-*" must exist
# under .lustre/lost+found/MDT0001/.
# Skipped when fewer than two MDSes are configured.
3334 [ $MDSCOUNT -lt 2 ] &&
3335 skip "We need at least 2 MDSes for this test" && return
3338 echo "The remote parent referenced by the MDT-object linkEA is lost."
3339 echo "The namespace LFSCK will re-create the lost parent as orphan."
# Fixture: d0 created with "-i 1" and its child foo with "-i 0"; d0's FID
# is printed to the log.
3342 check_mount_and_prep
3344 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3345 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
3347 $LFS path2fid $DIR/$tdir/d0
3349 echo "Inject failure stub on MDT0 to simulate the case that"
3350 echo "foo's name entry will be removed, but the foo's object"
3351 echo "and its linkEA are kept in the system. And then remove"
3352 echo "the parent directory."
3354 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is active only while foo is removed.
3355 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3356 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3357 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3359 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
# Pre-condition: d0 must be gone; a successful ls fails the test through
# error().
3360 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
3362 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3363 $START_NAMESPACE -r -A ||
3364 error "(6) Fail to start LFSCK for namespace"
# Compare the "status" field of lfsck_namespace with "completed"; the
# trailing 32 is presumably the wait limit -- confirm in wait_update_facet.
3366 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3367 mdd.${MDT_DEV}.lfsck_namespace |
3368 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3370 error "(7) unexpected status"
# Exactly one lost-dirent repair is expected.
3373 local repaired=$($SHOW_NAMESPACE |
3374 awk '/^lost_dirent_repaired/ { print $2 }')
3375 [ $repaired -eq 1 ] ||
3376 error "(8) Fail to repair lost dirent: $repaired"
3378 ls -ail $MOUNT/.lustre/lost+found/
3380 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
3381 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
3382 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
3384 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The pattern is quoted so find receives it literally instead of the shell
# expanding it against the current directory first.
3386 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-P-*")
3387 [ ! -z "$cname" ] ||
3388 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
3390 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28: two cross-MDT directories are removed while fail_loc 0x1624 is
# set on mds1 and mds2. A first namespace LFSCK runs with fail_loc 0x161c
# set on mds2 and must end "partial" on mds1 / "completed" on mds2 with
# lost_dirent_repaired 0 / 1; a second run without the fail_loc must end
# "completed" everywhere with lost_dirent_repaired 1 / 0.
# Skipped when fewer than two MDSes are configured.
3393 [ $MDSCOUNT -lt 2 ] &&
3394 skip "The test needs at least 2 MDTs" && return
3397 echo "The target name entry is lost. The LFSCK should insert the"
3398 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
3399 echo "the MDT (on which the orphan MDT-object resides) has ever"
3400 echo "failed to respond some name entry verification during the"
3401 echo "first stage-scanning, then the LFSCK should skip to handle"
3402 echo "orphan MDT-object on this MDT. But other MDTs should not"
# Fixture: d1 ("-i 0") with children a1, a2 ("-i 1"), and d2 ("-i 1") with
# children a1, a2 ("-i 0"). These mkdir calls are not checked for failure.
3406 check_mount_and_prep
3407 $LFS mkdir -i 0 $DIR/$tdir/d1
3408 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
3409 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
3411 $LFS mkdir -i 1 $DIR/$tdir/d2
3412 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
3413 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
3415 echo "Inject failure stub on MDT0 to simulate the case that"
3416 echo "d1/a1's name entry will be removed, but the d1/a1's object"
3417 echo "and its linkEA are kept in the system. And the case that"
3418 echo "d2/a2's name entry will be removed, but the d2/a2's object"
3419 echo "and its linkEA are kept in the system."
3421 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is set on both MDSes while d1/a1 and d2/a2 are removed, then
# cleared on both.
3422 do_facet mds1 $LCTL set_param fail_loc=0x1624
3423 do_facet mds2 $LCTL set_param fail_loc=0x1624
3424 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
3425 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
3426 do_facet mds1 $LCTL set_param fail_loc=0
3427 do_facet mds2 $LCTL set_param fail_loc=0
3429 cancel_lru_locks mdc
3430 cancel_lru_locks osc
3432 echo "Inject failure, to simulate the MDT0 fail to handle"
3433 echo "MDT1 LFSCK request during the first-stage scanning."
3434 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
# This fail_loc is set on mds2 and stays set for the whole first LFSCK run.
3435 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
3437 echo "Trigger namespace LFSCK on all devices to find out orphan object"
3438 $START_NAMESPACE -r -A ||
3439 error "(3) Fail to start LFSCK for namespace"
# First run: mds1 is expected to report "partial" ...
3441 wait_update_facet mds1 "$LCTL get_param -n \
3442 mdd.$(facet_svc mds1).lfsck_namespace |
3443 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
3444 error "(4) mds1 is not the expected 'partial'"
# ... while mds2 is expected to report "completed".
3447 wait_update_facet mds2 "$LCTL get_param -n \
3448 mdd.$(facet_svc mds2).lfsck_namespace |
3449 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3450 error "(5) mds2 is not the expected 'completed'"
3453 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# After the first run: no lost-dirent repair on mds1, one on mds2.
3455 local repaired=$(do_facet mds1 $LCTL get_param -n \
3456 mdd.$(facet_svc mds1).lfsck_namespace |
3457 awk '/^lost_dirent_repaired/ { print $2 }')
3458 [ $repaired -eq 0 ] ||
3459 error "(6) Expect 0 fixed on mds1, but got: $repaired"
3461 repaired=$(do_facet mds2 $LCTL get_param -n \
3462 mdd.$(facet_svc mds2).lfsck_namespace |
3463 awk '/^lost_dirent_repaired/ { print $2 }')
3464 [ $repaired -eq 1 ] ||
3465 error "(7) Expect 1 fixed on mds2, but got: $repaired"
3467 echo "Trigger namespace LFSCK on all devices again to cleanup"
3468 $START_NAMESPACE -r -A ||
3469 error "(8) Fail to start LFSCK for namespace"
# Second run: every MDS from 1 to $MDSCOUNT must report "completed".
3471 for k in $(seq $MDSCOUNT); do
3472 # The LFSCK status query internal is 30 seconds. For the case
3473 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3474 # time to guarantee the status sync up.
3475 wait_update_facet mds${k} "$LCTL get_param -n \
3476 mdd.$(facet_svc mds${k}).lfsck_namespace |
3477 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3478 error "(9) MDS${k} is not the expected 'completed'"
# After the second run the counters are the other way round: one repair on
# mds1, none on mds2.
3481 local repaired=$(do_facet mds1 $LCTL get_param -n \
3482 mdd.$(facet_svc mds1).lfsck_namespace |
3483 awk '/^lost_dirent_repaired/ { print $2 }')
3484 [ $repaired -eq 1 ] ||
3485 error "(10) Expect 1 fixed on mds1, but got: $repaired"
3487 repaired=$(do_facet mds2 $LCTL get_param -n \
3488 mdd.$(facet_svc mds2).lfsck_namespace |
3489 awk '/^lost_dirent_repaired/ { print $2 }')
3490 [ $repaired -eq 0 ] ||
3491 error "(11) Expect 0 fixed on mds2, but got: $repaired"
3493 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a: a hard link is created while fail_loc 0x1625 is set on
# $SINGLEMDS; the test requires stat to show 3 links afterwards, runs
# namespace LFSCK ("-r -A"), and then requires one nlinks_repaired and a
# link count of 2.
3497 echo "The object's nlink attribute is larger than the object's known"
3498 echo "name entries count. The LFSCK will repair the object's nlink"
3499 echo "attribute to match the known name entries count"
3502 check_mount_and_prep
3504 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3505 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3507 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3508 echo "nlink attribute is larger than its name entries count."
3510 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
# The fail_loc is active only while the hard link h1 is created.
3511 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
3512 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3513 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3514 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Sanity check of the injection: with two names (foo, h1) the reported link
# count must be 3.
3516 cancel_lru_locks mdc
3517 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3518 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
3520 echo "Trigger namespace LFSCK to repair the nlink count"
3521 $START_NAMESPACE -r -A ||
3522 error "(5) Fail to start LFSCK for namespace"
# Compare the "status" field of lfsck_namespace with "completed"; the
# trailing 32 is presumably the wait limit -- confirm in wait_update_facet.
3524 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3525 mdd.${MDT_DEV}.lfsck_namespace |
3526 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3528 error "(6) unexpected status"
# Exactly one nlink repair is expected.
3531 local repaired=$($SHOW_NAMESPACE |
3532 awk '/^nlinks_repaired/ { print $2 }')
3533 [ $repaired -eq 1 ] ||
3534 error "(7) Fail to repair nlink count: $repaired"
# After the repair the link count must match the two existing names.
3536 cancel_lru_locks mdc
3537 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3538 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3540 run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b: a hard link is created while fail_loc 0x1626 is set on
# $SINGLEMDS; the test requires stat to show 1 link afterwards, runs
# namespace LFSCK ("-r -A"), and then requires one nlinks_repaired and a
# link count of 2.
3544 echo "The object's nlink attribute is smaller than the object's known"
3545 echo "name entries count. The LFSCK will repair the object's nlink"
3546 echo "attribute to match the known name entries count"
3549 check_mount_and_prep
3551 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3552 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3554 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3555 echo "nlink attribute is smaller than its name entries count."
3557 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
# The fail_loc is active only while the hard link h1 is created.
3558 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
3559 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3560 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3561 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Sanity check of the injection: with two names (foo, h1) the reported link
# count must be 1.
3563 cancel_lru_locks mdc
3564 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3565 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
3567 echo "Trigger namespace LFSCK to repair the nlink count"
3568 $START_NAMESPACE -r -A ||
3569 error "(5) Fail to start LFSCK for namespace"
# Compare the "status" field of lfsck_namespace with "completed"; the
# trailing 32 is presumably the wait limit -- confirm in wait_update_facet.
3571 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3572 mdd.${MDT_DEV}.lfsck_namespace |
3573 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3575 error "(6) unexpected status"
# Exactly one nlink repair is expected.
3578 local repaired=$($SHOW_NAMESPACE |
3579 awk '/^nlinks_repaired/ { print $2 }')
3580 [ $repaired -eq 1 ] ||
3581 error "(7) Fail to repair nlink count: $repaired"
# After the repair the link count must match the two existing names.
3583 cancel_lru_locks mdc
3584 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3585 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3587 run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c: a second hard link is created while fail_loc 0x1627 is set on
# $SINGLEMDS, so that stat shows 3 links while fid2path lists only 2 paths.
# Namespace LFSCK ("-r -A") must then report zero nlinks_repaired and leave
# both numbers unchanged.
3591 echo "There are too much hard links to the object, and exceeds the"
3592 echo "object's linkEA limitation, as to NOT all the known name entries"
3593 echo "will be recorded in the linkEA. Under such case, LFSCK should"
3594 echo "skip the nlink verification for this object."
# Fixture: d0 with file foo and a first hard link h1, created before the
# fail_loc is set.
3597 check_mount_and_prep
3599 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3600 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3601 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3602 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3604 echo "Inject failure stub on MDT0 to simulate the case that"
3605 echo "foo's hard links exceed the object's linkEA limitation."
3607 #define OBD_FAIL_LFSCK_LINKEA_OVERFLOW 0x1627
# The fail_loc is set before h2 is created and is cleared only after the
# LFSCK run below has completed.
3608 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1627
3609 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h2 ||
3610 error "(4) Fail to hard link to $DIR/$tdir/d0/foo"
3612 cancel_lru_locks mdc
# Three names exist (foo, h1, h2), so stat must report 3 links.
3614 local count1=$(stat --format=%h $DIR/$tdir/d0/foo)
3615 [ $count1 -eq 3 ] || error "(5) Stat failure: $count1"
# fid2path must list only two paths; otherwise the injection did not take
# effect and the test fails through error().
3617 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3618 $LFS fid2path $DIR $foofid
3619 local count2=$($LFS fid2path $DIR $foofid | wc -l)
3620 [ $count2 -eq 2 ] || error "(6) Fail to inject error: $count2"
3622 echo "Trigger namespace LFSCK to repair the nlink count"
3623 $START_NAMESPACE -r -A ||
3624 error "(7) Fail to start LFSCK for namespace"
# Compare the "status" field of lfsck_namespace with "completed"; the
# trailing 32 is presumably the wait limit -- confirm in wait_update_facet.
3626 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3627 mdd.${MDT_DEV}.lfsck_namespace |
3628 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3630 error "(8) unexpected status"
3633 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# No nlink repair is expected for this object.
3634 local repaired=$($SHOW_NAMESPACE |
3635 awk '/^nlinks_repaired/ { print $2 }')
3636 [ $repaired -eq 0 ] ||
3637 error "(9) Repair nlink count unexpcetedly: $repaired"
3639 cancel_lru_locks mdc
# Both the link count and the fid2path result must be unchanged.
3641 count1=$(stat --format=%h $DIR/$tdir/d0/foo)
3642 [ $count1 -eq 3 ] || error "(10) Stat failure: $count1"
3644 count2=$($LFS fid2path $DIR $foofid | wc -l)
3645 [ $count2 -eq 2 ] ||
3646 error "(11) Repaired something unexpectedly: $count2"
3648 run_test 29c "Not verify nlink attr if hark links exceed linkEA limitation"
# Test 30: objects are created and removed under fail_loc 0x161d / 0x1624
# on $SINGLEMDS, the client is unmounted, the MDT is stopped, e2fsck is run
# on $MDT_DEVNAME and the MDT is restarted. Namespace LFSCK ("-r -A") must
# then report at least 4 local_lost_found_moved; f0 must be reachable again
# and a "*-*-D-*" entry containing d1 and f1 must exist under
# .lustre/lost+found/MDT0000/.
# Skipped unless the $SINGLEMDS backend is ldiskfs.
3651 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3652 skip "Only support backend /lost+found for ldiskfs" && return
3655 echo "The namespace LFSCK will move the orphans from backend"
3656 echo "/lost+found directory to normal client visible namespace"
3657 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
3660 check_mount_and_prep
3662 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
3663 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
3665 echo "Inject failure stub on MDT0 to simulate the case that"
3666 echo "directory d0 has no linkEA entry, then the LFSCK will"
3667 echo "move it into .lustre/lost+found/MDTxxxx/ later."
3669 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
# fail_loc 0x161d is active only while directory d0 is created.
3670 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
3671 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
3672 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3674 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
3675 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
3677 echo "Inject failure stub on MDT0 to simulate the case that the"
3678 echo "object's name entry will be removed, but not destroy the"
3679 echo "object. Then backend e2fsck will handle it as orphan and"
3680 echo "add them into the backend /lost+found directory."
3682 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# All four names (d1, f1, d0, f0) are removed while fail_loc 0x1624 is set.
3683 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3684 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
3685 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
3686 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
3687 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
3688 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Take the client and the MDT offline, run e2fsck with "-y" on the MDT
# device, then bring the MDT back with $MOUNT_OPTS_NOSCRUB.
3690 umount_client $MOUNT || error "(10) Fail to stop client!"
3692 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
3695 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
3696 error "(12) Fail to run e2fsck"
3698 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
3699 error "(13) Fail to start MDT0"
3701 echo "Trigger namespace LFSCK to recover backend orphans"
3702 $START_NAMESPACE -r -A ||
3703 error "(14) Fail to start LFSCK for namespace"
# Compare the "status" field of lfsck_namespace with "completed"; the
# trailing 32 is presumably the wait limit -- confirm in wait_update_facet.
3705 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3706 mdd.${MDT_DEV}.lfsck_namespace |
3707 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3709 error "(15) unexpected status"
# At least four objects must have been moved out of the backend lost+found.
3712 local repaired=$($SHOW_NAMESPACE |
3713 awk '/^local_lost_found_moved/ { print $2 }')
3714 [ $repaired -ge 4 ] ||
3715 error "(16) Fail to recover backend orphans: $repaired"
3717 mount_client $MOUNT || error "(17) Fail to start client!"
# f0 must be reachable under its original path; a failing stat fails the
# test through error().
3719 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
3721 ls -ail $MOUNT/.lustre/lost+found/
3723 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
3724 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3725 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3727 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so find receives it literally instead of the shell
# expanding it against the current directory first.
3729 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-*-D-*")
3730 [ ! -z "$cname" ] || error "(20) d0 is not recovered"
# The recovered directory must still contain d1 and f1.
3732 stat ${cname}/d1 || error "(21) d0 is not recovered"
3733 stat ${cname}/f1 || error "(22) f1 is not recovered"
3735 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# Test 31a: $MDSCOUNT sub-directories are created in a striped directory
# while fail_loc 0x1628 / fail_val 0 is set on the client; namespace LFSCK
# ("-r -A") must report at least one name_hash_repaired on $SINGLEMDS, and
# after a client remount every sub-directory must be stat-able and
# removable.
# Skipped when fewer than two MDSes are configured.
3738 [ $MDSCOUNT -lt 2 ] &&
3739 skip "The test needs at least 2 MDTs" && return
3742 echo "For the name entry under a striped directory, if the name"
3743 echo "hash does not match the shard, then the LFSCK will repair"
3744 echo "the bad name entry"
3747 check_mount_and_prep
3749 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
3750 error "(1) Fail to create striped directory"
3752 echo "Inject failure stub on client to simulate the case that"
3753 echo "some name entry should be inserted into other non-first"
3754 echo "shard, but inserted into the first shard by wrong"
3756 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
# set_param is run directly (not through do_facet), i.e. on the node
# running this script; the fail_loc covers only the createmany call.
3757 $LCTL set_param fail_loc=0x1628 fail_val=0
3758 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
3759 error "(2) Fail to create file under striped directory"
3760 $LCTL set_param fail_loc=0 fail_val=0
3762 echo "Trigger namespace LFSCK to repair bad name hash"
3763 $START_NAMESPACE -r -A ||
3764 error "(3) Fail to start LFSCK for namespace"
# Compare the "status" field of lfsck_namespace with "completed"; the
# trailing 32 is presumably the wait limit -- confirm in wait_update_facet.
3766 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3767 mdd.${MDT_DEV}.lfsck_namespace |
3768 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3770 error "(4) unexpected status"
# At least one name-hash repair is expected.
3773 local repaired=$($SHOW_NAMESPACE |
3774 awk '/^name_hash_repaired/ { print $2 }')
3775 [ $repaired -ge 1 ] ||
3776 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before checking the entries.
3778 umount_client $MOUNT || error "(6) umount failed"
3779 mount_client $MOUNT || error "(7) mount failed"
# Every d0 .. d($MDSCOUNT-1) must be stat-able and removable.
3781 for ((i = 0; i < $MDSCOUNT; i++)); do
3782 stat $DIR/$tdir/striped_dir/d$i ||
3783 error "(8) Fail to stat d$i after LFSCK"
3784 rmdir $DIR/$tdir/striped_dir/d$i ||
3785 error "(9) Fail to unlink d$i after LFSCK"
3788 rmdir $DIR/$tdir/striped_dir ||
3789 error "(10) Fail to remove the striped directory after LFSCK"
3791 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# Test 31b: same flow as 31a, but the injection uses fail_val=1 and both
# the status wait and the name_hash_repaired counter are read from mds2.
# Skipped when fewer than two MDSes are configured.
3794 [ $MDSCOUNT -lt 2 ] &&
3795 skip "The test needs at least 2 MDTs" && return
3798 echo "For the name entry under a striped directory, if the name"
3799 echo "hash does not match the shard, then the LFSCK will repair"
3800 echo "the bad name entry"
3803 check_mount_and_prep
3805 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
3806 error "(1) Fail to create striped directory"
3808 echo "Inject failure stub on client to simulate the case that"
3809 echo "some name entry should be inserted into other non-second"
3810 echo "shard, but inserted into the secod shard by wrong"
3812 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
# set_param is run directly (not through do_facet), i.e. on the node
# running this script; the fail_loc covers only the createmany call.
3813 $LCTL set_param fail_loc=0x1628 fail_val=1
3814 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
3815 error "(2) Fail to create file under striped directory"
3816 $LCTL set_param fail_loc=0 fail_val=0
3818 echo "Trigger namespace LFSCK to repair bad name hash"
3819 $START_NAMESPACE -r -A ||
3820 error "(3) Fail to start LFSCK for namespace"
# Compare the "status" field reported by mds2 with "completed"; the
# trailing 32 is presumably the wait limit -- confirm in wait_update_facet.
3822 wait_update_facet mds2 "$LCTL get_param -n \
3823 mdd.$(facet_svc mds2).lfsck_namespace |
3824 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3825 error "(4) unexpected status"
# At least one name-hash repair is expected on mds2.
3827 local repaired=$(do_facet mds2 $LCTL get_param -n \
3828 mdd.$(facet_svc mds2).lfsck_namespace |
3829 awk '/^name_hash_repaired/ { print $2 }')
3830 [ $repaired -ge 1 ] ||
3831 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before checking the entries.
3833 umount_client $MOUNT || error "(6) umount failed"
3834 mount_client $MOUNT || error "(7) mount failed"
# Every d0 .. d($MDSCOUNT-1) must be stat-able and removable.
3836 for ((i = 0; i < $MDSCOUNT; i++)); do
3837 stat $DIR/$tdir/striped_dir/d$i ||
3838 error "(8) Fail to stat d$i after LFSCK"
3839 rmdir $DIR/$tdir/striped_dir/d$i ||
3840 error "(9) Fail to unlink d$i after LFSCK"
3843 rmdir $DIR/$tdir/striped_dir ||
3844 error "(10) Fail to remove the striped directory after LFSCK"
3846 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Sub-test 31c: a striped directory is created while fail_loc 0x1629 is
# armed on $SINGLEMDS (per the messages below: the master MDT-object loses
# its LMV EA). Nothing is created under the directory afterwards, and the
# namespace LFSCK is expected to report exactly one striped_dirs_repaired.
#
# Skipped when fewer than two MDTs are configured.
3849 [ $MDSCOUNT -lt 2 ] &&
3850 skip "The test needs at least 2 MDTs" && return
3853 echo "For some reason, the master MDT-object of the striped directory"
3854 echo "may lost its master LMV EA. If nobody created files under the"
3855 echo "master directly after the master LMV EA lost, then the LFSCK"
3856 echo "should re-generate the master LMV EA."
3859 check_mount_and_prep
3861 echo "Inject failure stub on MDT0 to simulate the case that the"
3862 echo "master MDT-object of the striped directory lost the LMV EA."
# The fail_loc is armed on $SINGLEMDS only for the duration of the
# setdirstripe call, then cleared again.
3864 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
3865 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
3866 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
3867 error "(1) Fail to create striped directory"
3868 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3870 echo "Trigger namespace LFSCK to re-generate master LMV EA"
# $START_NAMESPACE runs "lfsck_start -M ${MDT_DEV} -t namespace" on
# $SINGLEMDS; -r -A are appended to it here.
3871 $START_NAMESPACE -r -A ||
3872 error "(2) Fail to start LFSCK for namespace"
# Poll lfsck_namespace of ${MDT_DEV} on $SINGLEMDS until the status line
# reads "completed"; anything else ends in the failure branch.
# NOTE(review): 32 looks like the wait limit in seconds - confirm against
# wait_update_facet.
3874 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3875 mdd.${MDT_DEV}.lfsck_namespace |
3876 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3878 error "(3) unexpected status"
# Exactly one striped directory must have been repaired.
3881 local repaired=$($SHOW_NAMESPACE |
3882 awk '/^striped_dirs_repaired/ { print $2 }')
3883 [ $repaired -eq 1 ] ||
3884 error "(4) Fail to re-generate master LMV EA: $repaired"
# Remount the client before inspecting the directory.
3886 umount_client $MOUNT || error "(5) umount failed"
3887 mount_client $MOUNT || error "(6) mount failed"
# Any name listed by ls here is treated as proof that the master LMV EA
# was not repaired.
3889 local empty=$(ls $DIR/$tdir/striped_dir/)
3890 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
3892 rmdir $DIR/$tdir/striped_dir ||
3893 error "(8) Fail to remove the striped directory after LFSCK"
3895 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Sub-test 31d: same fault injection as 31c (fail_loc 0x1629 on $SINGLEMDS
# while the striped directory is created), but a file is created under the
# directory before the LFSCK runs. The LFSCK is then expected NOT to count
# a repair (striped_dirs_repaired == 0) and the directory must refuse new
# files until "chattr -i" is applied to it.
#
# Skipped when fewer than two MDTs are configured.
3898 [ $MDSCOUNT -lt 2 ] &&
3899 skip "The test needs at least 2 MDTs" && return
3902 echo "For some reason, the master MDT-object of the striped directory"
3903 echo "may lost its master LMV EA. If somebody created files under the"
3904 echo "master directly after the master LMV EA lost, then the LFSCK"
3905 echo "should NOT re-generate the master LMV EA, instead, it should"
3906 echo "change the broken striped dirctory as read-only to prevent"
3907 echo "further damage"
3910 check_mount_and_prep
3912 echo "Inject failure stub on MDT0 to simulate the case that the"
3913 echo "master MDT-object of the striped directory lost the LMV EA."
# The fail_loc is armed on $SINGLEMDS only for the duration of the
# setdirstripe call, then cleared again.
3915 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
3916 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
3917 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
3918 error "(1) Fail to create striped directory"
3919 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# Remount the client, then modify the broken directory; this modification is
# what distinguishes this case from 31c.
3921 umount_client $MOUNT || error "(2) umount failed"
3922 mount_client $MOUNT || error "(3) mount failed"
3924 touch $DIR/$tdir/striped_dir/dummy ||
3925 error "(4) Fail to touch under broken striped directory"
3927 echo "Trigger namespace LFSCK to find out the inconsistency"
# $START_NAMESPACE runs "lfsck_start -M ${MDT_DEV} -t namespace" on
# $SINGLEMDS; -r -A are appended to it here.
3928 $START_NAMESPACE -r -A ||
3929 error "(5) Fail to start LFSCK for namespace"
# Poll lfsck_namespace of ${MDT_DEV} on $SINGLEMDS until the status line
# reads "completed"; anything else ends in the failure branch.
3931 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3932 mdd.${MDT_DEV}.lfsck_namespace |
3933 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3935 error "(6) unexpected status"
# No striped-directory repair may have been counted.
3938 local repaired=$($SHOW_NAMESPACE |
3939 awk '/^striped_dirs_repaired/ { print $2 }')
3940 [ $repaired -eq 0 ] ||
3941 error "(7) Re-generate master LMV EA unexpected: $repaired"
# The file created before the LFSCK must still be reachable ...
3943 stat $DIR/$tdir/striped_dir/dummy ||
3944 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# ... while creating a new one must fail: a successful touch is the error.
3946 touch $DIR/$tdir/striped_dir/foo &&
3947 error "(9) The broken striped directory should be read-only"
# Clear the immutable attribute so the directory can be cleaned up.
3949 chattr -i $DIR/$tdir/striped_dir ||
3950 error "(10) Fail to chattr on the broken striped directory"
# NOTE(review): "dummy" created above is not removed anywhere in the lines
# shown here, yet this rmdir is required to succeed - confirm that is the
# intended behaviour for the broken directory.
3952 rmdir $DIR/$tdir/striped_dir ||
3953 error "(11) Fail to remove the striped directory after LFSCK"
3955 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Sub-test 31e: a striped directory is created while fail_loc 0x162a with
# fail_val=0 is armed on $SINGLEMDS (per the messages below: the slave
# MDT-object on the master's own MDT loses its LMV EA). The namespace LFSCK
# is expected to report exactly one striped_shards_repaired on $SINGLEMDS.
#
# Skipped when fewer than two MDTs are configured.
3958 [ $MDSCOUNT -lt 2 ] &&
3959 skip "The test needs at least 2 MDTs" && return
3962 echo "For some reason, the slave MDT-object of the striped directory"
3963 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
3964 echo "slave LMV EA."
3967 check_mount_and_prep
3969 echo "Inject failure stub on MDT0 to simulate the case that the"
3970 echo "slave MDT-object (that resides on the same MDT as the master"
3971 echo "MDT-object resides on) lost the LMV EA."
# The fail_loc/fail_val pair is armed on $SINGLEMDS only for the duration of
# the setdirstripe call, then both are reset to 0.
3973 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
3974 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
3975 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
3976 error "(1) Fail to create striped directory"
3977 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
3979 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
# $START_NAMESPACE runs "lfsck_start -M ${MDT_DEV} -t namespace" on
# $SINGLEMDS; -r -A are appended to it here.
3980 $START_NAMESPACE -r -A ||
3981 error "(2) Fail to start LFSCK for namespace"
# Poll lfsck_namespace of ${MDT_DEV} on $SINGLEMDS until the status line
# reads "completed"; anything else ends in the failure branch.
3983 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3984 mdd.${MDT_DEV}.lfsck_namespace |
3985 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3987 error "(3) unexpected status"
# Exactly one shard repair must have been counted.
3990 local repaired=$($SHOW_NAMESPACE |
3991 awk '/^striped_shards_repaired/ { print $2 }')
3992 [ $repaired -eq 1 ] ||
3993 error "(4) Fail to re-generate slave LMV EA: $repaired"
3995 rmdir $DIR/$tdir/striped_dir ||
3996 error "(5) Fail to remove the striped directory after LFSCK"
3998 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Sub-test 31f: variant of 31e with fail_val=1 (per the messages below: the
# slave MDT-object on a different MDT than the master loses its LMV EA).
# The fault is still armed on $SINGLEMDS, but LFSCK completion and the
# striped_shards_repaired counter are read from mds2.
#
# Skipped when fewer than two MDTs are configured.
4001 [ $MDSCOUNT -lt 2 ] &&
4002 skip "The test needs at least 2 MDTs" && return
4005 echo "For some reason, the slave MDT-object of the striped directory"
4006 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4007 echo "slave LMV EA."
4010 check_mount_and_prep
4012 echo "Inject failure stub on MDT0 to simulate the case that the"
4013 echo "slave MDT-object (that resides on differnt MDT as the master"
4014 echo "MDT-object resides on) lost the LMV EA."
# The fail_loc/fail_val pair is armed on $SINGLEMDS only for the duration of
# the setdirstripe call, then both are reset to 0.
4016 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4017 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
4018 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4019 error "(1) Fail to create striped directory"
4020 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4022 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
# $START_NAMESPACE runs "lfsck_start -M ${MDT_DEV} -t namespace" on
# $SINGLEMDS; -r -A are appended to it here.
4023 $START_NAMESPACE -r -A ||
4024 error "(2) Fail to start LFSCK for namespace"
# Poll lfsck_namespace on mds2 (device name taken from facet_svc mds2)
# until its status line reads "completed".
# NOTE(review): 32 looks like the wait limit in seconds - confirm against
# wait_update_facet.
4026 wait_update_facet mds2 "$LCTL get_param -n \
4027 mdd.$(facet_svc mds2).lfsck_namespace |
4028 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
4029 error "(3) unexpected status"
# Exactly one shard repair must have been counted on mds2.
4031 local repaired=$(do_facet mds2 $LCTL get_param -n \
4032 mdd.$(facet_svc mds2).lfsck_namespace |
4033 awk '/^striped_shards_repaired/ { print $2 }')
4034 [ $repaired -eq 1 ] ||
4035 error "(4) Fail to re-generate slave LMV EA: $repaired"
4037 rmdir $DIR/$tdir/striped_dir ||
4038 error "(5) Fail to remove the striped directory after LFSCK"
4040 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Sub-test 31g: a striped directory is created while fail_loc 0x162b with
# fail_val=0 is armed on $SINGLEMDS (per the messages below: the first
# shard's slave LMV EA claims the second shard's index). The namespace
# LFSCK is expected to report exactly one striped_shards_repaired, after
# which the directory must accept file create/unlink through a fresh mount.
#
# Skipped when fewer than two MDTs are configured.
4043 [ $MDSCOUNT -lt 2 ] &&
4044 skip "The test needs at least 2 MDTs" && return
4047 echo "For some reason, the stripe index in the slave LMV EA is"
4048 echo "corrupted. The LFSCK should repair the slave LMV EA."
4051 check_mount_and_prep
4053 echo "Inject failure stub on MDT0 to simulate the case that the"
4054 echo "slave LMV EA on the first shard of the striped directory"
4055 echo "claims the same index as the second shard claims"
# The fail_loc/fail_val pair is armed on $SINGLEMDS only for the duration of
# the setdirstripe call, then both are reset to 0.
4057 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
4058 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
4059 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4060 error "(1) Fail to create striped directory"
4061 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4063 echo "Trigger namespace LFSCK to repair the slave LMV EA"
# $START_NAMESPACE runs "lfsck_start -M ${MDT_DEV} -t namespace" on
# $SINGLEMDS; -r -A are appended to it here.
4064 $START_NAMESPACE -r -A ||
4065 error "(2) Fail to start LFSCK for namespace"
# Poll lfsck_namespace of ${MDT_DEV} on $SINGLEMDS until the status line
# reads "completed"; anything else ends in the failure branch.
4067 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4068 mdd.${MDT_DEV}.lfsck_namespace |
4069 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4071 error "(3) unexpected status"
# Exactly one shard repair must have been counted.
4074 local repaired=$($SHOW_NAMESPACE |
4075 awk '/^striped_shards_repaired/ { print $2 }')
4076 [ $repaired -eq 1 ] ||
4077 error "(4) Fail to repair slave LMV EA: $repaired"
# Remount the client, then prove the directory is usable again.
4079 umount_client $MOUNT || error "(5) umount failed"
4080 mount_client $MOUNT || error "(6) mount failed"
4082 touch $DIR/$tdir/striped_dir/foo ||
4083 error "(7) Fail to touch file after the LFSCK"
4085 rm -f $DIR/$tdir/striped_dir/foo ||
4086 error "(8) Fail to unlink file after the LFSCK"
4088 rmdir $DIR/$tdir/striped_dir ||
4089 error "(9) Fail to remove the striped directory after LFSCK"
4091 run_test 31g "Repair the corrupted slave LMV EA"
# Sub-test 31h: a striped directory is created while fail_loc 0x162c with
# fail_val=0 is armed on $SINGLEMDS (per the messages below: the first
# shard's name entry claims the second shard's index). The namespace LFSCK
# is expected to report exactly one dirent_repaired, after which the
# directory must accept file create/unlink through a fresh mount.
#
# Skipped when fewer than two MDTs are configured.
4094 [ $MDSCOUNT -lt 2 ] &&
4095 skip "The test needs at least 2 MDTs" && return
4098 echo "For some reason, the shard's name entry in the striped"
4099 echo "directory may be corrupted. The LFSCK should repair the"
4100 echo "bad shard's name entry."
4103 check_mount_and_prep
4105 echo "Inject failure stub on MDT0 to simulate the case that the"
4106 echo "first shard's name entry in the striped directory claims"
4107 echo "the same index as the second shard's name entry claims."
# The fail_loc/fail_val pair is armed on $SINGLEMDS only for the duration of
# the setdirstripe call, then both are reset to 0.
4109 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
4110 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
4111 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4112 error "(1) Fail to create striped directory"
4113 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4115 echo "Trigger namespace LFSCK to repair the shard's name entry"
# $START_NAMESPACE runs "lfsck_start -M ${MDT_DEV} -t namespace" on
# $SINGLEMDS; -r -A are appended to it here.
4116 $START_NAMESPACE -r -A ||
4117 error "(2) Fail to start LFSCK for namespace"
# Poll lfsck_namespace of ${MDT_DEV} on $SINGLEMDS until the status line
# reads "completed"; anything else ends in the failure branch.
4119 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4120 mdd.${MDT_DEV}.lfsck_namespace |
4121 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4123 error "(3) unexpected status"
# Exactly one directory-entry repair must have been counted.
4126 local repaired=$($SHOW_NAMESPACE |
4127 awk '/^dirent_repaired/ { print $2 }')
4128 [ $repaired -eq 1 ] ||
4129 error "(4) Fail to repair shard's name entry: $repaired"
# Remount the client, then prove the directory is usable again.
4131 umount_client $MOUNT || error "(5) umount failed"
4132 mount_client $MOUNT || error "(6) mount failed"
4134 touch $DIR/$tdir/striped_dir/foo ||
4135 error "(7) Fail to touch file after the LFSCK"
4137 rm -f $DIR/$tdir/striped_dir/foo ||
4138 error "(8) Fail to unlink file after the LFSCK"
4140 rmdir $DIR/$tdir/striped_dir ||
4141 error "(9) Fail to remove the striped directory after LFSCK"
4143 run_test 31h "Repair the corrupted shard's name entry"
# Script teardown. Remove the lfsck flag that was added to the debug mask
# near the top of the script (debug=+lfsck); output is discarded and a
# failure is deliberately ignored (|| true).
4145 $LCTL set_param debug=-lfsck > /dev/null || true
4147 # restore MDS/OST size
# Put back the values saved in SAVED_MDSSIZE / SAVED_OSTSIZE /
# SAVED_OSTCOUNT at the top of the script, before OSTCOUNT was capped at 4
# for the duration of these tests.
4148 MDSSIZE=${SAVED_MDSSIZE}
4149 OSTSIZE=${SAVED_OSTSIZE}
4150 OSTCOUNT=${SAVED_OSTCOUNT}
4152 # cleanup the system at last