3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 # Bug numbers for the tests excluded via ALWAYS_EXCEPT
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
23 require_dsh_mds || exit 0
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
32 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
33 skip "Need MDS version at least 2.3.60" && exit 0
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # Use small MDS and OST sizes to speed up formatting.
41 # Do not make MDSSIZE/OSTSIZE too small: they affect the default journal size.
44 # Cap the number of OSTs to reduce the format/start/stop overhead.
45 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
47 # build up a clean test environment.
51 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
52 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
54 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
55 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
57 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
58 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
60 # DNE does not support striped directories on ZFS-based backends yet.
61 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
62 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Names of the first MDT and first OST target, derived from $FSNAME.
66 MDT_DEV="${FSNAME}-MDT0000"
67 OST_DEV="${FSNAME}-OST0000"
# Device handed to `start $SINGLEMDS ...` when the MDS is (re)mounted; the
# "mds" substring is removed from $SINGLEMDS before calling mdsdevname
# (presumably leaving the numeric MDS index -- confirm in test-framework.sh).
68 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Pre-built command lines.  They are expanded unquoted by the tests so that
# extra options can be appended, e.g. `$START_NAMESPACE -r`.
# Start namespace LFSCK on the MDT.
69 START_NAMESPACE="do_facet $SINGLEMDS \
70 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
# Start layout LFSCK on the MDT.
71 START_LAYOUT="do_facet $SINGLEMDS \
72 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
# Start layout LFSCK directly on the first OST (facet ost1).
73 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# Stop whatever LFSCK is running on the MDT.
74 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Dump the LFSCK parameter files; the tests pipe the output through awk to
# pick out single fields such as "status" or "flags".
75 SHOW_NAMESPACE="do_facet $SINGLEMDS \
76 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
77 SHOW_LAYOUT="do_facet $SINGLEMDS \
78 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
79 SHOW_LAYOUT_ON_OST="do_facet ost1 \
80 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount option sets passed to `start` when remounting the MDS: the default,
# one adding "noscrub", and one adding "skip_lfsck".
81 MOUNT_OPTS_SCRUB="-o user_xattr"
82 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
83 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
92 echo "preparing... $nfiles * $ndirs files will be created $(date)."
93 if [ ! -z $igif ]; then
94 #define OBD_FAIL_FID_IGIF 0x1504
95 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
98 cp $LUSTRE/tests/*.sh $DIR/$tdir/
99 if [ $ndirs -gt 0 ]; then
100 createmany -d $DIR/$tdir/d $ndirs
101 createmany -m $DIR/$tdir/f $ndirs
102 if [ $nfiles -gt 0 ]; then
103 for ((i = 0; i < $ndirs; i++)); do
104 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
105 /dev/null || error "createmany $nfiles"
108 createmany -d $DIR/$tdir/e $ndirs
111 if [ ! -z $igif ]; then
112 touch $DIR/$tdir/dummy
113 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
116 echo "prepared $(date)."
# Check MDT0 with e2fsck while it is stopped, then bring it back up.
#
# Returns immediately when the $SINGLEMDS backend is not ldiskfs.  Otherwise
# the MDS is stopped, run_e2fsck is invoked with "-n" on the device reported
# by `mdsdevname 1`, and error "(2)" reports a detected inconsistency.
# Finally the MDS is restarted on $MDT_DEVNAME with $MOUNT_OPTS_NOSCRUB.
# NOTE(review): the exact condition that triggers error "(2)" depends on the
# filter applied to the first run_e2fsck output -- confirm against the full file.
119 run_e2fsck_on_mdt0() {
120 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
122 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
123 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
125 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
126 error "(2) Detected inconsistency on MDT0"
128 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
129 error "(3) Fail to start MDT0"
# Query LFSCK component $com once on mds1 with `lfsck_query ... -w` and fail
# with error "($err)" unless the "${com}_mdts_${status}" counter equals
# $MDSCOUNT, i.e. unless every MDT is in state $status.
# NOTE(review): $com, $status and $err are presumably the three positional
# arguments (callers use e.g. `wait_all_targets_blocked namespace completed 4`)
# -- confirm against the local declarations.
# NOTE(review): "-w" presumably makes lfsck_query block until LFSCK finishes
# -- confirm in the lctl documentation.
132 wait_all_targets_blocked() {
137 local count=$(do_facet mds1 \
138 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
139 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
# On mismatch, dump the full query output first to help debugging.
140 [[ $count -eq $MDSCOUNT ]] || {
141 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
142 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
151 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
152 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
153 "$MDSCOUNT" $LTIME || {
154 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
155 error "($err) some MDTs are not in ${status}"
162 #define OBD_FAIL_LFSCK_DELAY1 0x1600
163 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
164 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
166 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
168 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
169 [ "$STATUS" == "scanning-phase1" ] ||
170 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
172 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
174 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
175 [ "$STATUS" == "stopped" ] ||
176 error "(6) Expect 'stopped', but got '$STATUS'"
178 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
180 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
181 [ "$STATUS" == "scanning-phase1" ] ||
182 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
184 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
185 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
186 mdd.${MDT_DEV}.lfsck_namespace |
187 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
189 error "(9) unexpected status"
192 local repaired=$($SHOW_NAMESPACE |
193 awk '/^updated_phase1/ { print $2 }')
194 [ $repaired -eq 0 ] ||
195 error "(10) Expect nothing to be repaired, but got: $repaired"
197 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
198 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
199 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
200 mdd.${MDT_DEV}.lfsck_namespace |
201 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
203 error "(12) unexpected status"
206 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
207 [ $((scanned1 + 1)) -eq $scanned2 ] ||
208 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
210 echo "stopall, should NOT crash LU-3649"
211 stopall || error "(14) Fail to stopall"
213 run_test 0 "Control LFSCK manually"
216 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
217 skip "OI Scrub not implemented for ZFS" && return
221 #define OBD_FAIL_FID_INDIR 0x1501
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
223 touch $DIR/$tdir/dummy
225 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
227 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
229 mdd.${MDT_DEV}.lfsck_namespace |
230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
232 error "(4) unexpected status"
235 local repaired=$($SHOW_NAMESPACE |
236 awk '/^dirent_repaired/ { print $2 }')
237 # For interop with an old server that does not report dirent_repaired: fall back to updated_phase1.
238 [ -z "$repaired" ] &&
239 repaired=$($SHOW_NAMESPACE |
240 awk '/^updated_phase1/ { print $2 }')
242 [ $repaired -eq 1 ] ||
243 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
247 mount_client $MOUNT || error "(6) Fail to start client!"
249 #define OBD_FAIL_FID_LOOKUP 0x1505
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
251 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
255 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
259 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
260 skip "OI Scrub not implemented for ZFS" && return
264 #define OBD_FAIL_FID_INLMA 0x1502
265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
266 touch $DIR/$tdir/dummy
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
270 #define OBD_FAIL_FID_NOLMA 0x1506
271 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
272 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
273 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
274 mdd.${MDT_DEV}.lfsck_namespace |
275 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
277 error "(4) unexpected status"
280 local repaired=$($SHOW_NAMESPACE |
281 awk '/^dirent_repaired/ { print $2 }')
282 # For interop with an old server that does not report dirent_repaired: fall back to updated_phase1.
283 [ -z "$repaired" ] &&
284 repaired=$($SHOW_NAMESPACE |
285 awk '/^updated_phase1/ { print $2 }')
287 [ $repaired -eq 1 ] ||
288 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
290 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
293 mount_client $MOUNT || error "(6) Fail to start client!"
295 #define OBD_FAIL_FID_LOOKUP 0x1505
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
297 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
301 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
306 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
308 touch $DIR/$tdir/dummy
310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
312 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
313 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
314 mdd.${MDT_DEV}.lfsck_namespace |
315 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
317 error "(4) unexpected status"
320 local repaired=$($SHOW_NAMESPACE |
321 awk '/^linkea_repaired/ { print $2 }')
322 # For interop with an old server that does not report linkea_repaired: fall back to updated_phase2.
323 [ -z "$repaired" ] &&
324 repaired=$($SHOW_NAMESPACE |
325 awk '/^updated_phase2/ { print $2 }')
327 [ $repaired -eq 1 ] ||
328 error "(5) Fail to repair crashed linkEA: $repaired"
332 mount_client $MOUNT || error "(6) Fail to start client!"
334 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
335 error "(7) Fail to stat $DIR/$tdir/dummy"
337 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
338 local dummyname=$($LFS fid2path $DIR $dummyfid)
339 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
340 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
342 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
348 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
349 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
350 touch $DIR/$tdir/dummy
352 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
354 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
355 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
356 mdd.${MDT_DEV}.lfsck_namespace |
357 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
359 error "(4) unexpected status"
362 local repaired=$($SHOW_NAMESPACE |
363 awk '/^updated_phase2/ { print $2 }')
364 [ $repaired -eq 1 ] ||
365 error "(5) Fail to repair crashed linkEA: $repaired"
369 mount_client $MOUNT || error "(6) Fail to start client!"
371 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
372 error "(7) Fail to stat $DIR/$tdir/dummy"
374 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
375 local dummyname=$($LFS fid2path $DIR $dummyfid)
376 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
377 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
379 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
385 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
386 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
387 touch $DIR/$tdir/dummy
389 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
391 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
392 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
393 mdd.${MDT_DEV}.lfsck_namespace |
394 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
396 error "(4) unexpected status"
399 local repaired=$($SHOW_NAMESPACE |
400 awk '/^updated_phase2/ { print $2 }')
401 [ $repaired -eq 1 ] ||
402 error "(5) Fail to repair crashed linkEA: $repaired"
406 mount_client $MOUNT || error "(6) Fail to start client!"
408 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
409 error "(7) Fail to stat $DIR/$tdir/dummy"
411 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
412 local dummyname=$($LFS fid2path $DIR $dummyfid)
413 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
414 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
416 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
422 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
423 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
424 touch $DIR/$tdir/dummy
426 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
428 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
429 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
430 mdd.${MDT_DEV}.lfsck_namespace |
431 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
433 error "(4) unexpected status"
436 local repaired=$($SHOW_NAMESPACE |
437 awk '/^linkea_repaired/ { print $2 }')
438 [ $repaired -eq 1 ] ||
439 error "(5) Fail to repair crashed linkEA: $repaired"
443 mount_client $MOUNT || error "(6) Fail to start client!"
445 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
446 error "(7) Fail to stat $DIR/$tdir/dummy"
448 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
449 local dummyname=$($LFS fid2path $DIR $dummyfid)
450 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
451 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
453 run_test 2d "LFSCK can recover the missing linkEA entry"
457 [ $MDSCOUNT -lt 2 ] &&
458 skip "We need at least 2 MDSes for this test" && return
462 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
464 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
465 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
466 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
467 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
469 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
471 wait_all_targets_blocked namespace completed 4
473 local repaired=$($SHOW_NAMESPACE |
474 awk '/^linkea_repaired/ { print $2 }')
475 [ $repaired -eq 1 ] ||
476 error "(5) Fail to repair crashed linkEA: $repaired"
478 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
479 local name=$($LFS fid2path $DIR $fid)
480 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
481 error "(6) Fail to repair linkEA: $fid $name"
483 run_test 2e "namespace LFSCK can verify remote object linkEA"
489 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
490 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
491 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
493 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
494 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
495 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
497 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
498 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
499 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
501 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
502 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
503 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
505 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
507 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
508 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
509 mdd.${MDT_DEV}.lfsck_namespace |
510 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
512 error "(10) unexpected status"
515 local checked=$($SHOW_NAMESPACE |
516 awk '/^checked_phase2/ { print $2 }')
517 [ $checked -ge 4 ] ||
518 error "(11) Fail to check multiple-linked object: $checked"
520 local repaired=$($SHOW_NAMESPACE |
521 awk '/^multiple_linked_repaired/ { print $2 }')
522 [ $repaired -ge 2 ] ||
523 error "(12) Fail to repair multiple-linked object: $repaired"
525 run_test 3 "LFSCK can verify multiple-linked objects"
529 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
530 skip "OI Scrub not implemented for ZFS" && return
533 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
534 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
536 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
537 echo "start $SINGLEMDS with disabling OI scrub"
538 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
539 error "(2) Fail to start MDS!"
541 #define OBD_FAIL_LFSCK_DELAY2 0x1601
542 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
543 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
544 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
545 mdd.${MDT_DEV}.lfsck_namespace |
546 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
548 error "(5) unexpected status"
551 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
552 [ "$STATUS" == "scanning-phase1" ] ||
553 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
555 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
556 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
557 mdd.${MDT_DEV}.lfsck_namespace |
558 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
560 error "(7) unexpected status"
563 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
564 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
566 local repaired=$($SHOW_NAMESPACE |
567 awk '/^dirent_repaired/ { print $2 }')
568 # For interop with an old server that does not report dirent_repaired: fall back to updated_phase1.
569 [ -z "$repaired" ] &&
570 repaired=$($SHOW_NAMESPACE |
571 awk '/^updated_phase1/ { print $2 }')
573 [ $repaired -ge 9 ] ||
574 error "(9) Fail to re-generate FID-in-dirent: $repaired"
578 mount_client $MOUNT || error "(10) Fail to start client!"
580 #define OBD_FAIL_FID_LOOKUP 0x1505
581 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
582 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
583 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
585 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
589 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
590 skip "OI Scrub not implemented for ZFS" && return
593 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
594 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
596 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
597 echo "start $SINGLEMDS with disabling OI scrub"
598 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
599 error "(2) Fail to start MDS!"
601 #define OBD_FAIL_LFSCK_DELAY2 0x1601
602 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
603 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
604 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
605 mdd.${MDT_DEV}.lfsck_namespace |
606 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
608 error "(5) unexpected status"
611 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
612 [ "$STATUS" == "scanning-phase1" ] ||
613 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
615 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
616 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
617 mdd.${MDT_DEV}.lfsck_namespace |
618 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
620 error "(7) unexpected status"
623 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
624 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
626 local repaired=$($SHOW_NAMESPACE |
627 awk '/^dirent_repaired/ { print $2 }')
628 # For interop with an old server that does not report dirent_repaired: fall back to updated_phase1.
629 [ -z "$repaired" ] &&
630 repaired=$($SHOW_NAMESPACE |
631 awk '/^updated_phase1/ { print $2 }')
633 [ $repaired -ge 2 ] ||
634 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
638 mount_client $MOUNT || error "(10) Fail to start client!"
640 #define OBD_FAIL_FID_LOOKUP 0x1505
641 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
642 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
644 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
646 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
647 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
648 local dummyname=$($LFS fid2path $DIR $dummyfid)
649 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
650 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
652 run_test 5 "LFSCK can handle IGIF object upgrading"
657 #define OBD_FAIL_LFSCK_DELAY1 0x1600
658 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
659 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
661 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
662 [ "$STATUS" == "scanning-phase1" ] ||
663 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
665 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
667 # Fail the LFSCK to guarantee there is at least one checkpoint
668 #define OBD_FAIL_LFSCK_FATAL1 0x1608
669 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
670 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
671 mdd.${MDT_DEV}.lfsck_namespace |
672 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
674 error "(4) unexpected status"
677 local POS0=$($SHOW_NAMESPACE |
678 awk '/^last_checkpoint_position/ { print $2 }' |
681 #define OBD_FAIL_LFSCK_DELAY1 0x1600
682 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
683 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
685 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
686 [ "$STATUS" == "scanning-phase1" ] ||
687 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
689 local POS1=$($SHOW_NAMESPACE |
690 awk '/^latest_start_position/ { print $2 }' |
692 [[ $POS0 -lt $POS1 ]] ||
693 error "(7) Expect larger than: $POS0, but got $POS1"
695 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
696 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
697 mdd.${MDT_DEV}.lfsck_namespace |
698 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
700 error "(8) unexpected status"
703 run_test 6a "LFSCK resumes from last checkpoint (1)"
708 #define OBD_FAIL_LFSCK_DELAY2 0x1601
709 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
710 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
712 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
713 [ "$STATUS" == "scanning-phase1" ] ||
714 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
716 # Sleep 5 sec to guarantee that we are in the directory scanning
718 # Fail the LFSCK to guarantee there is at least one checkpoint
719 #define OBD_FAIL_LFSCK_FATAL2 0x1609
720 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
721 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
722 mdd.${MDT_DEV}.lfsck_namespace |
723 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
725 error "(4) unexpected status"
728 local O_POS0=$($SHOW_NAMESPACE |
729 awk '/^last_checkpoint_position/ { print $2 }' |
732 local D_POS0=$($SHOW_NAMESPACE |
733 awk '/^last_checkpoint_position/ { print $4 }')
735 #define OBD_FAIL_LFSCK_DELAY2 0x1601
736 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
737 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
739 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
740 [ "$STATUS" == "scanning-phase1" ] ||
741 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
743 local O_POS1=$($SHOW_NAMESPACE |
744 awk '/^latest_start_position/ { print $2 }' |
746 local D_POS1=$($SHOW_NAMESPACE |
747 awk '/^latest_start_position/ { print $4 }')
749 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
750 [[ $O_POS0 -lt $O_POS1 ]] ||
751 error "(7.1) $O_POS1 is not larger than $O_POS0"
753 [[ $D_POS0 -lt $D_POS1 ]] ||
754 error "(7.2) $D_POS1 is not larger than $D_POS0"
757 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
758 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
759 mdd.${MDT_DEV}.lfsck_namespace |
760 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
762 error "(8) unexpected status"
765 run_test 6b "LFSCK resumes from last checkpoint (2)"
772 #define OBD_FAIL_LFSCK_DELAY2 0x1601
773 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
774 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
776 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
777 [ "$STATUS" == "scanning-phase1" ] ||
778 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
780 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
782 echo "stop $SINGLEMDS"
783 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
785 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
786 echo "start $SINGLEMDS"
787 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
788 error "(5) Fail to start MDS!"
790 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
791 mdd.${MDT_DEV}.lfsck_namespace |
792 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
794 error "(6) unexpected status"
797 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
803 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
804 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
805 for ((i = 0; i < 20; i++)); do
806 touch $DIR/$tdir/dummy${i}
809 #define OBD_FAIL_LFSCK_DELAY3 0x1602
810 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
811 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
812 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
813 mdd.${MDT_DEV}.lfsck_namespace |
814 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
816 error "(4) unexpected status"
820 echo "stop $SINGLEMDS"
821 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
823 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
824 echo "start $SINGLEMDS"
825 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
826 error "(6) Fail to start MDS!"
828 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
829 mdd.${MDT_DEV}.lfsck_namespace |
830 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
832 error "(7) unexpected status"
835 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
840 formatall > /dev/null
846 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
847 [ "$STATUS" == "init" ] ||
848 error "(2) Expect 'init', but got '$STATUS'"
850 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
851 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
852 mkdir $DIR/$tdir/crashed
854 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
855 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
856 for ((i = 0; i < 5; i++)); do
857 touch $DIR/$tdir/dummy${i}
860 umount_client $MOUNT || error "(3) Fail to stop client!"
862 #define OBD_FAIL_LFSCK_DELAY2 0x1601
863 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
864 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
866 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
867 [ "$STATUS" == "scanning-phase1" ] ||
868 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
870 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
872 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
873 [ "$STATUS" == "stopped" ] ||
874 error "(7) Expect 'stopped', but got '$STATUS'"
876 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
878 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
879 [ "$STATUS" == "scanning-phase1" ] ||
880 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
882 #define OBD_FAIL_LFSCK_FATAL2 0x1609
883 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
884 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
885 mdd.${MDT_DEV}.lfsck_namespace |
886 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
888 error "(10) unexpected status"
891 #define OBD_FAIL_LFSCK_DELAY1 0x1600
892 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
893 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
895 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
896 [ "$STATUS" == "scanning-phase1" ] ||
897 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
899 #define OBD_FAIL_LFSCK_CRASH 0x160a
900 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
903 echo "stop $SINGLEMDS"
904 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
906 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
907 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
909 echo "start $SINGLEMDS"
910 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
911 error "(14) Fail to start MDS!"
913 local timeout=$(max_recovery_time)
916 while [ $timer -lt $timeout ]; do
917 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
918 mdt.${MDT_DEV}.recovery_status |
919 awk '/^status/ { print \\\$2 }'")
920 [ "$STATUS" != "RECOVERING" ] && break;
925 [ $timer != $timeout ] ||
926 error "(14.1) recovery timeout"
928 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
929 [ "$STATUS" == "crashed" ] ||
930 error "(15) Expect 'crashed', but got '$STATUS'"
932 #define OBD_FAIL_LFSCK_DELAY2 0x1601
933 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
934 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
936 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
937 [ "$STATUS" == "scanning-phase1" ] ||
938 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
940 echo "stop $SINGLEMDS"
941 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
943 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
944 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
946 echo "start $SINGLEMDS"
947 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
948 error "(19) Fail to start MDS!"
951 while [ $timer -lt $timeout ]; do
952 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
953 mdt.${MDT_DEV}.recovery_status |
954 awk '/^status/ { print \\\$2 }'")
955 [ "$STATUS" != "RECOVERING" ] && break;
960 [ $timer != $timeout ] ||
961 error "(19.1) recovery timeout"
963 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
964 [ "$STATUS" == "paused" ] ||
965 error "(20) Expect 'paused', but got '$STATUS'"
967 echo "stop $SINGLEMDS"
968 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
970 echo "start $SINGLEMDS without resume LFSCK"
971 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
972 error "(20.2) Fail to start MDS!"
975 while [ $timer -lt $timeout ]; do
976 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
977 mdt.${MDT_DEV}.recovery_status |
978 awk '/^status/ { print \\\$2 }'")
979 [ "$STATUS" != "RECOVERING" ] && break;
984 [ $timer != $timeout ] ||
985 error "(20.3) recovery timeout"
987 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
988 [ "$STATUS" == "paused" ] ||
989 error "(20.4) Expect 'paused', but got '$STATUS'"
991 #define OBD_FAIL_LFSCK_DELAY3 0x1602
992 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
994 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
995 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
996 mdd.${MDT_DEV}.lfsck_namespace |
997 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
999 error "(22) unexpected status"
1002 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1003 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1004 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1006 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1007 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1008 mdd.${MDT_DEV}.lfsck_namespace |
1009 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1011 error "(24) unexpected status"
1014 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1015 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1017 run_test 8 "LFSCK state machine"
1020 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1021 skip "Testing on UP system, the speed may be inaccurate."
1025 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1026 { skip "Need MDS version >= 2.7.50"; return; }
1028 check_mount_and_prep
1029 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1030 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1031 createmany -o $DIR/$tdir/lfsck/f 5000
1033 local BASE_SPEED1=100
1035 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
1038 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1039 [ "$STATUS" == "scanning-phase1" ] ||
1040 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1042 local SPEED=$($SHOW_LAYOUT |
1043 awk '/^average_speed_phase1/ { print $2 }')
1045 # The measured time may be inexact; normally the error is less than 2 seconds.
1046 # On top of that we allow a further 20% scheduling error.
1048 # MAX_MARGIN = 1.2 = 12 / 10
1049 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1050 RUN_TIME1 * 12 / 10))
1051 [ $SPEED -lt $MAX_SPEED ] ||
1052 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
1054 # adjust speed limit
1055 local BASE_SPEED2=300
1057 do_facet $SINGLEMDS \
1058 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1061 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1062 # MIN_MARGIN = 0.8 = 8 / 10
1063 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1064 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1065 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
1066 [ $SPEED -gt $MIN_SPEED ] || {
1067 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1068 error_ignore LU-5624 \
1069 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1072 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1076 # MAX_MARGIN = 1.2 = 12 / 10
1077 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1078 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1079 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1080 [ $SPEED -lt $MAX_SPEED ] ||
1081 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
1083 do_facet $SINGLEMDS \
1084 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1086 wait_update_facet $SINGLEMDS \
1087 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1088 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1089 error "(7) Failed to get expected 'completed'"
1091 run_test 9a "LFSCK speed control (1)"
# Test 9b body: speed control for the namespace LFSCK. It prepares files
# while fail_loc 0x1604 is set, runs one namespace scan with fail_loc 0x160c
# (expected to end in 'stopped'), then restarts the scan with a speed limit
# and checks average_speed_phase2 against computed bounds.
# NOTE(review): this listing omits some original lines; the function header
# and the assignments of server_version, RUN_TIME1, RUN_TIME2 and TIME_DIFF
# are not visible here -- confirm against the complete file.
1094 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1095 skip "Testing on UP system, the speed may be inaccurate."
1099 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1100 { skip "Need MDS version >= 2.7.50"; return; }
1104 echo "Preparing another 50 * 50 files (with error) at $(date)."
1105 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1106 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1107 createmany -d $DIR/$tdir/d 50
1108 createmany -m $DIR/$tdir/f 50
# Populate each of the 50 directories with 50 files.
1109 for ((i = 0; i < 50; i++)); do
1110 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
1113 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1114 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1115 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
# With fail_loc 0x160c set, the scan is expected to report 'stopped'.
1116 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1117 mdd.${MDT_DEV}.lfsck_namespace |
1118 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1120 error "(5) unexpected status"
1123 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1124 echo "Prepared at $(date)."
1126 local BASE_SPEED1=50
# Start the namespace LFSCK again, passing BASE_SPEED1 through -s; the
# status check below expects it to be in 'scanning-phase2'.
1128 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1131 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1132 [ "$STATUS" == "scanning-phase2" ] ||
1133 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1135 local SPEED=$($SHOW_NAMESPACE |
1136 awk '/^average_speed_phase2/ { print $2 }')
1137 # The measured run time may be inexact; normally the error is below 2 seconds.
1138 # On top of that we allow another 20% of scheduling error.
1140 # MAX_MARGIN = 1.2 = 12 / 10
1141 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1142 RUN_TIME1 * 12 / 10))
1143 [ $SPEED -lt $MAX_SPEED ] ||
1144 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1146 # Adjust the speed limit while the scan is in progress.
1147 local BASE_SPEED2=150
1149 do_facet $SINGLEMDS \
1150 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1153 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1154 # MIN_MARGIN = 0.8 = 8 / 10
1155 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1156 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1157 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# On a non-ldiskfs MDS a too-low speed is reported via error_ignore (LU-5624).
1158 [ $SPEED -gt $MIN_SPEED ] || {
1159 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1160 error_ignore LU-5624 \
1161 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1164 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1168 # MAX_MARGIN = 1.2 = 12 / 10
1169 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1170 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1171 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1172 [ $SPEED -lt $MAX_SPEED ] ||
1173 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
# Set lfsck_speed_limit to 0, then wait (up to 32) for 'completed'.
1175 do_facet $SINGLEMDS \
1176 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1177 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1178 mdd.${MDT_DEV}.lfsck_namespace |
1179 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1181 error "(11) unexpected status"
1184 run_test 9b "LFSCK speed control (2)"
# Test 10 body: checks that ordinary file operations (ls, touch, mkdir,
# unlink, rm, mv, hard link, symlink) succeed while the namespace LFSCK is in
# 'scanning-phase1', then lets the scan run to 'completed'.
# NOTE(review): this listing omits some original lines (function header,
# loop terminators, etc.) -- confirm against the complete file.
# Only runs when the MDS backend is ldiskfs.
1188 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1189 skip "lookup(..)/linkea on ZFS issue" && return
1193 echo "Preparing more files with error at $(date)."
1194 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1195 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
# Even indexes 0..998 are created while fail_loc 0x1603 is set.
1197 for ((i = 0; i < 1000; i = $((i+2)))); do
1198 mkdir -p $DIR/$tdir/d${i}
1199 touch $DIR/$tdir/f${i}
1200 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1203 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1204 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
# Odd indexes 1..999 are created while fail_loc 0x1604 is set.
1206 for ((i = 1; i < 1000; i = $((i+2)))); do
1207 mkdir -p $DIR/$tdir/d${i}
1208 touch $DIR/$tdir/f${i}
1209 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1212 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1213 echo "Prepared at $(date)."
1215 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1217 umount_client $MOUNT
1218 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the namespace LFSCK with -s 100.
1220 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1223 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1224 [ "$STATUS" == "scanning-phase1" ] ||
1225 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Each of the following client operations must succeed during the scan.
1227 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1229 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1231 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1233 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1235 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1237 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1239 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1241 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1242 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after the operations above.
1244 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1245 [ "$STATUS" == "scanning-phase1" ] ||
1246 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0, then wait (up to 32) for 'completed'.
1248 do_facet $SINGLEMDS \
1249 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1250 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1251 mdd.${MDT_DEV}.lfsck_namespace |
1252 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1254 error "(16) unexpected status"
1257 run_test 10 "System is available during LFSCK scanning"
# Mounts the backing filesystem of ost${ost}, removes O/${idx}/LAST_ID and
# O/${idx}/d0/0 under its mount point, then unmounts it again.
# Returns 1 if mount_fstype fails and 2 if unmount_fstype fails.
# NOTE(review): ${ost} and ${idx} are not assigned on any line shown in this
# listing; presumably they come from $1 and $2 -- confirm against the
# complete file.
1260 ost_remove_lastid() {
1263 local rcmd="do_facet ost${ost}"
1265 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1267 # step 1: local mount
1268 mount_fstype ost${ost} || return 1
1269 # step 2: remove the specified LAST_ID
# The brace expansion yields two paths: .../LAST_ID and .../d0/0.
1270 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1272 unmount_fstype ost${ost} || return 2
# Test 11a body: removes LAST_ID on ost1, restarts ost1, runs the layout
# LFSCK on the OST and expects the 'crashed_lastid' flag to appear and then
# to be cleared once the scan reports 'completed'.
# NOTE(review): this listing omits some original lines (function header,
# the steps between file creation and ost_remove_lastid, etc.) -- confirm
# against the complete file.
1276 check_mount_and_prep
1277 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1278 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1283 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1285 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1286 error "(2) Fail to start ost1"
1288 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1289 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1291 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1292 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# While fail_loc 0x160e is set, wait (up to 60) for flags 'crashed_lastid'.
1294 wait_update_facet ost1 "$LCTL get_param -n \
1295 obdfilter.${OST_DEV}.lfsck_layout |
1296 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1298 error "(5) unexpected status"
# Clear the fail_loc, then wait (up to 32) for 'completed'.
1301 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1303 wait_update_facet ost1 "$LCTL get_param -n \
1304 obdfilter.${OST_DEV}.lfsck_layout |
1305 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1307 error "(6) unexpected status"
1310 echo "the LAST_ID(s) should have been rebuilt"
# After completion the flags field must be empty.
1311 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1312 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1314 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b body: creates files while fail_loc 0x160d is set on ost1, records
# the in-memory last_id (lastid1), restarts ost1 and reads last_id again
# (lastid2, expected to be smaller), then runs the layout LFSCK on the OST
# and checks after another restart that last_id equals lastid1.
# NOTE(review): this listing omits some original lines (function header,
# loop terminator, etc.) -- confirm against the complete file.
1317 check_mount_and_prep
1318 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1320 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1321 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1322 do_facet ost1 $LCTL set_param fail_loc=0x160d
1324 local count=$(precreated_ost_obj_count 0 0)
# Create 32 more files than the count returned above.
1326 createmany -o $DIR/$tdir/f $((count + 32))
1328 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1329 local seq=$(do_facet mds1 $LCTL get_param -n \
1330 osp.${proc_path}.prealloc_last_seq)
# last_id output is filtered by $seq; the value is the field after ':'.
1331 local lastid1=$(do_facet ost1 "lctl get_param -n \
1332 obdfilter.${ost1_svc}.last_id" | grep $seq |
1333 awk -F: '{ print $2 }')
1335 umount_client $MOUNT
1336 stop ost1 || error "(1) Fail to stop ost1"
1338 #define OBD_FAIL_OST_ENOSPC 0x215
1339 do_facet ost1 $LCTL set_param fail_loc=0x215
1341 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1342 error "(2) Fail to start ost1"
# Poll up to 60 times until last_id for $seq yields a non-empty value.
1344 for ((i = 0; i < 60; i++)); do
1345 lastid2=$(do_facet ost1 "lctl get_param -n \
1346 obdfilter.${ost1_svc}.last_id" | grep $seq |
1347 awk -F: '{ print $2 }')
1348 [ ! -z $lastid2 ] && break;
1352 echo "the on-disk LAST_ID should be smaller than the expected one"
1353 [ $lastid1 -gt $lastid2 ] ||
1354 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1356 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1357 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1359 wait_update_facet ost1 "$LCTL get_param -n \
1360 obdfilter.${OST_DEV}.lfsck_layout |
1361 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1363 error "(6) unexpected status"
# Restart ost1 once more before comparing last_id with lastid1.
1366 stop ost1 || error "(7) Fail to stop ost1"
1368 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1369 error "(8) Fail to start ost1"
1371 echo "the on-disk LAST_ID should have been rebuilt"
# On timeout, print the raw last_id output before reporting the error.
1372 wait_update_facet ost1 "$LCTL get_param -n \
1373 obdfilter.${ost1_svc}.last_id | grep $seq |
1374 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1375 do_facet ost1 $LCTL get_param -n \
1376 obdfilter.${ost1_svc}.last_id
1377 error "(9) expect lastid1 $seq:$lastid1"
1380 do_facet ost1 $LCTL set_param fail_loc=0
1381 stopall || error "(10) Fail to stopall"
1383 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12 body: with at least two MDSes, starts and stops the namespace and
# layout LFSCK on all targets through single lctl commands (-A) issued on
# mds1, checking the expected status after each step.
# NOTE(review): this listing omits some original lines (function header,
# loop terminators, etc.). The last argument of wait_all_targets and
# wait_all_targets_blocked looks like the step number used in error
# messages -- confirm against those helpers.
1386 [ $MDSCOUNT -lt 2 ] &&
1387 skip "We need at least 2 MDSes for test_12" && return
1389 check_mount_and_prep
# One directory per MDT index (k - 1), each populated with 100 files.
1390 for k in $(seq $MDSCOUNT); do
1391 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1392 createmany -o $DIR/$tdir/${k}/f 100 ||
1393 error "(0) Fail to create 100 files."
1396 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1397 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1398 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1400 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1401 wait_all_targets namespace scanning-phase1 3
1403 echo "Stop namespace LFSCK on all targets by single lctl command."
1404 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1405 error "(4) Fail to stop LFSCK on all devices!"
1407 echo "All the LFSCK targets should be in 'stopped' status."
1408 wait_all_targets_blocked namespace stopped 5
1410 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1411 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1412 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1414 echo "All the LFSCK targets should be in 'completed' status."
1415 wait_all_targets_blocked namespace completed 7
# Full debug logging is enabled for the layout part and disabled at the end.
1417 start_full_debug_logging
1419 echo "Start layout LFSCK on all targets by single command (-s 1)."
1420 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1421 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1423 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1424 wait_all_targets layout scanning-phase1 9
1426 echo "Stop layout LFSCK on all targets by single lctl command."
1427 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1428 error "(10) Fail to stop LFSCK on all devices!"
1430 echo "All the LFSCK targets should be in 'stopped' status."
1431 wait_all_targets_blocked layout stopped 11
# Each OST must also report 'stopped' for the layout LFSCK.
1433 for k in $(seq $OSTCOUNT); do
1434 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1435 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1436 awk '/^status/ { print $2 }')
1437 [ "$STATUS" == "stopped" ] ||
1438 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1441 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1442 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1443 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1445 echo "All the LFSCK targets should be in 'completed' status."
1446 wait_all_targets_blocked layout completed 14
1448 stop_full_debug_logging
1450 run_test 12 "single command to trigger LFSCK on all devices"
# Test 13 body: creates 32 files while fail_loc 0x160f is set on the MDS,
# runs the layout LFSCK and expects repaired_others to equal 32.
# NOTE(review): this listing omits some original lines (function header,
# closing braces, etc.) -- confirm against the complete file.
1454 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1455 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1456 echo "MDT-object FID."
1459 check_mount_and_prep
1461 echo "Inject failure stub to simulate bad lmm_oi"
1462 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1463 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1464 createmany -o $DIR/$tdir/f 32
1465 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1467 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1468 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
# Wait (up to 32) for the layout LFSCK on the MDS to report 'completed'.
1470 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1471 mdd.${MDT_DEV}.lfsck_layout |
1472 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1474 error "(2) unexpected status"
# One repair is expected for each of the 32 files created above.
1477 local repaired=$($SHOW_LAYOUT |
1478 awk '/^repaired_others/ { print $2 }')
1479 [ $repaired -eq 32 ] ||
1480 error "(3) Fail to repair crashed lmm_oi: $repaired"
1482 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14 body: creates files while fail_loc 0x1610 is set on ost1, checks
# that 'ls' fails, runs the layout LFSCK once without -c (stat of 'guard'
# must still fail) and once with -c, after which 'guard' must be stat-able
# with size 0. repaired_dangling must be at least 32 after each run.
# NOTE(review): this listing omits some original lines (function header,
# closing braces, etc.) -- confirm against the complete file.
1486 echo "The OST-object referenced by the MDT-object should be there;"
1487 echo "otherwise, the LFSCK should re-create the missing OST-object."
1490 check_mount_and_prep
1491 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1493 echo "Inject failure stub to simulate dangling referenced MDT-object"
1494 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1495 do_facet ost1 $LCTL set_param fail_loc=0x1610
1496 local count=$(precreated_ost_obj_count 0 0)
# Create 31 more files than the count above, plus the 'guard' file.
1498 createmany -o $DIR/$tdir/f $((count + 31))
1499 touch $DIR/$tdir/guard
1500 do_facet ost1 $LCTL set_param fail_loc=0
1502 start_full_debug_logging
1504 # exhaust other pre-created dangling cases
1505 count=$(precreated_ost_obj_count 0 0)
1506 createmany -o $DIR/$tdir/a $count ||
1507 error "(0) Fail to create $count files."
1509 echo "'ls' should fail because of dangling referenced MDT-object"
1510 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1512 echo "Trigger layout LFSCK to find out dangling reference"
1513 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1515 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1516 mdd.${MDT_DEV}.lfsck_layout |
1517 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1519 error "(3) unexpected status"
1522 local repaired=$($SHOW_LAYOUT |
1523 awk '/^repaired_dangling/ { print $2 }')
1524 [ $repaired -ge 32 ] ||
1525 error "(4) Fail to repair dangling reference: $repaired"
1527 echo "'stat' should fail because of not repair dangling by default"
1528 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
1530 echo "Trigger layout LFSCK to repair dangling reference"
# Second run adds -c to the start command.
1531 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1533 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1534 mdd.${MDT_DEV}.lfsck_layout |
1535 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1537 error "(7) unexpected status"
1540 # Some asynchronous LFSCK updates may still be in progress; wait for
1541 # a while until the repair on the target has been done. LU-4970.
1543 echo "'stat' should success after layout LFSCK repairing"
# Poll (up to 32) until stat reports Size 0; on timeout show the stat output.
1544 wait_update_facet client "stat $DIR/$tdir/guard |
1545 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1546 stat $DIR/$tdir/guard
1548 error "(8) unexpected size"
1551 repaired=$($SHOW_LAYOUT |
1552 awk '/^repaired_dangling/ { print $2 }')
1553 [ $repaired -ge 32 ] ||
1554 error "(9) Fail to repair dangling reference: $repaired"
1556 stop_full_debug_logging
1558 run_test 14 "LFSCK can repair MDT-object with dangling reference"
# Test 15a body: writes one file while fail_loc 0x1611 is set on ost1, runs
# the layout LFSCK and expects repaired_unmatched_pair to equal 1.
# NOTE(review): this listing omits some original lines (function header,
# closing braces, etc.) -- confirm against the complete file.
1562 echo "If the OST-object referenced by the MDT-object back points"
1563 echo "to some non-exist MDT-object, then the LFSCK should repair"
1564 echo "the OST-object to back point to the right MDT-object."
1567 check_mount_and_prep
1568 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1570 echo "Inject failure stub to make the OST-object to back point to"
1571 echo "non-exist MDT-object."
1572 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1574 do_facet ost1 $LCTL set_param fail_loc=0x1611
# Write 1 MiB to f0 and cancel the osc LRU locks while the fail_loc is set.
1575 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1576 cancel_lru_locks osc
1577 do_facet ost1 $LCTL set_param fail_loc=0
1579 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1580 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1582 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1583 mdd.${MDT_DEV}.lfsck_layout |
1584 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1586 error "(2) unexpected status"
# Exactly one repair is expected, for the single file written above.
1589 local repaired=$($SHOW_LAYOUT |
1590 awk '/^repaired_unmatched_pair/ { print $2 }')
1591 [ $repaired -eq 1 ] ||
1592 error "(3) Fail to repair unmatched pair: $repaired"
1594 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b body: writes a 'guard' file normally, then writes f0 while
# fail_loc 0x1612 is set on ost1, runs the layout LFSCK and expects
# repaired_unmatched_pair to equal 1.
# NOTE(review): this listing omits some original lines (function header,
# closing braces, etc.) -- confirm against the complete file.
1598 echo "If the OST-object referenced by the MDT-object back points"
1599 echo "to other MDT-object that doesn't recognize the OST-object,"
1600 echo "then the LFSCK should repair it to back point to the right"
1601 echo "MDT-object (the first one)."
1604 check_mount_and_prep
1605 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# 'guard' is written before the fail_loc is set.
1606 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1607 cancel_lru_locks osc
1609 echo "Inject failure stub to make the OST-object to back point to"
1610 echo "other MDT-object"
1612 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1613 do_facet ost1 $LCTL set_param fail_loc=0x1612
1614 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1615 cancel_lru_locks osc
1616 do_facet ost1 $LCTL set_param fail_loc=0
1618 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1619 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1621 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1622 mdd.${MDT_DEV}.lfsck_layout |
1623 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1625 error "(2) unexpected status"
# Exactly one repair is expected.
1628 local repaired=$($SHOW_LAYOUT |
1629 awk '/^repaired_unmatched_pair/ { print $2 }')
1630 [ $repaired -eq 1 ] ||
1631 error "(3) Fail to repair unmatched pair: $repaired"
1633 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c body: starts a delayed 'lfs migrate' of a directory in the
# background (fail_loc 0x1803 on mds2), runs the layout LFSCK on all targets
# at the same time, and then checks repaired_unmatched_pair == 1 and
# repaired_multiple_referenced == 0.
# NOTE(review): this listing omits some original lines (function header,
# any wait/sleep around the background migrate, etc.) -- confirm against
# the complete file.
# Needs at least two MDSes and an MDS version below 2.7.55 (see LU-6437).
1636 [ $MDSCOUNT -lt 2 ] &&
1637 skip "We need at least 2 MDSes for this test" && return
1639 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1640 skip "Skip the test after 2.7.55 see LU-6437" && return
1643 echo "According to current metadata migration implementation,"
1644 echo "before the old MDT-object is removed, both the new MDT-object"
1645 echo "and old MDT-object will reference the same LOV layout. Then if"
1646 echo "the layout LFSCK finds the new MDT-object by race, it will"
1647 echo "regard related OST-object(s) as multiple referenced case, and"
1648 echo "will try to create new OST-object(s) for the new MDT-object."
1649 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1650 echo "MDT-object before confirm the multiple referenced case."
1653 check_mount_and_prep
1654 $LFS mkdir -i 1 $DIR/$tdir/a1
1655 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1656 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1657 cancel_lru_locks osc
1659 echo "Inject failure stub on MDT1 to delay the migration"
1661 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1662 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1663 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
# The migration runs in the background so the LFSCK can start meanwhile.
1664 $LFS migrate -m 0 $DIR/$tdir/a1 &
1667 echo "Trigger layout LFSCK to race with the migration"
1668 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1670 wait_all_targets_blocked layout completed 2
1672 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1673 local repaired=$($SHOW_LAYOUT |
1674 awk '/^repaired_unmatched_pair/ { print $2 }')
1675 [ $repaired -eq 1 ] ||
1676 error "(3) Fail to repair unmatched pair: $repaired"
# No multiple-reference repair may have been performed.
1678 repaired=$($SHOW_LAYOUT |
1679 awk '/^repaired_multiple_referenced/ { print $2 }')
1680 [ $repaired -eq 0 ] ||
1681 error "(4) Unexpectedly repaird multiple references: $repaired"
1683 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16 body: changes the owner of f0 while fail_loc 0x1613 is set on the
# MDS, runs the layout LFSCK and expects repaired_inconsistent_owner == 1.
# NOTE(review): this listing omits some original lines (function header,
# closing braces, etc.) -- confirm against the complete file.
1687 echo "If the OST-object's owner information does not match the owner"
1688 echo "information stored in the MDT-object, then the LFSCK trust the"
1689 echo "MDT-object and update the OST-object's owner information."
1692 check_mount_and_prep
1693 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1694 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1695 cancel_lru_locks osc
1697 echo "Inject failure stub to skip OST-object owner changing"
1698 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1699 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
# The chown is issued while the fail_loc is set.
1700 chown 1.1 $DIR/$tdir/f0
1701 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1703 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1706 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1708 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1709 mdd.${MDT_DEV}.lfsck_layout |
1710 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1712 error "(2) unexpected status"
# Exactly one owner repair is expected.
1715 local repaired=$($SHOW_LAYOUT |
1716 awk '/^repaired_inconsistent_owner/ { print $2 }')
1717 [ $repaired -eq 1 ] ||
1718 error "(3) Fail to repair inconsistent owner: $repaired"
1720 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17 body: with fail_loc 0x1614 set on the MDS, writes 'guard' and
# creates f0; f0 is then expected to show guard's 1 MiB size. After the
# layout LFSCK (repaired_multiple_referenced == 1), writing 2 MiB to f0 must
# leave guard's size at 1 MiB.
# NOTE(review): this listing omits some original lines (function header,
# closing braces, part of the introductory echo text, etc.) -- confirm
# against the complete file.
1724 echo "If more than one MDT-objects reference the same OST-object,"
1725 echo "and the OST-object only recognizes one MDT-object, then the"
1726 echo "LFSCK should create new OST-objects for such non-recognized"
1730 check_mount_and_prep
1731 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1733 echo "Inject failure stub to make two MDT-objects to refernce"
1734 echo "the OST-object"
1736 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1737 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1739 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1740 cancel_lru_locks osc
1742 createmany -o $DIR/$tdir/f 1
1744 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1746 cancel_lru_locks mdc
1747 cancel_lru_locks osc
1749 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
# Field 5 of 'ls -l' is the file size; f0 was never written to directly.
1750 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1751 [ $size -eq 1048576 ] ||
1752 error "(1) f0 (wrong) size should be 1048576, but got $size"
1754 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1757 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1759 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1760 mdd.${MDT_DEV}.lfsck_layout |
1761 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1763 error "(3) unexpected status"
1766 local repaired=$($SHOW_LAYOUT |
1767 awk '/^repaired_multiple_referenced/ { print $2 }')
1768 [ $repaired -eq 1 ] ||
1769 error "(4) Fail to repair multiple references: $repaired"
1771 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
# Writing 2 MiB to f0 must not change the size reported for guard.
1772 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1773 error "(5) Fail to write f0."
1774 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1775 [ $size -eq 1048576 ] ||
1776 error "(6) guard size should be 1048576, but got $size"
1778 run_test 17 "LFSCK can repair multiple references"
# Add 'cache' to the local lctl debug setting; the command output is discarded.
1780 $LCTL set_param debug=+cache > /dev/null
# Test 18a body: writes f1 (and f2 on a second MDT when MDSCOUNT >= 2), then
# runs chown on them while fail_loc 0x1615 is set so that the reported size
# becomes wrong. A layout LFSCK started with -o must then complete on every
# MDS and OST, report the expected repaired_orphan counts, and restore the
# original size.
# NOTE(review): this listing omits some original lines (function header,
# fi/done terminators, etc.) and $LTIME is not defined on any visible line
# -- confirm against the complete file.
1784 echo "The target MDT-object is there, but related stripe information"
1785 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1786 echo "layout EA entries."
1789 check_mount_and_prep
1790 $LFS mkdir -i 0 $DIR/$tdir/a1
1791 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1792 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# Field 6 of 'ls -il' is the file size; remember it for later comparison.
1794 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1796 $LFS path2fid $DIR/$tdir/a1/f1
1797 $LFS getstripe $DIR/$tdir/a1/f1
1799 if [ $MDSCOUNT -ge 2 ]; then
1800 $LFS mkdir -i 1 $DIR/$tdir/a2
1801 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1802 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1803 $LFS path2fid $DIR/$tdir/a2/f2
1804 $LFS getstripe $DIR/$tdir/a2/f2
1807 cancel_lru_locks osc
1809 echo "Inject failure, to make the MDT-object lost its layout EA"
1810 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1811 do_facet mds1 $LCTL set_param fail_loc=0x1615
1812 chown 1.1 $DIR/$tdir/a1/f1
1814 if [ $MDSCOUNT -ge 2 ]; then
1815 do_facet mds2 $LCTL set_param fail_loc=0x1615
1816 chown 1.1 $DIR/$tdir/a2/f2
1822 do_facet mds1 $LCTL set_param fail_loc=0
1823 if [ $MDSCOUNT -ge 2 ]; then
1824 do_facet mds2 $LCTL set_param fail_loc=0
1827 cancel_lru_locks mdc
1828 cancel_lru_locks osc
1830 echo "The file size should be incorrect since layout EA is lost"
1831 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1832 [ "$cur_size" != "$saved_size" ] ||
1833 error "(1) Expect incorrect file1 size"
1835 if [ $MDSCOUNT -ge 2 ]; then
1836 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1837 [ "$cur_size" != "$saved_size" ] ||
1838 error "(2) Expect incorrect file2 size"
1841 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1842 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
1844 for k in $(seq $MDSCOUNT); do
1845 # The LFSCK status query interval is 30 seconds. In case some
1846 # LFSCK_NOTIFY RPCs fail or are lost, we wait long enough
1847 # to guarantee that the status is synced up.
1848 wait_update_facet mds${k} "$LCTL get_param -n \
1849 mdd.$(facet_svc mds${k}).lfsck_layout |
1850 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1851 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must report 'completed' as well (checked once, no waiting).
1854 for k in $(seq $OSTCOUNT); do
1855 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1856 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1857 awk '/^status/ { print $2 }')
1858 [ "$cur_status" == "completed" ] ||
1859 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 is expected to report 1 repaired orphan, mds2 (if present) 2.
1862 local repaired=$(do_facet mds1 $LCTL get_param -n \
1863 mdd.$(facet_svc mds1).lfsck_layout |
1864 awk '/^repaired_orphan/ { print $2 }')
1865 [ $repaired -eq 1 ] ||
1866 error "(6.1) Expect 1 fixed on mds1, but got: $repaired"
1868 if [ $MDSCOUNT -ge 2 ]; then
1869 repaired=$(do_facet mds2 $LCTL get_param -n \
1870 mdd.$(facet_svc mds2).lfsck_layout |
1871 awk '/^repaired_orphan/ { print $2 }')
1872 [ $repaired -eq 2 ] ||
1873 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
1876 $LFS path2fid $DIR/$tdir/a1/f1
1877 $LFS getstripe $DIR/$tdir/a1/f1
1879 if [ $MDSCOUNT -ge 2 ]; then
1880 $LFS path2fid $DIR/$tdir/a2/f2
1881 $LFS getstripe $DIR/$tdir/a2/f2
1884 echo "The file size should be correct after layout LFSCK scanning"
1885 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1886 [ "$cur_size" == "$saved_size" ] ||
1887 error "(7) Expect file1 size $saved_size, but got $cur_size"
1889 if [ $MDSCOUNT -ge 2 ]; then
1890 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1891 [ "$cur_size" == "$saved_size" ] ||
1892 error "(8) Expect file2 size $saved_size, but got $cur_size"
1895 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test 18b body: writes f1 (and f2 on a second MDT when MDSCOUNT >= 2),
# records their FIDs, and removes them while fail_loc 0x1616 is set. A layout
# LFSCK started with -o must then complete on every MDS and OST and report
# the expected repaired_orphan counts. The files named <fid>-R-0 under
# .lustre/lost+found/MDTxxxx are finally moved back and their size checked.
# NOTE(review): this listing omits some original lines (function header,
# fi/done terminators, etc.) and $LTIME is not defined on any visible line
# -- confirm against the complete file.
1899 echo "The target MDT-object is lost. The LFSCK should re-create the"
1900 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
1901 echo "can move it back to normal namespace manually."
1904 check_mount_and_prep
1905 $LFS mkdir -i 0 $DIR/$tdir/a1
1906 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1907 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# Field 6 of 'ls -il' is the file size; the FID is used later to locate the
# file under lost+found.
1908 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1909 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
1911 $LFS getstripe $DIR/$tdir/a1/f1
1913 if [ $MDSCOUNT -ge 2 ]; then
1914 $LFS mkdir -i 1 $DIR/$tdir/a2
1915 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1916 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1917 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
1919 $LFS getstripe $DIR/$tdir/a2/f2
1922 cancel_lru_locks osc
1924 echo "Inject failure, to simulate the case of missing the MDT-object"
1925 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1926 do_facet mds1 $LCTL set_param fail_loc=0x1616
1927 rm -f $DIR/$tdir/a1/f1
1929 if [ $MDSCOUNT -ge 2 ]; then
1930 do_facet mds2 $LCTL set_param fail_loc=0x1616
1931 rm -f $DIR/$tdir/a2/f2
1937 do_facet mds1 $LCTL set_param fail_loc=0
1938 if [ $MDSCOUNT -ge 2 ]; then
1939 do_facet mds2 $LCTL set_param fail_loc=0
1942 cancel_lru_locks mdc
1943 cancel_lru_locks osc
1945 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1946 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1948 for k in $(seq $MDSCOUNT); do
1949 # The LFSCK status query interval is 30 seconds. In case some
1950 # LFSCK_NOTIFY RPCs fail or are lost, we wait long enough
1951 # to guarantee that the status is synced up.
1952 wait_update_facet mds${k} "$LCTL get_param -n \
1953 mdd.$(facet_svc mds${k}).lfsck_layout |
1954 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1955 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report 'completed' as well (checked once, no waiting).
1958 for k in $(seq $OSTCOUNT); do
1959 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1960 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1961 awk '/^status/ { print $2 }')
1962 [ "$cur_status" == "completed" ] ||
1963 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 is expected to report 1 repaired orphan, mds2 (if present) 2.
1966 local repaired=$(do_facet mds1 $LCTL get_param -n \
1967 mdd.$(facet_svc mds1).lfsck_layout |
1968 awk '/^repaired_orphan/ { print $2 }')
1969 [ $repaired -eq 1 ] ||
1970 error "(4.1) Expect 1 fixed on mds1, but got: $repaired"
1972 if [ $MDSCOUNT -ge 2 ]; then
1973 repaired=$(do_facet mds2 $LCTL get_param -n \
1974 mdd.$(facet_svc mds2).lfsck_layout |
1975 awk '/^repaired_orphan/ { print $2 }')
1976 [ $repaired -eq 2 ] ||
1977 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
1980 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
1981 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
1982 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
1984 if [ $MDSCOUNT -ge 2 ]; then
1985 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
1986 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
1989 $LFS path2fid $DIR/$tdir/a1/f1
1990 $LFS getstripe $DIR/$tdir/a1/f1
1992 if [ $MDSCOUNT -ge 2 ]; then
1993 $LFS path2fid $DIR/$tdir/a2/f2
1994 $LFS getstripe $DIR/$tdir/a2/f2
1997 echo "The file size should be correct after layout LFSCK scanning"
1998 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1999 [ "$cur_size" == "$saved_size" ] ||
2000 error "(7) Expect file1 size $saved_size, but got $cur_size"
2002 if [ $MDSCOUNT -ge 2 ]; then
2003 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2004 [ "$cur_size" == "$saved_size" ] ||
2005 error "(8) Expect file2 size $saved_size, but got $cur_size"
2008 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test 18c (registered by the run_test line at the end of this block).
# Scenario: files are written while ost1 has fail_loc 0x1617 set, then
# removed while the MDS has fail_loc 0x1616 set. A layout LFSCK started with
# "-r -o" must then reach 'completed' on every MDS and OST, report the
# expected repaired_orphan counts, and leave "*-N-*" stubs under
# .lustre/lost+found/MDT0000/ only.
# NOTE(review): this listing prefixes each line with its original line
# number and does not show the function header or closing lines
# (fi/done/else/}).
2012 echo "The target MDT-object is lost, and the OST-object FID is missing."
2013 echo "The LFSCK should re-create the MDT-object with new FID under the "
2014 echo "directory .lustre/lost+found/MDTxxxx."
# Create a1 with "mkdir -i 0" and give it a 1-stripe, 1M layout at index 0.
2017 check_mount_and_prep
2018 $LFS mkdir -i 0 $DIR/$tdir/a1
2019 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
# Arm the injection on ost1 BEFORE writing, so the data written below is
# created while OBD_FAIL_LFSCK_NOPFID is active.
# NOTE(review): no reset of ost1's fail_loc is visible in this listing.
2021 echo "Inject failure, to simulate the case of missing parent FID"
2022 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2023 do_facet ost1 $LCTL set_param fail_loc=0x1617
2025 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2026 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDSes, repeat the setup under a2 created with "mkdir -i 1".
2028 if [ $MDSCOUNT -ge 2 ]; then
2029 $LFS mkdir -i 1 $DIR/$tdir/a2
2030 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a2
2031 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2032 $LFS getstripe $DIR/$tdir/a2/f2
2035 cancel_lru_locks osc
# Remove the files while OBD_FAIL_LFSCK_LOST_MDTOBJ is set on the MDS.
2037 echo "Inject failure, to simulate the case of missing the MDT-object"
2038 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2039 do_facet mds1 $LCTL set_param fail_loc=0x1616
2040 rm -f $DIR/$tdir/a1/f1
2042 if [ $MDSCOUNT -ge 2 ]; then
2043 do_facet mds2 $LCTL set_param fail_loc=0x1616
2044 rm -f $DIR/$tdir/a2/f2
# Clear the MDS-side injection again before starting LFSCK.
2050 do_facet mds1 $LCTL set_param fail_loc=0
2051 if [ $MDSCOUNT -ge 2 ]; then
2052 do_facet mds2 $LCTL set_param fail_loc=0
2055 cancel_lru_locks mdc
2056 cancel_lru_locks osc
# START_LAYOUT runs "lctl lfsck_start -M <MDT0000> -t layout" on $SINGLEMDS.
2058 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2059 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Poll every MDS until its lfsck_layout status reads 'completed', waiting up
# to $LTIME. The \\\$2 is presumably escaped so that a literal $2 reaches awk
# after the extra shell evaluation done by wait_update_facet - TODO confirm.
2061 for k in $(seq $MDSCOUNT); do
2062 # The LFSCK status query internal is 30 seconds. For the case
2063 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2064 # time to guarantee the status sync up.
2065 wait_update_facet mds${k} "$LCTL get_param -n \
2066 mdd.$(facet_svc mds${k}).lfsck_layout |
2067 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2068 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling.
2071 for k in $(seq $OSTCOUNT); do
2072 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2073 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2074 awk '/^status/ { print $2 }')
2075 [ "$cur_status" == "completed" ] ||
2076 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): the lines that assign $expected (original lines 2080-2084)
# are not visible here; the value appears to depend on $MDSCOUNT.
2079 if [ $MDSCOUNT -ge 2 ]; then
2085 local repaired=$(do_facet mds1 $LCTL get_param -n \
2086 mdd.$(facet_svc mds1).lfsck_layout |
2087 awk '/^repaired_orphan/ { print $2 }')
2088 [ $repaired -eq $expected ] ||
2089 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# mds2 must report zero repaired orphans.
2091 if [ $MDSCOUNT -ge 2 ]; then
2092 repaired=$(do_facet mds2 $LCTL get_param -n \
2093 mdd.$(facet_svc mds2).lfsck_layout |
2094 awk '/^repaired_orphan/ { print $2 }')
2095 [ $repaired -eq 0 ] ||
2096 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2099 ls -ail $MOUNT/.lustre/lost+found/
# NOTE(review): the -name pattern *-N-* is unquoted in both find calls below,
# so the shell may expand it against the current directory before find runs;
# quoting it ("*-N-*") would be safer. cname has no visible 'local'.
2101 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2102 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2103 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name *-N-*)
2105 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2108 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2109 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2110 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2112 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-N-*)
2113 [ ! -z "$cname" ] ||
2114 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2116 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d (registered by the run_test line at the end of this block).
# Scenario: f2 is made a dangling reference while its old OST-object still
# exists. The layout LFSCK is started with "-r -o -c" and delayed via
# OBD_FAIL_LFSCK_DELAY3 until mds1 reports 'scanning-phase2'. Once it has
# completed, f2 must have its original size again and mds1 must report
# exactly one repaired orphan.
# NOTE(review): this listing prefixes each line with its original line
# number and does not show the function header or closing lines
# (fi/done/else/}).
2120 echo "The target MDT-object layout EA slot is occpuied by some new"
2121 echo "created OST-object when repair dangling reference case. Such"
2122 echo "conflict OST-object has never been modified. Then when found"
2123 echo "the orphan OST-object, LFSCK will replace it with the orphan"
2127 check_mount_and_prep
# NOTE(review): the step that creates $DIR/$tdir/a1 is not visible here.
2129 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2130 echo "guard" > $DIR/$tdir/a1/f1
2131 echo "foo" > $DIR/$tdir/a1/f2
# Remember f2's size (field 6 of "ls -il") to compare against later.
2132 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2133 $LFS path2fid $DIR/$tdir/a1/f1
2134 $LFS getstripe $DIR/$tdir/a1/f1
2135 $LFS path2fid $DIR/$tdir/a1/f2
2136 $LFS getstripe $DIR/$tdir/a1/f2
2137 cancel_lru_locks osc
2139 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2140 echo "to reference the same OST-object (which is f1's OST-obejct)."
2141 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2142 echo "dangling reference case, but f2's old OST-object is there."
# The chown and rm run while OBD_FAIL_LFSCK_CHANGE_STRIPE is set on the MDS;
# the injection is cleared right after.
2145 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2146 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2147 chown 1.1 $DIR/$tdir/a1/f2
2148 rm -f $DIR/$tdir/a1/f1
2151 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): only the setupall half of the restart is visible in this
# listing; the stopall announced by the echo is presumably on a hidden line.
2153 echo "stopall to cleanup object cache"
2156 setupall > /dev/null
# Precondition check: before LFSCK runs, f2's size must differ from the
# saved one.
2158 echo "The file size should be incorrect since dangling referenced"
2159 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2160 [ "$cur_size" != "$saved_size" ] ||
2161 error "(1) Expect incorrect file2 size"
# Arm OBD_FAIL_LFSCK_DELAY3 with fail_val=5 before starting LFSCK.
2163 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2164 do_facet $SINGLEMDS $LCTL set_param fail_val=5 fail_loc=0x1602
2166 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2167 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait until mds1 reports 'scanning-phase2', then remove the delay injection.
2169 wait_update_facet mds1 "$LCTL get_param -n \
2170 mdd.$(facet_svc mds1).lfsck_layout |
2171 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2172 error "(3.0) MDS1 is not the expected 'scanning-phase2'"
2174 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Poll every MDS until its lfsck_layout status reads 'completed'.
2176 for k in $(seq $MDSCOUNT); do
2177 # The LFSCK status query internal is 30 seconds. For the case
2178 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2179 # time to guarantee the status sync up.
2180 wait_update_facet mds${k} "$LCTL get_param -n \
2181 mdd.$(facet_svc mds${k}).lfsck_layout |
2182 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2183 error "(3) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling.
2186 for k in $(seq $OSTCOUNT); do
2187 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2188 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2189 awk '/^status/ { print $2 }')
2190 [ "$cur_status" == "completed" ] ||
2191 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Exactly one orphan must have been repaired on $SINGLEMDS.
2194 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2195 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2196 awk '/^repaired_orphan/ { print $2 }')
2197 [ $repaired -eq 1 ] ||
2198 error "(5) Expect 1 orphan has been fixed, but got: $repaired"
# After the repair, f2 must be back at its saved size.
2200 echo "The file size should be correct after layout LFSCK scanning"
2201 cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2202 [ "$cur_size" == "$saved_size" ] ||
2203 error "(6) Expect file2 size $saved_size, but got $cur_size"
# Informational output only; the results of these commands are not checked.
2205 echo "The LFSCK should find back the original data."
2206 cat $DIR/$tdir/a1/f2
2207 $LFS path2fid $DIR/$tdir/a1/f2
2208 $LFS getstripe $DIR/$tdir/a1/f2
2210 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e (registered by the run_test line at the end of this block).
# Same setup as the "dangling reference" case, but while the delayed layout
# LFSCK sits in 'scanning-phase2' the test appends new data to f2. The
# expected outcome is one repaired orphan, a "*-C-*" stub under
# .lustre/lost+found/MDT0000/ whose size equals f2's original size, and f2
# itself still readable.
# NOTE(review): this listing prefixes each line with its original line
# number and does not show the function header or closing lines
# (fi/done/else/}).
2214 echo "The target MDT-object layout EA slot is occpuied by some new"
2215 echo "created OST-object when repair dangling reference case. Such"
2216 echo "conflict OST-object has been modified by others. To keep the"
2217 echo "new data, the LFSCK will create a new file to refernece this"
2218 echo "old orphan OST-object."
2221 check_mount_and_prep
# NOTE(review): the step that creates $DIR/$tdir/a1 is not visible here.
2223 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2224 echo "guard" > $DIR/$tdir/a1/f1
2225 echo "foo" > $DIR/$tdir/a1/f2
# Remember f2's size (field 6 of "ls -il") to compare against later.
2226 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2227 $LFS path2fid $DIR/$tdir/a1/f1
2228 $LFS getstripe $DIR/$tdir/a1/f1
2229 $LFS path2fid $DIR/$tdir/a1/f2
2230 $LFS getstripe $DIR/$tdir/a1/f2
2231 cancel_lru_locks osc
2233 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2234 echo "to reference the same OST-object (which is f1's OST-obejct)."
2235 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2236 echo "dangling reference case, but f2's old OST-object is there."
# The chown and rm run while OBD_FAIL_LFSCK_CHANGE_STRIPE is set on the MDS;
# the injection is cleared right after.
2239 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2240 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2241 chown 1.1 $DIR/$tdir/a1/f2
2242 rm -f $DIR/$tdir/a1/f1
2245 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): only the setupall half of the restart is visible in this
# listing; the stopall announced by the echo is presumably on a hidden line.
2247 echo "stopall to cleanup object cache"
2250 setupall > /dev/null
# Precondition check: before LFSCK runs, f2's size must differ from the
# saved one.
2252 echo "The file size should be incorrect since dangling referenced"
2253 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2254 [ "$cur_size" != "$saved_size" ] ||
2255 error "(1) Expect incorrect file2 size"
# Arm OBD_FAIL_LFSCK_DELAY3 with fail_val=10 before starting LFSCK.
2257 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2258 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
# Full debug logging brackets the LFSCK run; it is stopped again below.
2260 start_full_debug_logging
2262 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2263 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2265 wait_update_facet mds1 "$LCTL get_param -n \
2266 mdd.$(facet_svc mds1).lfsck_layout |
2267 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2268 error "(3) MDS1 is not the expected 'scanning-phase2'"
# NOTE(review): the statements belonging to the next comment (original lines
# 2271-2273) are not visible in this listing.
2270 # to guarantee all updates are synced.
# Append to f2 while LFSCK is held in phase 2, then release the delay.
2274 echo "Write new data to f2 to modify the new created OST-object."
2275 echo "dummy" >> $DIR/$tdir/a1/f2
2277 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Poll every MDS until its lfsck_layout status reads 'completed'.
2279 for k in $(seq $MDSCOUNT); do
2280 # The LFSCK status query internal is 30 seconds. For the case
2281 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2282 # time to guarantee the status sync up.
2283 wait_update_facet mds${k} "$LCTL get_param -n \
2284 mdd.$(facet_svc mds${k}).lfsck_layout |
2285 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2286 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling.
2289 for k in $(seq $OSTCOUNT); do
2290 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2291 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2292 awk '/^status/ { print $2 }')
2293 [ "$cur_status" == "completed" ] ||
2294 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2297 stop_full_debug_logging
# Exactly one orphan must have been repaired on $SINGLEMDS.
2299 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2300 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2301 awk '/^repaired_orphan/ { print $2 }')
2302 [ $repaired -eq 1 ] ||
2303 error "(6) Expect 1 orphan has been fixed, but got: $repaired"
2305 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2306 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2307 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# NOTE(review): the -name pattern *-C-* is unquoted, so the shell may expand
# it against the current directory before find runs; quoting it ("*-C-*")
# would be safer. cname has no visible 'local', and the later unquoted uses
# of $cname assume find printed exactly one path.
2309 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-C-*)
2310 [ ! -z "$cname" ] ||
2311 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
# The stub's size must equal the size f2 had before the injection.
2313 echo "The stub file should keep the original f2 data"
2314 cur_size=$(ls -il $cname | awk '{ print $6 }')
2315 [ "$cur_size" == "$saved_size" ] ||
2316 error "(9) Expect file2 size $saved_size, but got $cur_size"
2319 $LFS path2fid $cname
2320 $LFS getstripe $cname
# Informational output only; the results of these commands are not checked.
2322 echo "The f2 should contains new data."
2323 cat $DIR/$tdir/a1/f2
2324 $LFS path2fid $DIR/$tdir/a1/f2
2325 $LFS getstripe $DIR/$tdir/a1/f2
2327 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f (registered by the run_test line at the end of this block).
# Needs at least 2 OSTs. Files with 1-stripe and 2-stripe layouts are
# removed while OBD_FAIL_LFSCK_LOST_MDTOBJ is set. The layout LFSCK is then
# run twice:
#   1. with OBD_FAIL_LFSCK_BAD_NETWORK set on mds1 - every MDS and ost1 must
#      end 'partial', ost2 'completed', and each MDS reports 1 repaired
#      orphan;
#   2. without the injection - everything must end 'completed' and each MDS
#      reports 2 repaired orphans.
# NOTE(review): this listing prefixes each line with its original line
# number and does not show the function header or closing lines
# (fi/done/else/}).
2330 [ $OSTCOUNT -lt 2 ] &&
2331 skip "The test needs at least 2 OSTs" && return
2334 echo "The target MDT-object is lost. The LFSCK should re-create the"
2335 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2336 echo "to verify some OST-object(s) during the first stage-scanning,"
2337 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2338 echo "should not be affected."
# a1 holds 1-stripe files (guard, f1); a2 holds a 2-stripe file (f2).
# Both directories are created with "mkdir -i 0".
2341 check_mount_and_prep
2342 $LFS mkdir -i 0 $DIR/$tdir/a1
2343 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2344 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2345 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2346 $LFS mkdir -i 0 $DIR/$tdir/a2
2347 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2348 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2349 $LFS getstripe $DIR/$tdir/a1/f1
2350 $LFS getstripe $DIR/$tdir/a2/f2
# With two or more MDSes, mirror the same layout under a3/a4 created with
# "mkdir -i 1".
2352 if [ $MDSCOUNT -ge 2 ]; then
2353 $LFS mkdir -i 1 $DIR/$tdir/a3
2354 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a3
2355 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2356 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2357 $LFS mkdir -i 1 $DIR/$tdir/a4
2358 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2359 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2360 $LFS getstripe $DIR/$tdir/a3/f3
2361 $LFS getstripe $DIR/$tdir/a4/f4
2364 cancel_lru_locks osc
# Remove f1..f4 while OBD_FAIL_LFSCK_LOST_MDTOBJ is set on the owning MDS;
# the guard files are left in place.
2366 echo "Inject failure, to simulate the case of missing the MDT-object"
2367 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2368 do_facet mds1 $LCTL set_param fail_loc=0x1616
2369 rm -f $DIR/$tdir/a1/f1
2370 rm -f $DIR/$tdir/a2/f2
2372 if [ $MDSCOUNT -ge 2 ]; then
2373 do_facet mds2 $LCTL set_param fail_loc=0x1616
2374 rm -f $DIR/$tdir/a3/f3
2375 rm -f $DIR/$tdir/a4/f4
# Clear the MDS-side injection again.
2381 do_facet mds1 $LCTL set_param fail_loc=0
2382 if [ $MDSCOUNT -ge 2 ]; then
2383 do_facet mds2 $LCTL set_param fail_loc=0
2386 cancel_lru_locks mdc
2387 cancel_lru_locks osc
# First run: OBD_FAIL_LFSCK_BAD_NETWORK with fail_val=0 is armed on mds1.
2389 echo "Inject failure, to simulate the OST0 fail to handle"
2390 echo "MDT0 LFSCK request during the first-stage scanning."
2391 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2392 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2394 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2395 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Every MDS is expected to finish as 'partial' in this first run.
2397 for k in $(seq $MDSCOUNT); do
2398 # The LFSCK status query internal is 30 seconds. For the case
2399 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2400 # time to guarantee the status sync up.
2401 wait_update_facet mds${k} "$LCTL get_param -n \
2402 mdd.$(facet_svc mds${k}).lfsck_layout |
2403 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2404 error "(2) MDS${k} is not the expected 'partial'"
# ost1 must end 'partial' while ost2 must end 'completed'.
# NOTE(review): the "}" closing each "|| {" group is not visible here.
2407 wait_update_facet ost1 "$LCTL get_param -n \
2408 obdfilter.$(facet_svc ost1).lfsck_layout |
2409 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2410 error "(3) OST1 is not the expected 'partial'"
2413 wait_update_facet ost2 "$LCTL get_param -n \
2414 obdfilter.$(facet_svc ost2).lfsck_layout |
2415 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2416 error "(4) OST2 is not the expected 'completed'"
2419 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the partial run, each MDS must have repaired exactly one orphan.
# NOTE(review): "mds{1}" / "mds{2}" in the messages below print literally,
# braces included.
2421 local repaired=$(do_facet mds1 $LCTL get_param -n \
2422 mdd.$(facet_svc mds1).lfsck_layout |
2423 awk '/^repaired_orphan/ { print $2 }')
2424 [ $repaired -eq 1 ] ||
2425 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2427 if [ $MDSCOUNT -ge 2 ]; then
2428 repaired=$(do_facet mds2 $LCTL get_param -n \
2429 mdd.$(facet_svc mds2).lfsck_layout |
2430 awk '/^repaired_orphan/ { print $2 }')
2431 [ $repaired -eq 1 ] ||
2432 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second run, with no injection armed.
2435 echo "Trigger layout LFSCK on all devices again to cleanup"
2436 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2438 for k in $(seq $MDSCOUNT); do
2439 # The LFSCK status query internal is 30 seconds. For the case
2440 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2441 # time to guarantee the status sync up.
2442 wait_update_facet mds${k} "$LCTL get_param -n \
2443 mdd.$(facet_svc mds${k}).lfsck_layout |
2444 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2445 error "(8) MDS${k} is not the expected 'completed'"
# NOTE(review): no 'local' declaration of cur_status is visible in this
# block, so the assignment below may set a global.
2448 for k in $(seq $OSTCOUNT); do
2449 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2450 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2451 awk '/^status/ { print $2 }')
2452 [ "$cur_status" == "completed" ] ||
2453 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# After the clean run, each MDS must report two repaired orphans.
# NOTE(review): 'repaired' was already declared local above; the second
# 'local' is redundant.
2457 local repaired=$(do_facet mds1 $LCTL get_param -n \
2458 mdd.$(facet_svc mds1).lfsck_layout |
2459 awk '/^repaired_orphan/ { print $2 }')
2460 [ $repaired -eq 2 ] ||
2461 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2463 if [ $MDSCOUNT -ge 2 ]; then
2464 repaired=$(do_facet mds2 $LCTL get_param -n \
2465 mdd.$(facet_svc mds2).lfsck_layout |
2466 awk '/^repaired_orphan/ { print $2 }')
2467 [ $repaired -eq 2 ] ||
2468 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2471 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Remove "cache" from the local debug mask.
# NOTE(review): judging by the original line numbers this statement
# presumably sits at top level, before the body of test 19a - TODO confirm.
2473 $LCTL set_param debug=-cache > /dev/null
# Test 19a (registered by the run_test line at the end of this block).
# With lfsck_verify_pfid enabled on OST0000 and OBD_FAIL_LFSCK_INVALID_PFID
# set on the node running this script, reading a0 must fail.
# NOTE(review): this listing prefixes each line with its original line
# number and does not show the function header or closing lines.
2476 check_mount_and_prep
2477 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2479 echo "foo" > $DIR/$tdir/a0
2480 echo "guard" > $DIR/$tdir/a1
2481 cancel_lru_locks osc
# Enable parent-FID verification on ost1, then arm the injection locally
# (note: plain $LCTL, not do_facet).
2483 echo "Inject failure, then client will offer wrong parent FID when read"
2484 do_facet ost1 $LCTL set_param -n \
2485 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2486 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2487 $LCTL set_param fail_loc=0x1619
# A successful cat is the failure case here; the injection is cleared after.
2489 echo "Read RPC with wrong parent FID should be denied"
2490 cat $DIR/$tdir/a0 && error "(3) Read should be denied!"
2491 $LCTL set_param fail_loc=0
2493 run_test 19a "OST-object inconsistency self detect"
# Test 19b (registered by the run_test line at the end of this block).
# f0 is written while OBD_FAIL_LFSCK_UNMATCHED_PAIR1 is set on ost1. The
# "repaired" counter in lfsck_verify_pfid must be 0 before verification is
# enabled, and 1 after verification is enabled and f0 has been read.
# NOTE(review): this listing prefixes each line with its original line
# number and does not show the function header or closing lines.
2496 check_mount_and_prep
2497 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2499 echo "Inject failure stub to make the OST-object to back point to"
2500 echo "non-exist MDT-object"
# The injection is only active while f0 is written and the locks cancelled.
2502 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2503 do_facet ost1 $LCTL set_param fail_loc=0x1611
2504 echo "foo" > $DIR/$tdir/f0
2505 cancel_lru_locks osc
2506 do_facet ost1 $LCTL set_param fail_loc=0
# Baseline: the "repaired" value read from lfsck_verify_pfid must be 0.
2508 echo "Nothing should be fixed since self detect and repair is disabled"
2509 local repaired=$(do_facet ost1 $LCTL get_param -n \
2510 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2511 awk '/^repaired/ { print $2 }')
2512 [ $repaired -eq 0 ] ||
2513 error "(1) Expected 0 repaired, but got $repaired"
2515 echo "Read RPC with right parent FID should be accepted,"
2516 echo "and cause parent FID on OST to be fixed"
# Enable verification, then read f0; the read itself must succeed.
2518 do_facet ost1 $LCTL set_param -n \
2519 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2520 cat $DIR/$tdir/f0 || error "(2) Read should not be denied!"
# The same counter must now read 1.
2522 repaired=$(do_facet ost1 $LCTL get_param -n \
2523 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2524 awk '/^repaired/ { print $2 }')
2525 [ $repaired -eq 1 ] ||
2526 error "(3) Expected 1 repaired, but got $repaired"
2528 run_test 19b "OST-object inconsistency self repair"
# Test 20 (registered by the run_test line at the end of this block).
# Needs at least 2 OSTs. Files f0..f2 (plus f3 with more than 2 OSTs) are
# written with a striped layout, then removed under different injections so
# that f0 loses only its MDT-object and f1/f2/f3 additionally lose one
# OST-object each. After a layout LFSCK the test inspects the re-created
# files ${fidN}-R-0 under .lustre/lost+found/MDT0000/: hole flag, stripe
# count, size, and which reads/writes succeed or fail.
# NOTE(review): this listing prefixes each line with its original line
# number and does not show the function header or closing lines
# (fi/done/else/}). Several two-branch checks below therefore appear back to
# back without their separating "else".
2531 [ $OSTCOUNT -lt 2 ] &&
2532 skip "The test needs at least 2 OSTs" && return
2535 echo "The target MDT-object and some of its OST-object are lost."
2536 echo "The LFSCK should find out the left OST-objects and re-create"
2537 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2538 echo "with the partial OST-objects (LOV EA hole)."
2540 echo "New client can access the file with LOV EA hole via normal"
2541 echo "system tools or commands without crash the system."
2543 echo "For old client, even though it cannot access the file with"
2544 echo "LOV EA hole, it should not cause the system crash."
# Use 3 stripes when more than 2 OSTs exist, otherwise 2 stripes.
# NOTE(review): the assignment of $bcount (original lines 2550-2555) is not
# visible in this listing.
2547 check_mount_and_prep
2548 $LFS mkdir -i 0 $DIR/$tdir/a1
2549 if [ $OSTCOUNT -gt 2 ]; then
2550 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
2553 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
2557 # 256 blocks on the stripe0.
2558 # 1 block on the stripe1 for 2 OSTs case.
2559 # 256 blocks on the stripe1 for other cases.
2560 # 1 block on the stripe2 if OSTs > 2
2561 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2562 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2563 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# Record the FIDs now; they name the lost+found entries checked later.
2565 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2566 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2567 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2570 $LFS getstripe $DIR/$tdir/a1/f0
2572 $LFS getstripe $DIR/$tdir/a1/f1
2574 $LFS getstripe $DIR/$tdir/a1/f2
# NOTE(review): no 'local' declaration of fid3 is visible in this listing.
2576 if [ $OSTCOUNT -gt 2 ]; then
2577 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
2578 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2580 $LFS getstripe $DIR/$tdir/a1/f3
2583 cancel_lru_locks osc
# f0 is removed under OBD_FAIL_LFSCK_LOST_MDTOBJ; f1..f3 are removed under
# OBD_FAIL_LFSCK_LOST_SPEOBJ. No fail_val is set for f1; fail_val is then
# set to 1 for f2 and 2 for f3. The echoes describe which OST-object each
# setting drops.
2585 echo "Inject failure..."
2586 echo "To simulate f0 lost MDT-object"
2587 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2588 do_facet mds1 $LCTL set_param fail_loc=0x1616
2589 rm -f $DIR/$tdir/a1/f0
2591 echo "To simulate f1 lost MDT-object and OST-object0"
2592 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
2593 do_facet mds1 $LCTL set_param fail_loc=0x161a
2594 rm -f $DIR/$tdir/a1/f1
2596 echo "To simulate f2 lost MDT-object and OST-object1"
2597 do_facet mds1 $LCTL set_param fail_val=1
2598 rm -f $DIR/$tdir/a1/f2
2600 if [ $OSTCOUNT -gt 2 ]; then
2601 echo "To simulate f3 lost MDT-object and OST-object2"
2602 do_facet mds1 $LCTL set_param fail_val=2
2603 rm -f $DIR/$tdir/a1/f3
# The client stays unmounted while LFSCK runs; it is remounted at (5.0).
2606 umount_client $MOUNT
2609 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2611 echo "Inject failure to slow down the LFSCK on OST0"
2612 #define OBD_FAIL_LFSCK_DELAY5 0x161b
2613 do_facet ost1 $LCTL set_param fail_loc=0x161b
2615 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2616 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2619 do_facet ost1 $LCTL set_param fail_loc=0
# Poll every MDS until 'completed'; the timeout here is the literal 32.
2621 for k in $(seq $MDSCOUNT); do
2622 # The LFSCK status query internal is 30 seconds. For the case
2623 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2624 # time to guarantee the status sync up.
2625 wait_update_facet mds${k} "$LCTL get_param -n \
2626 mdd.$(facet_svc mds${k}).lfsck_layout |
2627 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2628 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling.
2631 for k in $(seq $OSTCOUNT); do
2632 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2633 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2634 awk '/^status/ { print $2 }')
2635 [ "$cur_status" == "completed" ] ||
2636 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan on mds1: 9 with more than 2 OSTs, otherwise 4.
2639 local repaired=$(do_facet mds1 $LCTL get_param -n \
2640 mdd.$(facet_svc mds1).lfsck_layout |
2641 awk '/^repaired_orphan/ { print $2 }')
2642 if [ $OSTCOUNT -gt 2 ]; then
2643 [ $repaired -eq 9 ] ||
2644 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
2646 [ $repaired -eq 4 ] ||
2647 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
2650 mount_client $MOUNT || error "(5.0) Fail to start client!"
# Bit mask AND-ed with the "getstripe -L" output below to detect a hole.
# It is assigned without 'local'.
2652 LOV_PATTERN_F_HOLE=0x40000000
#
# --- Checks (5.x): old f0, no hole expected, fully readable/writable ---
2655 # ${fid0}-R-0 is the old f0
2657 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
2658 echo "Check $name, which is the old f0"
2660 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
# "getstripe -L" output is prefixed with 0x so that $(( )) parses it as hex.
2662 local pattern=0x$($LFS getstripe -L $name)
2663 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2664 error "(5.2) NOT expect pattern flag hole, but got $pattern"
2666 local stripes=$($LFS getstripe -c $name)
2667 if [ $OSTCOUNT -gt 2 ]; then
2668 [ $stripes -eq 3 ] ||
2669 error "(5.3.1) expect the stripe count is 3, but got $stripes"
2671 [ $stripes -eq 2 ] ||
2672 error "(5.3.2) expect the stripe count is 2, but got $stripes"
# The size must equal what dd wrote: bcount blocks of 4096 bytes.
2675 local size=$(stat $name | awk '/Size:/ { print $2 }')
2676 [ $size -eq $((4096 * $bcount)) ] ||
2677 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
2679 cat $name > /dev/null || error "(5.5) cannot read $name"
2681 echo "dummy" >> $name || error "(5.6) cannot write $name"
2683 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
2685 touch $name || error "(5.8) cannot touch $name"
2687 rm -f $name || error "(5.9) cannot unlink $name"
#
# --- Checks (6.x): old f1, stripe0 is missing, so a hole is expected ---
2690 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
2692 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2693 if [ $OSTCOUNT -gt 2 ]; then
2694 echo "Check $name, it contains the old f1's stripe1 and stripe2"
2696 echo "Check $name, it contains the old f1's stripe1"
2699 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
2701 pattern=0x$($LFS getstripe -L $name)
2702 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2703 error "(6.2) expect pattern flag hole, but got $pattern"
2705 stripes=$($LFS getstripe -c $name)
2706 if [ $OSTCOUNT -gt 2 ]; then
2707 [ $stripes -eq 3 ] ||
2708 error "(6.3.1) expect the stripe count is 3, but got $stripes"
2710 [ $stripes -eq 2 ] ||
2711 error "(6.3.2) expect the stripe count is 2, but got $stripes"
2714 size=$(stat $name | awk '/Size:/ { print $2 }')
2715 [ $size -eq $((4096 * $bcount)) ] ||
2716 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
# A plain sequential read is expected to fail on this file.
2718 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# Copy with conv=sync,noerror and count the "Input/output error" lines dd
# prints. 256 is the number of 4096-byte blocks in one 1M stripe.
2720 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2721 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2724 [ $failures -eq 256 ] ||
2725 error "(6.6) expect 256 IO failures, but get $failures"
2727 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2728 [ $size -eq $((4096 * $bcount)) ] ||
2729 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Block 0 lies in the missing stripe0, so writing there must fail. Block 300
# is past the first 256 blocks (the first 1M), so it lies in a later stripe
# and the write must succeed.
2731 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
2732 error "(6.8) write to the LOV EA hole should fail"
2734 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
2735 error "(6.9) write to normal stripe should NOT fail"
2737 echo "foo" >> $name && error "(6.10) append write $name should fail"
2739 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
2741 touch $name || error "(6.12) cannot touch $name"
2743 rm -f $name || error "(6.13) cannot unlink $name"
#
# --- Checks (7.x): old f2, stripe1 is missing ---
# With more than 2 OSTs (checks numbered *.1 plus 7.6, 7.7, 7.8.0, 7.8.3) a
# hole is expected. The checks numbered *.2 expect no hole and 1 stripe;
# they are presumably the else branch for exactly 2 OSTs.
2746 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
2748 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
2749 if [ $OSTCOUNT -gt 2 ]; then
2750 echo "Check $name, it contains the old f2's stripe0 and stripe2"
2752 echo "Check $name, it contains the old f2's stripe0"
2755 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
2757 pattern=0x$($LFS getstripe -L $name)
2758 stripes=$($LFS getstripe -c $name)
2759 size=$(stat $name | awk '/Size:/ { print $2 }')
2760 if [ $OSTCOUNT -gt 2 ]; then
2761 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2762 error "(7.2.1) expect pattern flag hole, but got $pattern"
2764 [ $stripes -eq 3 ] ||
2765 error "(7.3.1) expect the stripe count is 3, but got $stripes"
2767 [ $size -eq $((4096 * $bcount)) ] ||
2768 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
2770 cat $name > /dev/null &&
2771 error "(7.5.1) normal read $name should fail"
2773 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2774 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2776 [ $failures -eq 256 ] ||
2777 error "(7.6) expect 256 IO failures, but get $failures"
2779 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2780 [ $size -eq $((4096 * $bcount)) ] ||
2781 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
# Mirror image of the f1 case: block 300 falls in the missing stripe1 and
# must fail, while block 0 (stripe0) must succeed.
2783 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
2784 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
2786 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
2787 error "(7.8.1) write to normal stripe should NOT fail"
2789 echo "foo" >> $name &&
2790 error "(7.8.3) append write $name should fail"
2792 chown $RUNAS_ID:$RUNAS_GID $name ||
2793 error "(7.9.1) cannot chown on $name"
2795 touch $name || error "(7.10.1) cannot touch $name"
2797 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2798 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
2800 [ $stripes -eq 1 ] ||
2801 error "(7.3.2) expect the stripe count is 1, but got $stripes"
2804 [ $size -eq $((4096 * (256 + 0))) ] ||
2805 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
2807 cat $name > /dev/null || error "(7.5.2) cannot read $name"
2809 echo "dummy" >> $name || error "(7.8.2) cannot write $name"
2811 chown $RUNAS_ID:$RUNAS_GID $name ||
2812 error "(7.9.2) cannot chown on $name"
2814 touch $name || error "(7.10.2) cannot touch $name"
2817 rm -f $name || error "(7.11) cannot unlink $name"
# The remaining checks concern f3, which only exists with more than 2 OSTs.
2819 [ $OSTCOUNT -le 2 ] && return
#
# --- Checks (8.x): old f3, last stripe missing, so 2 stripes and no hole ---
2822 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
2824 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2825 echo "Check $name, which contains the old f3's stripe0 and stripe1"
2827 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
2829 pattern=0x$($LFS getstripe -L $name)
2830 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2831 error "(8.2) NOT expect pattern flag hole, but got $pattern"
2833 stripes=$($LFS getstripe -c $name)
2834 # LFSCK does not know the old f3 had 3 stripes.
2835 # It only tries to find as much as possible.
2836 # The stripe count depends on the last stripe's offset.
2837 [ $stripes -eq 2 ] ||
2838 error "(8.3) expect the stripe count is 2, but got $stripes"
# Expected size is two full stripes of 256 blocks each.
2840 size=$(stat $name | awk '/Size:/ { print $2 }')
2842 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
2843 error "(8.4) expect the size $((4096 * 512)), but got $size"
2845 cat $name > /dev/null || error "(8.5) cannot read $name"
2847 echo "dummy" >> $name || error "(8.6) cannot write $name"
2849 chown $RUNAS_ID:$RUNAS_GID $name ||
2850 error "(8.7) cannot chown on $name"
2852 touch $name || error "(8.8) cannot touch $name"
2854 rm -f $name || error "(8.9) cannot unlink $name"
2856 run_test 20 "Handle the orphan with dummy LOV EA slot properly"
# Test 21 (registered by the run_test line at the end of this block).
# Starts LFSCK on MDT0000 without a "-t" type and checks that both the
# namespace and the layout component report 'scanning-phase1', then stops
# LFSCK again. Skipped when the MDS version is older than 2.5.59.
# NOTE(review): this listing prefixes each line with its original line
# number and does not show the function header or closing lines.
2859 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
2860 skip "ignore the test if MDS is older than 2.5.59" && return
2862 check_mount_and_prep
2863 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
# "-s 1" is passed so the status can be sampled while LFSCK is still in
# phase 1 - presumably a speed limit; confirm against the lctl docs.
2865 echo "Start all LFSCK components by default (-s 1)"
2866 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
2867 error "Fail to start LFSCK"
# SHOW_NAMESPACE and SHOW_LAYOUT are defined outside this block.
2869 echo "namespace LFSCK should be in 'scanning-phase1' status"
2870 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
2871 [ "$STATUS" == "scanning-phase1" ] ||
2872 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
2874 echo "layout LFSCK should be in 'scanning-phase1' status"
2875 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
2876 [ "$STATUS" == "scanning-phase1" ] ||
2877 error "Expect layout 'scanning-phase1', but got '$STATUS'"
2879 echo "Stop all LFSCK components by default"
2880 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
2881 error "Fail to stop LFSCK"
2883 run_test 21 "run all LFSCK components by default"
# Test 22a (registered by the run_test line at the end of this block).
# Needs at least 2 MDSes. foo/dummy is created while
# OBD_FAIL_LFSCK_BAD_PARENT is set, then the "guard" directory is removed.
# A namespace LFSCK (-A -r) must report exactly one
# unmatched_pairs_repaired, after which "ls" on foo/dummy must succeed.
# NOTE(review): this listing prefixes each line with its original line
# number and does not show the function header or closing lines.
2886 [ $MDSCOUNT -lt 2 ] &&
2887 skip "We need at least 2 MDSes for this test" && return
# NOTE(review): in the two echo lines containing "..", the inner double
# quotes end and reopen the string, so the output shows .. without quotes.
2890 echo "The parent_A references the child directory via some name entry,"
2891 echo "but the child directory back references another parent_B via its"
2892 echo "".." name entry. The parent_B does not exist. Then the namespace"
2893 echo "LFSCK will repair the child directory's ".." name entry."
2896 check_mount_and_prep
2898 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2899 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
# The injection is active only while foo/dummy is created on MDT0.
2901 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2902 echo "The dummy's dotdot name entry references the guard."
2903 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2904 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2905 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2906 error "(3) Fail to mkdir on MDT0"
2907 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing guard makes the parent that dummy refers to non-existent.
2909 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
2911 echo "Trigger namespace LFSCK to repair unmatched pairs"
2912 $START_NAMESPACE -A -r ||
2913 error "(5) Fail to start LFSCK for namespace"
# The trailing 6 is presumably the step number used in the helper's error
# message (the explicit errors here jump from (5) to (7)) - TODO confirm.
2915 wait_all_targets_blocked namespace completed 6
2917 local repaired=$($SHOW_NAMESPACE |
2918 awk '/^unmatched_pairs_repaired/ { print $2 }')
2919 [ $repaired -eq 1 ] ||
2920 error "(7) Fail to repair unmatched pairs: $repaired"
2922 echo "'ls' should success after namespace LFSCK repairing"
2923 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
2924 error "(8) ls should success."
2926 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Test 22b (registered by the run_test line at the end of this block).
# Needs at least 2 MDSes. foo/dummy is created while
# OBD_FAIL_LFSCK_BAD_PARENT is set; unlike test 22a, "guard" is kept.
# Before LFSCK, fid2path on dummy's FID must NOT return the real path.
# After a namespace LFSCK (-A -r) reports one unmatched_pairs_repaired,
# fid2path must return it.
# NOTE(review): this listing prefixes each line with its original line
# number and does not show the function header or closing lines.
2929 [ $MDSCOUNT -lt 2 ] &&
2930 skip "We need at least 2 MDSes for this test" && return
# NOTE(review): in the two echo lines containing "..", the inner double
# quotes end and reopen the string, so the output shows .. without quotes.
2933 echo "The parent_A references the child directory via the name entry_B,"
2934 echo "but the child directory back references another parent_C via its"
2935 echo "".." name entry. The parent_C exists, but there is no the name"
2936 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
2937 echo "the child directory's ".." name entry and its linkEA."
2940 check_mount_and_prep
2942 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2943 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
# The injection is active only while foo/dummy is created on MDT0.
2945 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2946 echo "and bad linkEA. The dummy's dotdot name entry references the"
2947 echo "guard. The dummy's linkEA references n non-exist name entry."
2948 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2949 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2950 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2951 error "(3) Fail to mkdir on MDT0"
2952 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Precondition: resolving dummy's FID must not give back its real path.
2954 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
2955 echo "fid2path should NOT work on the dummy's FID $dummyfid"
2956 local dummyname=$($LFS fid2path $DIR $dummyfid)
2957 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
2958 error "(4) fid2path works unexpectedly."
2960 echo "Trigger namespace LFSCK to repair unmatched pairs"
2961 $START_NAMESPACE -A -r ||
2962 error "(5) Fail to start LFSCK for namespace"
# The trailing 6 is presumably the step number used in the helper's error
# message (the explicit errors here jump from (5) to (7)) - TODO confirm.
2964 wait_all_targets_blocked namespace completed 6
2966 local repaired=$($SHOW_NAMESPACE |
2967 awk '/^unmatched_pairs_repaired/ { print $2 }')
2968 [ $repaired -eq 1 ] ||
2969 error "(7) Fail to repair unmatched pairs: $repaired"
# After the repair the same FID must resolve to the real path.
# NOTE(review): dummyname was already declared local above; the second
# 'local' is redundant.
2971 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
2972 local dummyname=$($LFS fid2path $DIR $dummyfid)
2973 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
2974 error "(8) fid2path does not work"
2976 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a: namespace LFSCK handles a dangling name entry whose MDT-object
# is gone. Needs >= 2 MDTs.
# Flow: create d0 on MDT0 and d0/d1 on MDT1, rmdir d1 while fail_loc 0x1620
# (OBD_FAIL_LFSCK_DANGLING2) is set on mds2, and check that 'ls' fails.
# A first LFSCK run (-A -r) must report dangling_repaired == 1 while 'ls'
# still fails; a second run with -C must report 1 again and 'ls' must work.
2979 [ $MDSCOUNT -lt 2 ] &&
2980 skip "We need at least 2 MDSes for this test" && return
2983 echo "The name entry is there, but the MDT-object for such name "
2984 echo "entry does not exist. The namespace LFSCK should find out "
2985 echo "and repair the inconsistency as required."
2988 check_mount_and_prep
2990 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
2991 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
2993 echo "Inject failure stub on MDT1 to simulate dangling name entry"
# The fail_loc is set on mds2 only for the duration of the rmdir.
2994 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
2995 do_facet mds2 $LCTL set_param fail_loc=0x1620
2996 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
2997 do_facet mds2 $LCTL set_param fail_loc=0
2999 echo "'ls' should fail because of dangling name entry"
3000 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3002 echo "Trigger namespace LFSCK to find out dangling name entry"
3003 $START_NAMESPACE -A -r ||
3004 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing number is presumably the step label the helper
# reports on failure (the error labels here skip 6) - confirm in the helper.
3006 wait_all_targets_blocked namespace completed 6
3008 local repaired=$($SHOW_NAMESPACE |
3009 awk '/^dangling_repaired/ { print $2 }')
3010 [ $repaired -eq 1 ] ||
3011 error "(7) Fail to repair dangling name entry: $repaired"
3013 echo "'ls' should fail because not re-create MDT-object by default"
3014 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
# Second pass adds -C; per the messages around it this is the run that is
# expected to re-create the missing MDT-object.
3016 echo "Trigger namespace LFSCK again to repair dangling name entry"
3017 $START_NAMESPACE -A -r -C ||
3018 error "(9) Fail to start LFSCK for namespace"
3020 wait_all_targets_blocked namespace completed 10
3022 repaired=$($SHOW_NAMESPACE |
3023 awk '/^dangling_repaired/ { print $2 }')
3024 [ $repaired -eq 1 ] ||
3025 error "(11) Fail to repair dangling name entry: $repaired"
3027 echo "'ls' should success after namespace LFSCK repairing"
3028 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3030 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b: a hard-link name entry is made to reference a non-existent
# object; LFSCK run with -C first re-creates that object, then replaces it
# with the real one.
# Flow: create d0 with files f0 ("dummy") and f1 ("dead"), take the second
# ':'-separated field of f1's FID as OID, hard link f0 to foo while
# fail_val=$OID and fail_loc 0x1621 (OBD_FAIL_LFSCK_DANGLING3) are set, then
# unlink f1. After LFSCK completes, dangling_repaired and
# multiple_linked_repaired must both be 1 and foo must contain "dummy".
3034 echo "The objectA has multiple hard links, one of them corresponding"
3035 echo "to the name entry_B. But there is something wrong for the name"
3036 echo "entry_B and cause entry_B to references non-exist object_C."
3037 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3038 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3039 echo "comes to the second-stage scanning, it will find that the"
3040 echo "former re-creating object_C is not proper, and will try to"
3041 echo "replace the object_C with the real object_A."
3044 check_mount_and_prep
3046 [[ -d $MOUNT/.lustre/lost+found/MDT0000 ]] || {
3047 # Trigger LFSCK firstly, that will generate the
3048 # .lustre/lost+found/MDTxxxx in advance to avoid
3049 # reusing the local object for the dangling name
3051 $START_NAMESPACE -r ||
3052 error "(0) Fail to start LFSCK for namespace"
3054 wait_all_targets_blocked namespace completed 0.1
# The bare path2fid calls below only print the FIDs into the test log.
3057 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3058 $LFS path2fid $DIR/$tdir/d0
3060 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3061 $LFS path2fid $DIR/$tdir/d0/f0
3063 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3064 $LFS path2fid $DIR/$tdir/d0/f1
# OID is the second ':'-separated field of f1's FID, converted to decimal
# by printf %d so it can be compared numerically and passed as fail_val.
3066 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3067 OID=$(printf %d $OID)
3069 if [ $OID -eq 1 ]; then
3070 # To guarantee that the f0 and f1 are in the same FID seq
3071 rm -f $DIR/$tdir/d0/f0 ||
3072 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3073 echo "dummy" > $DIR/$tdir/d0/f0 ||
3074 error "(3.2) Fail to touch on MDT0"
3075 $LFS path2fid $DIR/$tdir/d0/f0
3078 echo "Inject failure stub on MDT0 to simulate dangling name entry"
# fail_val carries f1's OID while the hard link foo is created; both
# settings are cleared right afterwards.
3079 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3080 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3081 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3082 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3084 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3086 echo "'ls' should fail because of dangling name entry"
3087 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3088 error "(6) ls should fail."
3090 echo "Trigger namespace LFSCK to find out dangling name entry"
3091 $START_NAMESPACE -r -C ||
3092 error "(7) Fail to start LFSCK for namespace"
# Poll the status field of lfsck_namespace on the MDS until it reads
# "completed"; NOTE(review): 32 is presumably the wait limit - confirm.
3094 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3095 mdd.${MDT_DEV}.lfsck_namespace |
3096 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3098 error "(8) unexpected status"
3101 local repaired=$($SHOW_NAMESPACE |
3102 awk '/^dangling_repaired/ { print $2 }')
3103 [ $repaired -eq 1 ] ||
3104 error "(9) Fail to repair dangling name entry: $repaired"
3106 repaired=$($SHOW_NAMESPACE |
3107 awk '/^multiple_linked_repaired/ { print $2 }')
3108 [ $repaired -eq 1 ] ||
3109 error "(10) Fail to drop the former created object: $repaired"
# foo must now read back the content written to f0.
3111 local data=$(cat $DIR/$tdir/d0/foo)
3112 [ "$data" == "dummy" ] ||
3113 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3115 run_test 23b "LFSCK can repair dangling name entry (2)"
# Test 23c: same dangling hard-link setup as 23b, but the object that LFSCK
# re-creates is modified before LFSCK finishes, so it must not be replaced.
# Flow: build d0/f0, d0/f1 and the bad hard link foo under fail_loc 0x1621,
# set fail_loc 0x1602 (OBD_FAIL_LFSCK_DELAY3) with fail_val=10, start LFSCK
# with -r -C, wait until foo is visible with size 0, append to it, clear the
# fail_loc and wait for "completed". dangling_repaired must be 1 and foo
# must not contain "dummy".
3119 echo "The objectA has multiple hard links, one of them corresponding"
3120 echo "to the name entry_B. But there is something wrong for the name"
3121 echo "entry_B and cause entry_B to references non-exist object_C."
3122 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3123 echo "as dangling, and re-create the lost object_C. And then others"
3124 echo "modified the re-created object_C. When the LFSCK comes to the"
3125 echo "second-stage scanning, it will find that the former re-creating"
3126 echo "object_C maybe wrong and try to replace the object_C with the"
3127 echo "real object_A. But because object_C has been modified, so the"
3128 echo "LFSCK cannot replace it."
3131 start_full_debug_logging
3133 check_mount_and_prep
3135 [[ -d $MOUNT/.lustre/lost+found/MDT0000 ]] || {
3136 # Trigger LFSCK firstly, that will generate the
3137 # .lustre/lost+found/MDTxxxx in advance to avoid
3138 # reusing the local object for the dangling name
3140 $START_NAMESPACE -r ||
3141 error "(0) Fail to start LFSCK for namespace"
3143 wait_all_targets_blocked namespace completed 0.1
# The bare path2fid calls below only print the FIDs into the test log.
3146 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3147 $LFS path2fid $DIR/$tdir/d0
3149 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3150 $LFS path2fid $DIR/$tdir/d0/f0
3152 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3153 $LFS path2fid $DIR/$tdir/d0/f1
# OID is the second ':'-separated field of f1's FID, converted to decimal.
3155 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3156 OID=$(printf %d $OID)
3158 if [ $OID -eq 1 ]; then
3159 # To guarantee that the f0 and f1 are in the same FID seq
3160 rm -f $DIR/$tdir/d0/f0 ||
3161 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3162 echo "dummy" > $DIR/$tdir/d0/f0 ||
3163 error "(3.2) Fail to touch on MDT0"
3164 $LFS path2fid $DIR/$tdir/d0/f0
3167 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3168 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3169 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3170 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3171 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3173 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3175 echo "'ls' should fail because of dangling name entry"
3176 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3177 error "(6) ls should fail."
# This fail_loc stays set while LFSCK runs and is cleared only after foo
# has been modified below.
3179 #define OBD_FAIL_LFSCK_DELAY3 0x1602
3180 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3182 echo "Trigger namespace LFSCK to find out dangling name entry"
3183 $START_NAMESPACE -r -C ||
3184 error "(7) Fail to start LFSCK for namespace"
# Wait until foo can be stat'ed with Size 0, i.e. the re-created object.
3186 wait_update_facet client "stat $DIR/$tdir/d0/foo |
3187 awk '/Size/ { print \\\$2 }'" "0" $LTIME || {
# On timeout, dump the state of the file that was being polled. This test
# never creates $DIR/$tdir/guard, so stat'ing that path could only fail.
3188 stat $DIR/$tdir/d0/foo
3190 error "(8) unexpected size"
# Modify the re-created object so that LFSCK should no longer replace it.
3193 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
3194 cancel_lru_locks osc
3196 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3197 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3198 mdd.${MDT_DEV}.lfsck_namespace |
3199 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3201 error "(10) unexpected status"
3204 stop_full_debug_logging
3206 local repaired=$($SHOW_NAMESPACE |
3207 awk '/^dangling_repaired/ { print $2 }')
3208 [ $repaired -eq 1 ] ||
3209 error "(11) Fail to repair dangling name entry: $repaired"
# foo must NOT have been replaced by f0's object, so it must not read back
# exactly "dummy".
3211 local data=$(cat $DIR/$tdir/d0/foo)
3212 [ "$data" != "dummy" ] ||
3213 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
3215 run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24: two MDT-objects carry a linkEA entry for the same name, but the
# name entry references only one of them. Needs >= 2 MDTs.
# Flow: create d0 on MDT1 with sub-directories guard and dummy, create
# guard/foo, then create and remove dummy/foo on MDT0 while fail_loc 0x1622
# (OBD_FAIL_LFSCK_MUL_REF) is set. After LFSCK, multiple_referenced_repaired
# must be 1 and an entry named "$cfid-$pfid-D-0" must exist under
# .lustre/lost+found/MDT0000.
3218 [ $MDSCOUNT -lt 2 ] &&
3219 skip "We need at least 2 MDSes for this test" && return
3222 echo "Two MDT-objects back reference the same name entry via their"
3223 echo "each own linkEA entry, but the name entry only references one"
3224 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
3225 echo "for the MDT-object that is not recognized. If such MDT-object"
3226 echo "has no other linkEA entry after the removing, then the LFSCK"
3227 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
3230 check_mount_and_prep
3232 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3234 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
3235 $LFS path2fid $DIR/$tdir/d0/guard
3237 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
3238 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid is the parent FID expected in the orphan's name further down.
# NOTE(review): the two assignments look like the two arms of this backend
# type check (guard's FID vs. dummy's FID), but no else/fi is visible here -
# confirm against the full file.
3241 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
3242 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
3244 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
3247 touch $DIR/$tdir/d0/guard/foo ||
3248 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
3250 echo "Inject failure stub on MDT0 to simulate the case that"
3251 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
3252 echo "that references $DIR/$tdir/d0/guard/foo."
3253 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
3254 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
3255 echo "there with the same linkEA entry as another MDT-object"
3256 echo "$DIR/$tdir/d0/guard/foo has"
# The fail_loc covers both the creation and the removal of dummy/foo; cfid
# records its FID before the name entry is removed.
3258 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
3259 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
3260 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
3261 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
3262 $LFS path2fid $DIR/$tdir/d0/dummy/foo
3263 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
3264 rmdir $DIR/$tdir/d0/dummy/foo ||
3265 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
3266 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3268 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
3269 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
3270 error "(6) stat successfully unexpectedly"
3272 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
3273 $START_NAMESPACE -A -r ||
3274 error "(7) Fail to start LFSCK for namespace"
# NOTE(review): the trailing number is presumably the step label the helper
# reports on failure (the error labels here skip 8) - confirm in the helper.
3276 wait_all_targets_blocked namespace completed 8
3278 local repaired=$($SHOW_NAMESPACE |
3279 awk '/^multiple_referenced_repaired/ { print $2 }')
3280 [ $repaired -eq 1 ] ||
3281 error "(9) Fail to repair multiple referenced name entry: $repaired"
3283 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3284 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3285 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The orphan is looked up by a name built from the child and parent FIDs.
3287 local cname="$cfid-$pfid-D-0"
3288 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
3289 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
3291 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25: namespace LFSCK fixes a wrong file type stored in a name entry.
# Skipped unless the MDS backend is ldiskfs.
# Flow: create d0 on MDT0, create d0/foo while fail_loc 0x1623
# (OBD_FAIL_LFSCK_BAD_TYPE) is set, run LFSCK, wait for status "completed",
# then expect bad_file_type_repaired == 1 and a working 'ls' of d0.
3294 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3295 skip "Only support to inject failure on ldiskfs" && return
3298 echo "The file type in the name entry does not match the file type"
3299 echo "claimed by the referenced object. Then the LFSCK will update"
3300 echo "the file type in the name entry."
3303 check_mount_and_prep
3305 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3307 echo "Inject failure stub on MDT0 to simulate the case that"
3308 echo "the file type stored in the name entry is wrong."
# The fail_loc is only active while foo is created, then cleared.
3310 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
3311 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
3312 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
3313 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3315 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
3316 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Poll the status field of lfsck_namespace on the MDS until it reads
# "completed"; NOTE(review): 32 is presumably the wait limit - confirm.
3318 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3319 mdd.${MDT_DEV}.lfsck_namespace |
3320 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3322 error "(4) unexpected status"
3325 local repaired=$($SHOW_NAMESPACE |
3326 awk '/^bad_file_type_repaired/ { print $2 }')
3327 [ $repaired -eq 1 ] ||
3328 error "(5) Fail to repair bad file type in name entry: $repaired"
3330 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
3332 run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a: namespace LFSCK re-adds a lost local name entry.
# Flow: create d0 on MDT0 with file foo and a hard link dummy, record foo's
# FID, unlink foo while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is
# set, and check that 'ls foo' fails. After LFSCK, lost_dirent_repaired must
# be 1, 'ls foo' must work and foo must still have the recorded FID.
3336 echo "The local name entry back referenced by the MDT-object is lost."
3337 echo "The namespace LFSCK will add the missing local name entry back"
3338 echo "to the normal namespace."
3341 check_mount_and_prep
3343 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3344 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3345 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3347 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3348 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3350 echo "Inject failure stub on MDT0 to simulate the case that"
3351 echo "foo's name entry will be removed, but the foo's object"
3352 echo "and its linkEA are kept in the system."
# The fail_loc is only active while foo is unlinked, then cleared.
3354 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3355 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3356 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3357 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3359 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3360 error "(5) 'ls' should fail"
3362 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3363 $START_NAMESPACE -r -A ||
3364 error "(6) Fail to start LFSCK for namespace"
# NOTE(review): the trailing number is presumably the step label the helper
# reports on failure (the error labels here skip 7) - confirm in the helper.
3366 wait_all_targets_blocked namespace completed 7
3368 local repaired=$($SHOW_NAMESPACE |
3369 awk '/^lost_dirent_repaired/ { print $2 }')
3370 [ $repaired -eq 1 ] ||
3371 error "(8) Fail to repair lost dirent: $repaired"
3373 ls -ail $DIR/$tdir/d0/foo ||
3374 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must reference the same object as before.
3376 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3377 [ "$foofid" == "$foofid2" ] ||
3378 error "(10) foo's FID changed: $foofid, $foofid2"
3380 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b: namespace LFSCK re-adds a lost remote name entry.
# Needs >= 2 MDTs.
# Flow: create d0 on MDT1 and directory d0/foo on MDT0, record foo's FID,
# rmdir foo while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set, and
# check that 'ls foo' fails. After LFSCK, lost_dirent_repaired must be 1,
# 'ls foo' must work and foo must still have the recorded FID.
3383 [ $MDSCOUNT -lt 2 ] &&
3384 skip "We need at least 2 MDSes for this test" && return
3387 echo "The remote name entry back referenced by the MDT-object is lost."
3388 echo "The namespace LFSCK will add the missing remote name entry back"
3389 echo "to the normal namespace."
3392 check_mount_and_prep
3394 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3395 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
3396 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3398 echo "Inject failure stub on MDT0 to simulate the case that"
3399 echo "foo's name entry will be removed, but the foo's object"
3400 echo "and its linkEA are kept in the system."
# The fail_loc is only active while foo is removed, then cleared.
3402 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3403 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3404 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3405 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3407 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3408 error "(4) 'ls' should fail"
3410 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3411 $START_NAMESPACE -r -A ||
3412 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing number is presumably the step label the helper
# reports on failure (the error labels here skip 6) - confirm in the helper.
3414 wait_all_targets_blocked namespace completed 6
3416 local repaired=$($SHOW_NAMESPACE |
3417 awk '/^lost_dirent_repaired/ { print $2 }')
3418 [ $repaired -eq 1 ] ||
3419 error "(7) Fail to repair lost dirent: $repaired"
3421 ls -ail $DIR/$tdir/d0/foo ||
3422 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must reference the same object as before.
3424 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3425 [ "$foofid" == "$foofid2" ] ||
3426 error "(9) foo's FID changed: $foofid, $foofid2"
3428 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a: namespace LFSCK re-creates a lost local parent directory as an
# orphan.
# Flow: create d0 on MDT0 with file foo and hard link dummy, unlink both
# names while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set, remove
# d0 and check that 'ls d0' fails. After LFSCK, lost_dirent_repaired must be
# 1 and .lustre/lost+found/MDT0000 must contain an entry matching *-P-*.
3432 echo "The local parent referenced by the MDT-object linkEA is lost."
3433 echo "The namespace LFSCK will re-create the lost parent as orphan."
3436 check_mount_and_prep
3438 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3439 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3440 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3441 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3443 echo "Inject failure stub on MDT0 to simulate the case that"
3444 echo "foo's name entry will be removed, but the foo's object"
3445 echo "and its linkEA are kept in the system. And then remove"
3446 echo "another hard link and the parent directory."
# The fail_loc covers the removal of both names, then is cleared.
3448 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3449 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3450 rm -f $DIR/$tdir/d0/foo ||
3451 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3452 rm -f $DIR/$tdir/d0/dummy ||
3453 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
3454 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3456 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
3457 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
3459 echo "Trigger namespace LFSCK to repair the lost parent"
3460 $START_NAMESPACE -r -A ||
3461 error "(6) Fail to start LFSCK for namespace"
# NOTE(review): the trailing number is presumably the step label the helper
# reports on failure (the error labels here skip 7) - confirm in the helper.
3463 wait_all_targets_blocked namespace completed 7
3465 local repaired=$($SHOW_NAMESPACE |
3466 awk '/^lost_dirent_repaired/ { print $2 }')
3467 [ $repaired -eq 1 ] ||
3468 error "(8) Fail to repair lost dirent: $repaired"
3470 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3471 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3472 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3474 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell would expand it against the current directory first. cname is kept
# local so it does not leak into other tests.
3476 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
3477 [ ! -z "$cname" ] ||
3478 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
3480 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b: namespace LFSCK re-creates a lost remote parent directory as an
# orphan. Needs >= 2 MDTs.
# Flow: create d0 on MDT1 and directory d0/foo on MDT0, rmdir foo while
# fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set, remove d0 and check
# that 'ls d0' fails. After LFSCK, lost_dirent_repaired must be 1 and
# .lustre/lost+found/MDT0001 must contain an entry matching *-P-*.
3483 [ $MDSCOUNT -lt 2 ] &&
3484 skip "We need at least 2 MDSes for this test" && return
3487 echo "The remote parent referenced by the MDT-object linkEA is lost."
3488 echo "The namespace LFSCK will re-create the lost parent as orphan."
3491 check_mount_and_prep
3493 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3494 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Prints d0's FID into the test log; the value is not used further.
3496 $LFS path2fid $DIR/$tdir/d0
3498 echo "Inject failure stub on MDT0 to simulate the case that"
3499 echo "foo's name entry will be removed, but the foo's object"
3500 echo "and its linkEA are kept in the system. And then remove"
3501 echo "the parent directory."
# The fail_loc is only active while foo is removed, then cleared.
3503 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3504 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3505 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3506 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3508 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
3509 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
3511 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3512 $START_NAMESPACE -r -A ||
3513 error "(6) Fail to start LFSCK for namespace"
# NOTE(review): the trailing number is presumably the step label the helper
# reports on failure (the error labels here skip 7) - confirm in the helper.
3515 wait_all_targets_blocked namespace completed 7
3517 local repaired=$($SHOW_NAMESPACE |
3518 awk '/^lost_dirent_repaired/ { print $2 }')
3519 [ $repaired -eq 1 ] ||
3520 error "(8) Fail to repair lost dirent: $repaired"
3522 ls -ail $MOUNT/.lustre/lost+found/
3524 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
3525 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
3526 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
3528 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell would expand it against the current directory first. cname is kept
# local so it does not leak into other tests.
3530 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
3531 [ ! -z "$cname" ] ||
3532 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
3534 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28: orphan handling is skipped on an MDT that failed during the
# first LFSCK stage, while other MDTs are still processed. Needs >= 2 MDTs.
# Flow: create d1 on MDT0 with a1/a2 on MDT1, and d2 on MDT1 with a1/a2 on
# MDT0; remove d1/a1 and d2/a2 while fail_loc 0x1624 is set on mds1 and
# mds2. Then set fail_loc 0x161c (OBD_FAIL_LFSCK_BAD_NETWORK) on mds2 and
# run LFSCK: mds1 must end "partial" with 0 lost dirents repaired, mds2
# "completed" with 1. A second run without the fail_loc must repair 1 on
# mds1 and 0 on mds2.
3537 [ $MDSCOUNT -lt 2 ] &&
3538 skip "The test needs at least 2 MDTs" && return
3541 echo "The target name entry is lost. The LFSCK should insert the"
3542 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
3543 echo "the MDT (on which the orphan MDT-object resides) has ever"
3544 echo "failed to respond some name entry verification during the"
3545 echo "first stage-scanning, then the LFSCK should skip to handle"
3546 echo "orphan MDT-object on this MDT. But other MDTs should not"
# Cross-MDT layout: each parent lives on one MDT, its children on the other.
# None of these mkdir calls is checked for failure.
3550 check_mount_and_prep
3551 $LFS mkdir -i 0 $DIR/$tdir/d1
3552 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
3553 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
3555 $LFS mkdir -i 1 $DIR/$tdir/d2
3556 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
3557 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
3559 echo "Inject failure stub on MDT0 to simulate the case that"
3560 echo "d1/a1's name entry will be removed, but the d1/a1's object"
3561 echo "and its linkEA are kept in the system. And the case that"
3562 echo "d2/a2's name entry will be removed, but the d2/a2's object"
3563 echo "and its linkEA are kept in the system."
# The fail_loc is set on both MDSes while the two names are removed.
3565 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3566 do_facet mds1 $LCTL set_param fail_loc=0x1624
3567 do_facet mds2 $LCTL set_param fail_loc=0x1624
3568 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
3569 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
3570 do_facet mds1 $LCTL set_param fail_loc=0
3571 do_facet mds2 $LCTL set_param fail_loc=0
3573 cancel_lru_locks mdc
3574 cancel_lru_locks osc
3576 echo "Inject failure, to simulate the MDT0 fail to handle"
3577 echo "MDT1 LFSCK request during the first-stage scanning."
# This fail_loc is set on mds2 and stays active until both status waits
# below have finished.
3578 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
3579 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
3581 echo "Trigger namespace LFSCK on all devices to find out orphan object"
3582 $START_NAMESPACE -r -A ||
3583 error "(3) Fail to start LFSCK for namespace"
# Each MDS is polled separately through its own lfsck_namespace parameter;
# NOTE(review): 32 is presumably the wait limit - confirm.
3585 wait_update_facet mds1 "$LCTL get_param -n \
3586 mdd.$(facet_svc mds1).lfsck_namespace |
3587 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
3588 error "(4) mds1 is not the expected 'partial'"
3591 wait_update_facet mds2 "$LCTL get_param -n \
3592 mdd.$(facet_svc mds2).lfsck_namespace |
3593 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3594 error "(5) mds2 is not the expected 'completed'"
3597 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# First run: nothing repaired on mds1, one lost dirent repaired on mds2.
3599 local repaired=$(do_facet mds1 $LCTL get_param -n \
3600 mdd.$(facet_svc mds1).lfsck_namespace |
3601 awk '/^lost_dirent_repaired/ { print $2 }')
3602 [ $repaired -eq 0 ] ||
3603 error "(6) Expect 0 fixed on mds1, but got: $repaired"
3605 repaired=$(do_facet mds2 $LCTL get_param -n \
3606 mdd.$(facet_svc mds2).lfsck_namespace |
3607 awk '/^lost_dirent_repaired/ { print $2 }')
3608 [ $repaired -eq 1 ] ||
3609 error "(7) Expect 1 fixed on mds2, but got: $repaired"
3611 echo "Trigger namespace LFSCK on all devices again to cleanup"
3612 $START_NAMESPACE -r -A ||
3613 error "(8) Fail to start LFSCK for namespace"
3615 wait_all_targets_blocked namespace completed 9
# Second run: the remaining lost dirent is repaired on mds1, none on mds2.
3617 local repaired=$(do_facet mds1 $LCTL get_param -n \
3618 mdd.$(facet_svc mds1).lfsck_namespace |
3619 awk '/^lost_dirent_repaired/ { print $2 }')
3620 [ $repaired -eq 1 ] ||
3621 error "(10) Expect 1 fixed on mds1, but got: $repaired"
3623 repaired=$(do_facet mds2 $LCTL get_param -n \
3624 mdd.$(facet_svc mds2).lfsck_namespace |
3625 awk '/^lost_dirent_repaired/ { print $2 }')
3626 [ $repaired -eq 0 ] ||
3627 error "(11) Expect 0 fixed on mds2, but got: $repaired"
3629 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a: namespace LFSCK lowers an nlink count that is too large.
# Flow: create d0/foo on MDT0, hard link it as h1 while fail_loc 0x1625
# (OBD_FAIL_LFSCK_MORE_NLINK) is set, and confirm that stat reports 3 links.
# After LFSCK, nlinks_repaired must be 1 and stat must report 2 links.
3633 echo "The object's nlink attribute is larger than the object's known"
3634 echo "name entries count. The LFSCK will repair the object's nlink"
3635 echo "attribute to match the known name entries count"
3638 check_mount_and_prep
3640 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3641 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3643 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3644 echo "nlink attribute is larger than its name entries count."
# The fail_loc is only active while the hard link is created.
3646 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
3647 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
3648 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3649 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3650 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Client locks are cancelled before each stat of the link count below.
3652 cancel_lru_locks mdc
3653 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3654 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
3656 echo "Trigger namespace LFSCK to repair the nlink count"
3657 $START_NAMESPACE -r -A ||
3658 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing number is presumably the step label the helper
# reports on failure (the error labels here skip 6) - confirm in the helper.
3660 wait_all_targets_blocked namespace completed 6
3662 local repaired=$($SHOW_NAMESPACE |
3663 awk '/^nlinks_repaired/ { print $2 }')
3664 [ $repaired -eq 1 ] ||
3665 error "(7) Fail to repair nlink count: $repaired"
3667 cancel_lru_locks mdc
3668 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3669 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3671 run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b: namespace LFSCK raises an nlink count that is too small.
# Flow: create d0/foo on MDT0, hard link it as h1 while fail_loc 0x1626
# (OBD_FAIL_LFSCK_LESS_NLINK) is set, and confirm that stat reports 1 link.
# After LFSCK, nlinks_repaired must be 1 and stat must report 2 links.
3675 echo "The object's nlink attribute is smaller than the object's known"
3676 echo "name entries count. The LFSCK will repair the object's nlink"
3677 echo "attribute to match the known name entries count"
3680 check_mount_and_prep
3682 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3683 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3685 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3686 echo "nlink attribute is smaller than its name entries count."
# The fail_loc is only active while the hard link is created.
3688 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
3689 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
3690 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3691 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3692 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Client locks are cancelled before each stat of the link count below.
3694 cancel_lru_locks mdc
3695 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3696 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
3698 echo "Trigger namespace LFSCK to repair the nlink count"
3699 $START_NAMESPACE -r -A ||
3700 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing number is presumably the step label the helper
# reports on failure (the error labels here skip 6) - confirm in the helper.
3702 wait_all_targets_blocked namespace completed 6
3704 local repaired=$($SHOW_NAMESPACE |
3705 awk '/^nlinks_repaired/ { print $2 }')
3706 [ $repaired -eq 1 ] ||
3707 error "(7) Fail to repair nlink count: $repaired"
3709 cancel_lru_locks mdc
3710 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3711 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3713 run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c: LFSCK must not touch the nlink count when the linkEA cannot
# hold all of the object's names.
# Flow: create d0/foo with hard link h1, then add hard link h2 while
# fail_loc 0x1627 (OBD_FAIL_LFSCK_LINKEA_OVERFLOW) is set. stat must report
# 3 links while fid2path lists only 2 paths. After LFSCK, nlinks_repaired
# must be 0 and both counts must be unchanged.
3717 echo "There are too many hard links to the object, and exceeds the"
3718 echo "object's linkEA limitation, as to NOT all the known name entries"
3719 echo "will be recorded in the linkEA. Under such case, LFSCK should"
3720 echo "skip the nlink verification for this object."
3723 check_mount_and_prep
3725 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3726 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3727 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3728 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3730 echo "Inject failure stub on MDT0 to simulate the case that"
3731 echo "foo's hard links exceed the object's linkEA limitation."
# Unlike the other nlink tests, this fail_loc is left set through the LFSCK
# run and is only cleared after the wait further down.
3733 #define OBD_FAIL_LFSCK_LINKEA_OVERFLOW 0x1627
3734 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1627
3735 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h2 ||
3736 error "(4) Fail to hard link to $DIR/$tdir/d0/foo"
3738 cancel_lru_locks mdc
# count1 is the link count from stat; count2 is the number of paths that
# fid2path prints for foo's FID.
3740 local count1=$(stat --format=%h $DIR/$tdir/d0/foo)
3741 [ $count1 -eq 3 ] || error "(5) Stat failure: $count1"
3743 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3744 $LFS fid2path $DIR $foofid
3745 local count2=$($LFS fid2path $DIR $foofid | wc -l)
3746 [ $count2 -eq 2 ] || error "(6) Fail to inject error: $count2"
3748 echo "Trigger namespace LFSCK to repair the nlink count"
3749 $START_NAMESPACE -r -A ||
3750 error "(7) Fail to start LFSCK for namespace"
# NOTE(review): the trailing number is presumably the step label the helper
# reports on failure (the error labels here skip 8) - confirm in the helper.
3752 wait_all_targets_blocked namespace completed 8
3754 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3755 local repaired=$($SHOW_NAMESPACE |
3756 awk '/^nlinks_repaired/ { print $2 }')
3757 [ $repaired -eq 0 ] ||
3758 error "(9) Repair nlink count unexpcetedly: $repaired"
3760 cancel_lru_locks mdc
# Both counts must be the same as before the LFSCK run.
3762 count1=$(stat --format=%h $DIR/$tdir/d0/foo)
3763 [ $count1 -eq 3 ] || error "(10) Stat failure: $count1"
3765 count2=$($LFS fid2path $DIR $foofid | wc -l)
3766 [ $count2 -eq 2 ] ||
3767 error "(11) Repaired something unexpectedly: $count2"
3769 run_test 29c "Not verify nlink attr if hark links exceed linkEA limitation"
# Test 30: namespace LFSCK moves orphans out of the backend /lost+found.
# Skipped unless the MDS backend is ldiskfs.
# Flow: create foo/f0, then foo/d0 while fail_loc 0x161d
# (OBD_FAIL_LFSCK_NO_LINKEA) is set, then d0/f1 and d0/d1. Remove d1, f1, d0
# and f0 while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set. Unmount
# the client, stop the MDS, run e2fsck -y on the MDT device and restart the
# MDS. After LFSCK, local_lost_found_moved must be >= 4, foo/f0 must be
# back, and a *-*-D-* entry under .lustre/lost+found/MDT0000 must contain
# d1 and f1.
3772 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3773 skip "Only support backend /lost+found for ldiskfs" && return
3776 echo "The namespace LFSCK will move the orphans from backend"
3777 echo "/lost+found directory to normal client visible namespace"
3778 echo "or to global visible .lustre/lost+found/MDTxxxx/ directory"
3781 check_mount_and_prep
3783 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
3784 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
3786 echo "Inject failure stub on MDT0 to simulate the case that"
3787 echo "directory d0 has no linkEA entry, then the LFSCK will"
3788 echo "move it into .lustre/lost+found/MDTxxxx/ later."
# This fail_loc is only active while d0 is created.
3790 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
3791 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
3792 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
3793 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3795 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
3796 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
3798 echo "Inject failure stub on MDT0 to simulate the case that the"
3799 echo "object's name entry will be removed, but not destroy the"
3800 echo "object. Then backend e2fsck will handle it as orphan and"
3801 echo "add them into the backend /lost+found directory."
# Four names are removed under this fail_loc: d1, f1, d0 and f0.
3803 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3804 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3805 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
3806 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
3807 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
3808 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
3809 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The client is unmounted and the MDS stopped before e2fsck runs on the MDT
# device; the MDS is then restarted with $MOUNT_OPTS_NOSCRUB.
3811 umount_client $MOUNT || error "(10) Fail to stop client!"
3813 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
3816 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
3817 error "(12) Fail to run e2fsck"
3819 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
3820 error "(13) Fail to start MDT0"
# LFSCK runs before the client is mounted again.
3822 echo "Trigger namespace LFSCK to recover backend orphans"
3823 $START_NAMESPACE -r -A ||
3824 error "(14) Fail to start LFSCK for namespace"
# NOTE(review): the trailing number is presumably the step label the helper
# reports on failure (the error labels here skip 15) - confirm in the helper.
3826 wait_all_targets_blocked namespace completed 15
3828 local repaired=$($SHOW_NAMESPACE |
3829 awk '/^local_lost_found_moved/ { print $2 }')
3830 [ $repaired -ge 4 ] ||
3831 error "(16) Fail to recover backend orphans: $repaired"
3833 mount_client $MOUNT || error "(17) Fail to start client!"
3835 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
3837 ls -ail $MOUNT/.lustre/lost+found/
3839 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
3840 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3841 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3843 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell would expand it against the current directory first. cname is kept
# local so it does not leak into other tests.
3845 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-*-D-*')
3846 [ ! -z "$cname" ] || error "(20) d0 is not recovered"
3848 stat ${cname}/d1 || error "(21) d0 is not recovered"
3849 stat ${cname}/f1 || error "(22) f1 is not recovered"
3851 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# Test 31a: a name entry stored in the wrong shard of a striped directory
# (its name hash does not match the shard) must be repaired by the namespace
# LFSCK.  Skipped unless there are at least 2 MDTs.
# NOTE(review): relies on "local"/"return", i.e. function scope; the header of
# the enclosing test function is not visible in this view - confirm.
if [ $MDSCOUNT -lt 2 ]; then
	skip "The test needs at least 2 MDTs" && return
fi

echo "For the name entry under a striped directory, if the name"
echo "hash does not match the shard, then the LFSCK will repair"
echo "the bad name entry"

check_mount_and_prep

if ! $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir; then
	error "(1) Fail to create striped directory"
fi

echo "Inject failure stub on client to simulate the case that"
echo "some name entry should be inserted into other non-first"
echo "shard, but inserted into the first shard by wrong"

#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
# The fail_loc is set locally (no do_facet) and only while the
# sub-directories are being created; it is cleared right afterwards.
$LCTL set_param fail_loc=0x1628 fail_val=0
if ! createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT; then
	error "(2) Fail to create file under striped directory"
fi
$LCTL set_param fail_loc=0 fail_val=0

echo "Trigger namespace LFSCK to repair bad name hash"
if ! $START_NAMESPACE -r -A; then
	error "(3) Fail to start LFSCK for namespace"
fi

# NOTE(review): presumably blocks until every target reports "completed";
# the trailing number looks like the error tag used on failure - confirm.
wait_all_targets_blocked namespace completed 4

# At least one bad name hash must have been counted as repaired.
local repaired=$($SHOW_NAMESPACE |
		 awk '/^name_hash_repaired/ { print $2 }')
if ! [ $repaired -ge 1 ]; then
	error "(5) Fail to repair bad name hash: $repaired"
fi

# Remount the client before checking the entries again.
umount_client $MOUNT || error "(6) umount failed"
mount_client $MOUNT || error "(7) mount failed"

# Every sub-directory must be reachable and removable after the repair.
# The loop is closed before the parent directory itself is removed.
for ((i = 0; i < $MDSCOUNT; i++)); do
	if ! stat $DIR/$tdir/striped_dir/d$i; then
		error "(8) Fail to stat d$i after LFSCK"
	fi
	if ! rmdir $DIR/$tdir/striped_dir/d$i; then
		error "(9) Fail to unlink d$i after LFSCK"
	fi
done

if ! rmdir $DIR/$tdir/striped_dir; then
	error "(10) Fail to remove the striped directory after LFSCK"
fi
run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# Test 31b: same scenario as the "bad name hash (1)" case, but the fail_val
# is 1 and the repair counter is read from mds2 instead of the first MDT.
# Skipped unless there are at least 2 MDTs.
# NOTE(review): relies on "local"/"return", i.e. function scope; the header of
# the enclosing test function is not visible in this view - confirm.
if [ $MDSCOUNT -lt 2 ]; then
	skip "The test needs at least 2 MDTs" && return
fi

echo "For the name entry under a striped directory, if the name"
echo "hash does not match the shard, then the LFSCK will repair"
echo "the bad name entry"

check_mount_and_prep

if ! $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir; then
	error "(1) Fail to create striped directory"
fi

echo "Inject failure stub on client to simulate the case that"
echo "some name entry should be inserted into other non-second"
echo "shard, but inserted into the secod shard by wrong"

#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
# The fail_loc is set locally (no do_facet) and only while the
# sub-directories are being created; it is cleared right afterwards.
$LCTL set_param fail_loc=0x1628 fail_val=1
if ! createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT; then
	error "(2) Fail to create file under striped directory"
fi
$LCTL set_param fail_loc=0 fail_val=0

echo "Trigger namespace LFSCK to repair bad name hash"
if ! $START_NAMESPACE -r -A; then
	error "(3) Fail to start LFSCK for namespace"
fi

# NOTE(review): presumably blocks until every target reports "completed";
# the trailing number looks like the error tag used on failure - confirm.
wait_all_targets_blocked namespace completed 4

# The repair counter is taken from the lfsck_namespace stats of mds2.
local repaired=$(do_facet mds2 $LCTL get_param -n \
		 mdd.$(facet_svc mds2).lfsck_namespace |
		 awk '/^name_hash_repaired/ { print $2 }')
if ! [ $repaired -ge 1 ]; then
	error "(5) Fail to repair bad name hash: $repaired"
fi

# Remount the client before checking the entries again.
umount_client $MOUNT || error "(6) umount failed"
mount_client $MOUNT || error "(7) mount failed"

# Every sub-directory must be reachable and removable after the repair.
# The loop is closed before the parent directory itself is removed.
for ((i = 0; i < $MDSCOUNT; i++)); do
	if ! stat $DIR/$tdir/striped_dir/d$i; then
		error "(8) Fail to stat d$i after LFSCK"
	fi
	if ! rmdir $DIR/$tdir/striped_dir/d$i; then
		error "(9) Fail to unlink d$i after LFSCK"
	fi
done

if ! rmdir $DIR/$tdir/striped_dir; then
	error "(10) Fail to remove the striped directory after LFSCK"
fi
run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Test 31c: the striped directory is created while fail_loc 0x1629
# (OBD_FAIL_LFSCK_LOST_MASTER_LMV) is set on $SINGLEMDS, nothing is created
# under it afterwards, and the namespace LFSCK is expected to report exactly
# one repaired striped directory.  Skipped unless there are at least 2 MDTs.
# NOTE(review): relies on "local"/"return", i.e. function scope; the header of
# the enclosing test function is not visible in this view - confirm.
if [ $MDSCOUNT -lt 2 ]; then
	skip "The test needs at least 2 MDTs" && return
fi

echo "For some reason, the master MDT-object of the striped directory"
echo "may lost its master LMV EA. If nobody created files under the"
echo "master directly after the master LMV EA lost, then the LFSCK"
echo "should re-generate the master LMV EA."

check_mount_and_prep

echo "Inject failure stub on MDT0 to simulate the case that the"
echo "master MDT-object of the striped directory lost the LMV EA."

#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
# The failure is armed only for the duration of the directory creation.
do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
if ! $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir; then
	error "(1) Fail to create striped directory"
fi
do_facet $SINGLEMDS $LCTL set_param fail_loc=0

echo "Trigger namespace LFSCK to re-generate master LMV EA"
if ! $START_NAMESPACE -r -A; then
	error "(2) Fail to start LFSCK for namespace"
fi

wait_all_targets_blocked namespace completed 3

# Exactly one striped directory must be counted as repaired.
local repaired=$($SHOW_NAMESPACE |
		 awk '/^striped_dirs_repaired/ { print $2 }')
if ! [ $repaired -eq 1 ]; then
	error "(4) Fail to re-generate master LMV EA: $repaired"
fi

umount_client $MOUNT || error "(5) umount failed"
mount_client $MOUNT || error "(6) mount failed"

# After the remount the directory must list as empty.
local empty=$(ls $DIR/$tdir/striped_dir/)
if [ -n "$empty" ]; then
	error "(7) The master LMV EA is not repaired: $empty"
fi

if ! rmdir $DIR/$tdir/striped_dir; then
	error "(8) Fail to remove the striped directory after LFSCK"
fi
run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Test 31d: like the lost-master-LMV case, but a file is created under the
# broken striped directory before the LFSCK runs.  The LFSCK must then report
# zero repaired striped directories, further creates must fail, and the
# directory must be removable once "chattr -i" has been applied.
# Skipped unless there are at least 2 MDTs.
# NOTE(review): relies on "local"/"return", i.e. function scope; the header of
# the enclosing test function is not visible in this view - confirm.
if [ $MDSCOUNT -lt 2 ]; then
	skip "The test needs at least 2 MDTs" && return
fi

echo "For some reason, the master MDT-object of the striped directory"
echo "may lost its master LMV EA. If somebody created files under the"
echo "master directly after the master LMV EA lost, then the LFSCK"
echo "should NOT re-generate the master LMV EA, instead, it should"
echo "change the broken striped dirctory as read-only to prevent"
echo "further damage"

check_mount_and_prep

echo "Inject failure stub on MDT0 to simulate the case that the"
echo "master MDT-object of the striped directory lost the LMV EA."

#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
# The failure is armed only for the duration of the directory creation.
do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
if ! $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir; then
	error "(1) Fail to create striped directory"
fi
do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0

# Remount the client, then modify the broken directory before the scan.
umount_client $MOUNT || error "(2) umount failed"
mount_client $MOUNT || error "(3) mount failed"

if ! touch $DIR/$tdir/striped_dir/dummy; then
	error "(4) Fail to touch under broken striped directory"
fi

echo "Trigger namespace LFSCK to find out the inconsistency"
if ! $START_NAMESPACE -r -A; then
	error "(5) Fail to start LFSCK for namespace"
fi

wait_all_targets_blocked namespace completed 6

# No striped directory may be counted as repaired in this scenario.
local repaired=$($SHOW_NAMESPACE |
		 awk '/^striped_dirs_repaired/ { print $2 }')
if ! [ $repaired -eq 0 ]; then
	error "(7) Re-generate master LMV EA unexpected: $repaired"
fi

if ! stat $DIR/$tdir/striped_dir/dummy; then
	error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
fi

# A successful create here is the failure case.
if touch $DIR/$tdir/striped_dir/foo; then
	error "(9) The broken striped directory should be read-only"
fi

if ! chattr -i $DIR/$tdir/striped_dir; then
	error "(10) Fail to chattr on the broken striped directory"
fi

if ! rmdir $DIR/$tdir/striped_dir; then
	error "(11) Fail to remove the striped directory after LFSCK"
fi
run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Test 31e: the striped directory is created while fail_loc 0x162a
# (OBD_FAIL_LFSCK_LOST_SLAVE_LMV) with fail_val=0 is set on $SINGLEMDS; the
# namespace LFSCK is expected to report exactly one repaired striped shard.
# Skipped unless there are at least 2 MDTs.
# NOTE(review): relies on "local"/"return", i.e. function scope; the header of
# the enclosing test function is not visible in this view - confirm.
if [ $MDSCOUNT -lt 2 ]; then
	skip "The test needs at least 2 MDTs" && return
fi

echo "For some reason, the slave MDT-object of the striped directory"
echo "may lost its slave LMV EA. The LFSCK should re-generate the"
echo "slave LMV EA."

check_mount_and_prep

echo "Inject failure stub on MDT0 to simulate the case that the"
echo "slave MDT-object (that resides on the same MDT as the master"
echo "MDT-object resides on) lost the LMV EA."

#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
# The failure is armed only for the duration of the directory creation.
do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
if ! $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir; then
	error "(1) Fail to create striped directory"
fi
do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

echo "Trigger namespace LFSCK to re-generate slave LMV EA"
if ! $START_NAMESPACE -r -A; then
	error "(2) Fail to start LFSCK for namespace"
fi

wait_all_targets_blocked namespace completed 3

# Exactly one shard must be counted as repaired.
local repaired=$($SHOW_NAMESPACE |
		 awk '/^striped_shards_repaired/ { print $2 }')
if ! [ $repaired -eq 1 ]; then
	error "(4) Fail to re-generate slave LMV EA: $repaired"
fi

if ! rmdir $DIR/$tdir/striped_dir; then
	error "(5) Fail to remove the striped directory after LFSCK"
fi
run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Test 31f: same as the lost-slave-LMV (1) case, but with fail_val=1 and the
# repair counter read from mds2.  Skipped unless there are at least 2 MDTs.
# NOTE(review): relies on "local"/"return", i.e. function scope; the header of
# the enclosing test function is not visible in this view - confirm.
if [ $MDSCOUNT -lt 2 ]; then
	skip "The test needs at least 2 MDTs" && return
fi

echo "For some reason, the slave MDT-object of the striped directory"
echo "may lost its slave LMV EA. The LFSCK should re-generate the"
echo "slave LMV EA."

check_mount_and_prep

echo "Inject failure stub on MDT0 to simulate the case that the"
echo "slave MDT-object (that resides on different MDT as the master"
echo "MDT-object resides on) lost the LMV EA."

#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
# The failure is armed only for the duration of the directory creation.
do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
if ! $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir; then
	error "(1) Fail to create striped directory"
fi
do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

echo "Trigger namespace LFSCK to re-generate slave LMV EA"
if ! $START_NAMESPACE -r -A; then
	error "(2) Fail to start LFSCK for namespace"
fi

wait_all_targets_blocked namespace completed 3

# The repair counter is taken from the lfsck_namespace stats of mds2.
local repaired=$(do_facet mds2 $LCTL get_param -n \
		 mdd.$(facet_svc mds2).lfsck_namespace |
		 awk '/^striped_shards_repaired/ { print $2 }')
if ! [ $repaired -eq 1 ]; then
	error "(4) Fail to re-generate slave LMV EA: $repaired"
fi

if ! rmdir $DIR/$tdir/striped_dir; then
	error "(5) Fail to remove the striped directory after LFSCK"
fi
run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Test 31g: the striped directory is created while fail_loc 0x162b
# (OBD_FAIL_LFSCK_BAD_SLAVE_LMV) with fail_val=0 is set on $SINGLEMDS; the
# namespace LFSCK must report one repaired shard, and after a client remount
# a file can be created and removed under the directory.
# Skipped unless there are at least 2 MDTs.
# NOTE(review): relies on "local"/"return", i.e. function scope; the header of
# the enclosing test function is not visible in this view - confirm.
if [ $MDSCOUNT -lt 2 ]; then
	skip "The test needs at least 2 MDTs" && return
fi

echo "For some reason, the stripe index in the slave LMV EA is"
echo "corrupted. The LFSCK should repair the slave LMV EA."

check_mount_and_prep

echo "Inject failure stub on MDT0 to simulate the case that the"
echo "slave LMV EA on the first shard of the striped directory"
echo "claims the same index as the second shard claims"

#define OBD_FAIL_LFSCK_BAD_SLAVE_LMV	0x162b
# The failure is armed only for the duration of the directory creation.
do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
if ! $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir; then
	error "(1) Fail to create striped directory"
fi
do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

echo "Trigger namespace LFSCK to repair the slave LMV EA"
if ! $START_NAMESPACE -r -A; then
	error "(2) Fail to start LFSCK for namespace"
fi

wait_all_targets_blocked namespace completed 3

# Exactly one shard must be counted as repaired.
local repaired=$($SHOW_NAMESPACE |
		 awk '/^striped_shards_repaired/ { print $2 }')
if ! [ $repaired -eq 1 ]; then
	error "(4) Fail to repair slave LMV EA: $repaired"
fi

umount_client $MOUNT || error "(5) umount failed"
mount_client $MOUNT || error "(6) mount failed"

# The repaired directory must accept a create and an unlink.
if ! touch $DIR/$tdir/striped_dir/foo; then
	error "(7) Fail to touch file after the LFSCK"
fi

if ! rm -f $DIR/$tdir/striped_dir/foo; then
	error "(8) Fail to unlink file after the LFSCK"
fi

if ! rmdir $DIR/$tdir/striped_dir; then
	error "(9) Fail to remove the striped directory after LFSCK"
fi
run_test 31g "Repair the corrupted slave LMV EA"
# Test 31h: the striped directory is created while fail_loc 0x162c
# (OBD_FAIL_LFSCK_BAD_SLAVE_NAME) with fail_val=0 is set on $SINGLEMDS; the
# namespace LFSCK must report one repaired dirent, and after a client remount
# a file can be created and removed under the directory.
# Skipped unless there are at least 2 MDTs.
# NOTE(review): relies on "local"/"return", i.e. function scope; the header of
# the enclosing test function is not visible in this view - confirm.
if [ $MDSCOUNT -lt 2 ]; then
	skip "The test needs at least 2 MDTs" && return
fi

echo "For some reason, the shard's name entry in the striped"
echo "directory may be corrupted. The LFSCK should repair the"
echo "bad shard's name entry."

check_mount_and_prep

echo "Inject failure stub on MDT0 to simulate the case that the"
echo "first shard's name entry in the striped directory claims"
echo "the same index as the second shard's name entry claims."

#define OBD_FAIL_LFSCK_BAD_SLAVE_NAME	0x162c
# The failure is armed only for the duration of the directory creation.
do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
if ! $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir; then
	error "(1) Fail to create striped directory"
fi
do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

echo "Trigger namespace LFSCK to repair the shard's name entry"
if ! $START_NAMESPACE -r -A; then
	error "(2) Fail to start LFSCK for namespace"
fi

wait_all_targets_blocked namespace completed 3

# Exactly one directory entry must be counted as repaired.
local repaired=$($SHOW_NAMESPACE |
		 awk '/^dirent_repaired/ { print $2 }')
if ! [ $repaired -eq 1 ]; then
	error "(4) Fail to repair shard's name entry: $repaired"
fi

umount_client $MOUNT || error "(5) umount failed"
mount_client $MOUNT || error "(6) mount failed"

# The repaired directory must accept a create and an unlink.
if ! touch $DIR/$tdir/striped_dir/foo; then
	error "(7) Fail to touch file after the LFSCK"
fi

if ! rm -f $DIR/$tdir/striped_dir/foo; then
	error "(8) Fail to unlink file after the LFSCK"
fi

if ! rmdir $DIR/$tdir/striped_dir; then
	error "(9) Fail to remove the striped directory after LFSCK"
fi
run_test 31h "Repair the corrupted shard's name entry"
# Test 32: start a layout LFSCK with fail_loc 0x162d
# (OBD_FAIL_LFSCK_ASSISTANT_DIRECT, fail_val=3) set on $SINGLEMDS, check that
# it reports "scanning-phase1", stop ost1, clear the fail_loc and verify that
# the LFSCK can still be stopped.
# NOTE(review): relies on "local", i.e. function scope; the header of the
# enclosing test function (and any setup before the umount) is not visible
# in this view - confirm.
umount_client $MOUNT

#define OBD_FAIL_LFSCK_ASSISTANT_DIRECT	0x162d
do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
if ! $START_LAYOUT -r; then
	error "(2) Fail to start LFSCK for layout!"
fi

# The scan is expected to still be in its first phase at this point.
local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
if [ "$STATUS" != "scanning-phase1" ]; then
	error "(3) Expect 'scanning-phase1', but got '$STATUS'"
fi

if ! stop ost1 > /dev/null; then
	error "(4) Fail to stop OST1!"
fi

do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0

if ! $STOP_LFSCK; then
	error "(5) Fail to stop LFSCK!"
fi
run_test 32 "stop LFSCK when some OST failed"
# Put back the MDS/OST sizing values that were saved in SAVED_MDSSIZE,
# SAVED_OSTSIZE and SAVED_OSTCOUNT near the top of the script.
MDSSIZE=$SAVED_MDSSIZE
OSTSIZE=$SAVED_OSTSIZE
OSTCOUNT=$SAVED_OSTCOUNT
4254 # cleanup the system at last