3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 # Bug numbers for the tests excluded via ALWAYS_EXCEPT
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# LUSTRE defaults to the parent of the directory holding this script. The
# test framework and the configuration file ($CONFIG, which defaults to a
# file under $LUSTRE/tests/cfg) are then sourced.
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
23 require_dsh_mds || exit 0
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
32 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
33 skip "Need MDS version at least 2.3.60" && exit 0
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # Use small MDS and OST sizes to speed up formatting.
41 # Do not make MDSSIZE/OSTSIZE too small, because they affect the default journal size.
44 # Many OSTs are not needed; cap the count to reduce the format/start/stop overhead.
45 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
47 # Build up a clean test environment.
# Append test numbers to ALWAYS_EXCEPT depending on the MDS/OST version
# reported by lustre_version_code and on the MDS backend type.
51 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
52 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
54 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
55 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
57 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
58 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
60 # DNE does not support striped directories on ZFS-based backends yet.
61 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
62 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Target names, command strings and mount options shared by the tests.
# The START_*/STOP_*/SHOW_* strings are expanded unquoted by the tests so
# that they run as commands (for example: $START_NAMESPACE -r).
66 MDT_DEV="${FSNAME}-MDT0000"
67 OST_DEV="${FSNAME}-OST0000"
68 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
69 START_NAMESPACE="do_facet $SINGLEMDS \
70 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
71 START_LAYOUT="do_facet $SINGLEMDS \
72 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
73 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
74 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
75 SHOW_NAMESPACE="do_facet $SINGLEMDS \
76 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
77 SHOW_LAYOUT="do_facet $SINGLEMDS \
78 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
79 SHOW_LAYOUT_ON_OST="do_facet ost1 \
80 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Option strings passed to "start" when the tests (re)start the MDT.
81 MOUNT_OPTS_SCRUB="-o user_xattr"
82 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
83 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
# Fragment of the test-data preparation helper. Its header and the
# assignments of $ndirs, $nfiles and $igif are not visible in this excerpt.
# When $igif is non-empty, fail_loc 0x1504 (OBD_FAIL_FID_IGIF) is set on the
# MDS before the entries are created and reset to 0 after "dummy" is touched.
92 echo "preparing... $nfiles * $ndirs files will be created $(date)."
93 if [ ! -z $igif ]; then
94 #define OBD_FAIL_FID_IGIF 0x1504
95 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
98 cp $LUSTRE/tests/*.sh $DIR/$tdir/
# createmany is invoked for the d*, f*, per-directory d<i>/f* and e* names.
99 if [ $ndirs -gt 0 ]; then
100 createmany -d $DIR/$tdir/d $ndirs
101 createmany -m $DIR/$tdir/f $ndirs
102 if [ $nfiles -gt 0 ]; then
103 for ((i = 0; i < $ndirs; i++)); do
104 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
105 /dev/null || error "createmany $nfiles"
108 createmany -d $DIR/$tdir/e $ndirs
111 if [ ! -z $igif ]; then
112 touch $DIR/$tdir/dummy
113 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
116 echo "prepared $(date)."
# Check MDT0 with run_e2fsck while the MDS is stopped.
# Returns immediately when the MDS backend is not ldiskfs. Otherwise the MDS
# is stopped, run_e2fsck is called with "-n" on the device from "mdsdevname 1",
# and the MDS is started again with the noscrub mount options.
# NOTE(review): the condition that leads to the second run_e2fsck call and to
# error "(2)" is on a line not visible in this excerpt.
119 run_e2fsck_on_mdt0() {
120 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
122 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
123 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
125 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
126 error "(2) Detected inconsistency on MDT0"
128 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
129 error "(3) Fail to start MDT0"
# Query LFSCK component $com on ${FSNAME}-MDT0000 with "lfsck_query ... -w"
# and require that the "<com>_mdts_<status>" count equals $MDSCOUNT.
# On mismatch the plain query output is shown and error "($err) ..." is raised.
# $com, $status and $err are assigned on lines not visible in this excerpt.
132 wait_all_targets_blocked() {
137 local count=$(do_facet mds1 \
138 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
139 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
140 [[ $count -eq $MDSCOUNT ]] || {
141 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
142 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# NOTE(review): original lines 143-150 are not visible here. The polling code
# below (wait_update_facet with timeout $LTIME, no "-w") may belong to a
# separate helper whose header is elided -- confirm against the full file.
151 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
152 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
153 "$MDSCOUNT" $LTIME || {
154 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
155 error "($err) some MDTs are not in ${status}"
# Body of test 0 (registered by the run_test line at the end; the function
# header is not visible in this excerpt).
# With fail_loc 0x1600 / fail_val=3 set, namespace LFSCK is started with -r
# and must be in 'scanning-phase1'; it is stopped ('stopped'), started again
# ('scanning-phase1'), and after fail_loc is cleared it must reach 'completed'.
162 #define OBD_FAIL_LFSCK_DELAY1 0x1600
163 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
164 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
166 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
168 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
169 [ "$STATUS" == "scanning-phase1" ] ||
170 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
172 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
174 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
175 [ "$STATUS" == "stopped" ] ||
176 error "(6) Expect 'stopped', but got '$STATUS'"
178 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
180 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
181 [ "$STATUS" == "scanning-phase1" ] ||
182 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
184 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
185 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
186 mdd.${MDT_DEV}.lfsck_namespace |
187 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
189 error "(9) unexpected status"
# Nothing is expected to be repaired: updated_phase1 must be 0.
192 local repaired=$($SHOW_NAMESPACE |
193 awk '/^updated_phase1/ { print $2 }')
194 [ $repaired -eq 0 ] ||
195 error "(10) Expect nothing to be repaired, but got: $repaired"
# A second run started with -r must raise success_count by exactly one.
197 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
198 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
199 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
200 mdd.${MDT_DEV}.lfsck_namespace |
201 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
203 error "(12) unexpected status"
206 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
207 [ $((scanned1 + 1)) -eq $scanned2 ] ||
208 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
210 echo "stopall, should NOT crash LU-3649"
211 stopall || error "(14) Fail to stopall"
213 run_test 0 "Control LFSCK manually"
# Body of test 1a (function header not visible in this excerpt).
# Skipped unless the MDS backend is ldiskfs. "dummy" is created while
# fail_loc 0x1501 is set, namespace LFSCK is run to completion and must
# report exactly one repair; finally the directory is listed with
# fail_loc 0x1505 set.
216 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
217 skip "OI Scrub not implemented for ZFS" && return
221 #define OBD_FAIL_FID_INDIR 0x1501
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
223 touch $DIR/$tdir/dummy
225 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
227 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
229 mdd.${MDT_DEV}.lfsck_namespace |
230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
232 error "(4) unexpected status"
# Read dirent_repaired; if that field is empty, fall back to updated_phase1.
235 local repaired=$($SHOW_NAMESPACE |
236 awk '/^dirent_repaired/ { print $2 }')
237 # For interoperability with old servers
238 [ -z "$repaired" ] &&
239 repaired=$($SHOW_NAMESPACE |
240 awk '/^updated_phase1/ { print $2 }')
242 [ $repaired -eq 1 ] ||
243 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
247 mount_client $MOUNT || error "(6) Fail to start client!"
249 #define OBD_FAIL_FID_LOOKUP 0x1505
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
251 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
255 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Body of test 1b (function header not visible in this excerpt).
# Skipped unless the MDS backend is ldiskfs. "dummy" is created while
# fail_loc 0x1502 is set; namespace LFSCK then runs with fail_loc 0x1506 set
# and must report exactly one repair; finally "dummy" is stat'ed with
# fail_loc 0x1505 set.
259 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
260 skip "OI Scrub not implemented for ZFS" && return
264 #define OBD_FAIL_FID_INLMA 0x1502
265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
266 touch $DIR/$tdir/dummy
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
270 #define OBD_FAIL_FID_NOLMA 0x1506
271 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
272 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
273 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
274 mdd.${MDT_DEV}.lfsck_namespace |
275 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
277 error "(4) unexpected status"
# Read dirent_repaired; if that field is empty, fall back to updated_phase1.
280 local repaired=$($SHOW_NAMESPACE |
281 awk '/^dirent_repaired/ { print $2 }')
282 # For interoperability with old servers
283 [ -z "$repaired" ] &&
284 repaired=$($SHOW_NAMESPACE |
285 awk '/^updated_phase1/ { print $2 }')
287 [ $repaired -eq 1 ] ||
288 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
290 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
293 mount_client $MOUNT || error "(6) Fail to start client!"
295 #define OBD_FAIL_FID_LOOKUP 0x1505
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
297 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
301 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Body of test 2a (function header not visible in this excerpt).
# "dummy" is created while fail_loc 0x1603 is set; namespace LFSCK must
# complete and report exactly one repair. Afterwards stat must show
# "Links: 1" and path2fid followed by fid2path must return the same path.
306 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
308 touch $DIR/$tdir/dummy
310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
312 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
313 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
314 mdd.${MDT_DEV}.lfsck_namespace |
315 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
317 error "(4) unexpected status"
# Read linkea_repaired; if that field is empty, fall back to updated_phase2.
320 local repaired=$($SHOW_NAMESPACE |
321 awk '/^linkea_repaired/ { print $2 }')
322 # For interoperability with old servers
323 [ -z "$repaired" ] &&
324 repaired=$($SHOW_NAMESPACE |
325 awk '/^updated_phase2/ { print $2 }')
327 [ $repaired -eq 1 ] ||
328 error "(5) Fail to repair crashed linkEA: $repaired"
332 mount_client $MOUNT || error "(6) Fail to start client!"
334 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
335 error "(7) Fail to stat $DIR/$tdir/dummy"
337 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
338 local dummyname=$($LFS fid2path $DIR $dummyfid)
339 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
340 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
342 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Body of test 2b (function header not visible in this excerpt).
# "dummy" is created while fail_loc 0x1604 is set; namespace LFSCK must
# complete with updated_phase2 equal to 1. Afterwards stat must show
# "Links: 1" and path2fid followed by fid2path must return the same path.
348 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
349 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
350 touch $DIR/$tdir/dummy
352 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
354 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
355 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
356 mdd.${MDT_DEV}.lfsck_namespace |
357 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
359 error "(4) unexpected status"
362 local repaired=$($SHOW_NAMESPACE |
363 awk '/^updated_phase2/ { print $2 }')
364 [ $repaired -eq 1 ] ||
365 error "(5) Fail to repair crashed linkEA: $repaired"
369 mount_client $MOUNT || error "(6) Fail to start client!"
371 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
372 error "(7) Fail to stat $DIR/$tdir/dummy"
374 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
375 local dummyname=$($LFS fid2path $DIR $dummyfid)
376 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
377 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
379 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Body of test 2c (function header not visible in this excerpt).
# "dummy" is created while fail_loc 0x1605 is set; namespace LFSCK must
# complete with updated_phase2 equal to 1. Afterwards stat must show
# "Links: 1" and path2fid followed by fid2path must return the same path.
385 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
386 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
387 touch $DIR/$tdir/dummy
389 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
391 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
392 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
393 mdd.${MDT_DEV}.lfsck_namespace |
394 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
396 error "(4) unexpected status"
399 local repaired=$($SHOW_NAMESPACE |
400 awk '/^updated_phase2/ { print $2 }')
401 [ $repaired -eq 1 ] ||
402 error "(5) Fail to repair crashed linkEA: $repaired"
406 mount_client $MOUNT || error "(6) Fail to start client!"
408 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
409 error "(7) Fail to stat $DIR/$tdir/dummy"
411 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
412 local dummyname=$($LFS fid2path $DIR $dummyfid)
413 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
414 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
416 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Body of test 2d (function header not visible in this excerpt).
# "dummy" is created while fail_loc 0x161d is set; namespace LFSCK must
# complete with linkea_repaired equal to 1. Afterwards stat must show
# "Links: 1" and path2fid followed by fid2path must return the same path.
422 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
423 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
424 touch $DIR/$tdir/dummy
426 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
428 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
429 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
430 mdd.${MDT_DEV}.lfsck_namespace |
431 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
433 error "(4) unexpected status"
436 local repaired=$($SHOW_NAMESPACE |
437 awk '/^linkea_repaired/ { print $2 }')
438 [ $repaired -eq 1 ] ||
439 error "(5) Fail to repair crashed linkEA: $repaired"
443 mount_client $MOUNT || error "(6) Fail to start client!"
445 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
446 error "(7) Fail to stat $DIR/$tdir/dummy"
448 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
449 local dummyname=$($LFS fid2path $DIR $dummyfid)
450 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
451 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
453 run_test 2d "LFSCK can recover the missing linkEA entry"
# Body of test 2e (function header not visible in this excerpt).
# Skipped when MDSCOUNT is below 2. d0 is created with "-i 1" and d0/d1 with
# "-i 0" while fail_loc 0x1603 is set. Namespace LFSCK is started with
# "-r -A", wait_all_targets_blocked is called for status "completed", and
# linkea_repaired must be 1; path2fid/fid2path must round-trip for d0/d1.
457 [ $MDSCOUNT -lt 2 ] &&
458 skip "We need at least 2 MDSes for this test" && return
462 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
464 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
465 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
466 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
467 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
469 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
471 wait_all_targets_blocked namespace completed 4
473 local repaired=$($SHOW_NAMESPACE |
474 awk '/^linkea_repaired/ { print $2 }')
475 [ $repaired -eq 1 ] ||
476 error "(5) Fail to repair crashed linkEA: $repaired"
478 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
479 local name=$($LFS fid2path $DIR $fid)
480 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
481 error "(6) Fail to repair linkEA: $fid $name"
483 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Body of test 3 (function header not visible in this excerpt).
# Hard links are created for d0/f0 and d0/f1 under "dummy", and for edir/f0
# and edir/f1 while fail_loc 0x1603 and 0x1604 are set respectively.
# After namespace LFSCK completes, checked_phase2 must be at least 4 and
# multiple_linked_repaired at least 2.
489 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
490 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
491 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
493 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
494 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
495 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
497 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
498 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
499 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
501 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
502 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
503 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
505 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
507 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
508 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
509 mdd.${MDT_DEV}.lfsck_namespace |
510 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
512 error "(10) unexpected status"
515 local checked=$($SHOW_NAMESPACE |
516 awk '/^checked_phase2/ { print $2 }')
517 [ $checked -ge 4 ] ||
518 error "(11) Fail to check multiple-linked object: $checked"
520 local repaired=$($SHOW_NAMESPACE |
521 awk '/^multiple_linked_repaired/ { print $2 }')
522 [ $repaired -ge 2 ] ||
523 error "(12) Fail to repair multiple-linked object: $repaired"
525 run_test 3 "LFSCK can verify multiple-linked objects"
# Body of test 4 (function header not visible in this excerpt).
# Skipped unless the MDS backend is ldiskfs. The client is unmounted, the MDS
# is stopped, mds_backup_restore is run, and the MDS is restarted with the
# noscrub mount options. With fail_loc 0x1601 set, namespace LFSCK must show
# flags "inconsistent" and status 'scanning-phase1'; after fail_loc is
# cleared it must complete with empty flags and at least 9 repairs.
529 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
530 skip "OI Scrub not implemented for ZFS" && return
533 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
534 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
536 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
537 echo "start $SINGLEMDS with disabling OI scrub"
538 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
539 error "(2) Fail to start MDS!"
541 #define OBD_FAIL_LFSCK_DELAY2 0x1601
542 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
543 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
544 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
545 mdd.${MDT_DEV}.lfsck_namespace |
546 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
548 error "(5) unexpected status"
551 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
552 [ "$STATUS" == "scanning-phase1" ] ||
553 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
555 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
556 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
557 mdd.${MDT_DEV}.lfsck_namespace |
558 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
560 error "(7) unexpected status"
563 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
564 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Read dirent_repaired; if that field is empty, fall back to updated_phase1.
566 local repaired=$($SHOW_NAMESPACE |
567 awk '/^dirent_repaired/ { print $2 }')
568 # For interoperability with old servers
569 [ -z "$repaired" ] &&
570 repaired=$($SHOW_NAMESPACE |
571 awk '/^updated_phase1/ { print $2 }')
573 [ $repaired -ge 9 ] ||
574 error "(9) Fail to re-generate FID-in-dirent: $repaired"
578 mount_client $MOUNT || error "(10) Fail to start client!"
580 #define OBD_FAIL_FID_LOOKUP 0x1505
581 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
582 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
583 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
585 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Body of test 5 (function header not visible in this excerpt).
# Skipped unless the MDS backend is ldiskfs. Same flow as the backup/restore
# test above, except that mds_backup_restore gets an extra argument "1", the
# expected flags are "inconsistent,upgrade", at least 2 repairs are required,
# and "dummy" is additionally checked with stat and a path2fid/fid2path
# round-trip.
589 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
590 skip "OI Scrub not implemented for ZFS" && return
593 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
594 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
596 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
597 echo "start $SINGLEMDS with disabling OI scrub"
598 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
599 error "(2) Fail to start MDS!"
601 #define OBD_FAIL_LFSCK_DELAY2 0x1601
602 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
603 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
604 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
605 mdd.${MDT_DEV}.lfsck_namespace |
606 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
608 error "(5) unexpected status"
611 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
612 [ "$STATUS" == "scanning-phase1" ] ||
613 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
615 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
616 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
617 mdd.${MDT_DEV}.lfsck_namespace |
618 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
620 error "(7) unexpected status"
623 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
624 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Read dirent_repaired; if that field is empty, fall back to updated_phase1.
626 local repaired=$($SHOW_NAMESPACE |
627 awk '/^dirent_repaired/ { print $2 }')
628 # For interoperability with old servers
629 [ -z "$repaired" ] &&
630 repaired=$($SHOW_NAMESPACE |
631 awk '/^updated_phase1/ { print $2 }')
633 [ $repaired -ge 2 ] ||
634 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
638 mount_client $MOUNT || error "(10) Fail to start client!"
640 #define OBD_FAIL_FID_LOOKUP 0x1505
641 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
642 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
644 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
646 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
647 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
648 local dummyname=$($LFS fid2path $DIR $dummyfid)
649 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
650 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
652 run_test 5 "LFSCK can handle IGIF object upgrading"
# Body of test 6a (function header not visible in this excerpt).
# Namespace LFSCK is started with -r under fail_loc 0x1600, then
# fail_loc 0x80001608 is set and the status must become 'failed'.
# POS0 is read from last_checkpoint_position. LFSCK is started again without
# -r, POS1 is read from latest_start_position, and POS0 must be less than
# POS1. Finally fail_loc is cleared and the run must reach 'completed'.
# NOTE(review): the sleep mentioned below and the tails of the POS0/POS1
# pipelines are on lines not visible in this excerpt.
657 #define OBD_FAIL_LFSCK_DELAY1 0x1600
658 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
659 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
661 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
662 [ "$STATUS" == "scanning-phase1" ] ||
663 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
665 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
667 # Fail the LFSCK to guarantee there is at least one checkpoint
668 #define OBD_FAIL_LFSCK_FATAL1 0x1608
669 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
670 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
671 mdd.${MDT_DEV}.lfsck_namespace |
672 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
674 error "(4) unexpected status"
677 local POS0=$($SHOW_NAMESPACE |
678 awk '/^last_checkpoint_position/ { print $2 }' |
681 #define OBD_FAIL_LFSCK_DELAY1 0x1600
682 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
683 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
685 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
686 [ "$STATUS" == "scanning-phase1" ] ||
687 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
689 local POS1=$($SHOW_NAMESPACE |
690 awk '/^latest_start_position/ { print $2 }' |
692 [[ $POS0 -lt $POS1 ]] ||
693 error "(7) Expect larger than: $POS0, but got $POS1"
695 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
696 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
697 mdd.${MDT_DEV}.lfsck_namespace |
698 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
700 error "(8) unexpected status"
703 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Body of test 6b (function header not visible in this excerpt).
# Same flow as the first checkpoint test, but using fail_loc 0x1601 and
# 0x80001609. Two fields of the position lines are captured: field 2
# (O_POS*) and field 4 (D_POS*), from last_checkpoint_position before the
# restart and from latest_start_position after it.
# NOTE(review): the sleep mentioned below and the tails of the O_POS0/O_POS1
# pipelines are on lines not visible in this excerpt.
708 #define OBD_FAIL_LFSCK_DELAY2 0x1601
709 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
710 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
712 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
713 [ "$STATUS" == "scanning-phase1" ] ||
714 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
716 # Sleep 5 sec to guarantee that we are in the directory scanning
718 # Fail the LFSCK to guarantee there is at least one checkpoint
719 #define OBD_FAIL_LFSCK_FATAL2 0x1609
720 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
721 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
722 mdd.${MDT_DEV}.lfsck_namespace |
723 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
725 error "(4) unexpected status"
728 local O_POS0=$($SHOW_NAMESPACE |
729 awk '/^last_checkpoint_position/ { print $2 }' |
732 local D_POS0=$($SHOW_NAMESPACE |
733 awk '/^last_checkpoint_position/ { print $4 }')
735 #define OBD_FAIL_LFSCK_DELAY2 0x1601
736 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
737 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
739 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
740 [ "$STATUS" == "scanning-phase1" ] ||
741 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
743 local O_POS1=$($SHOW_NAMESPACE |
744 awk '/^latest_start_position/ { print $2 }' |
746 local D_POS1=$($SHOW_NAMESPACE |
747 awk '/^latest_start_position/ { print $4 }')
# If either D_POS value is "N/A", O_POS0 must be less than O_POS1.
# NOTE(review): the D_POS comparison after it is presumably the "else"
# branch; the "else"/"fi" lines are not visible in this excerpt.
749 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
750 [[ $O_POS0 -lt $O_POS1 ]] ||
751 error "(7.1) $O_POS1 is not larger than $O_POS0"
753 [[ $D_POS0 -lt $D_POS1 ]] ||
754 error "(7.2) $D_POS1 is not larger than $D_POS0"
757 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
758 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
759 mdd.${MDT_DEV}.lfsck_namespace |
760 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
762 error "(8) unexpected status"
765 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Body of test 7a (function header not visible in this excerpt).
# Namespace LFSCK is started with -r under fail_loc 0x1601 and must be in
# 'scanning-phase1'. The MDS is then stopped, fail_loc is cleared, and the
# MDS is started again with MOUNT_OPTS_SCRUB. Without an explicit LFSCK
# start, the status must reach 'completed' within the given wait (30).
772 #define OBD_FAIL_LFSCK_DELAY2 0x1601
773 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
774 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
776 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
777 [ "$STATUS" == "scanning-phase1" ] ||
778 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
780 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
782 echo "stop $SINGLEMDS"
783 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
785 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
786 echo "start $SINGLEMDS"
787 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
788 error "(5) Fail to start MDS!"
790 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
791 mdd.${MDT_DEV}.lfsck_namespace |
792 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
794 error "(6) unexpected status"
797 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Body of test 7b (function header not visible in this excerpt).
# Twenty dummy files are touched while fail_loc 0x1604 is set. Namespace
# LFSCK is started with -r under fail_loc 0x1602 and must reach
# 'scanning-phase2'. The MDS is then stopped, fail_loc is cleared, the MDS is
# restarted with MOUNT_OPTS_SCRUB, and the status must reach 'completed'.
803 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
804 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
805 for ((i = 0; i < 20; i++)); do
806 touch $DIR/$tdir/dummy${i}
809 #define OBD_FAIL_LFSCK_DELAY3 0x1602
810 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
811 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
812 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
813 mdd.${MDT_DEV}.lfsck_namespace |
814 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
816 error "(4) unexpected status"
820 echo "stop $SINGLEMDS"
821 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
823 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
824 echo "start $SINGLEMDS"
825 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
826 error "(6) Fail to start MDS!"
828 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
829 mdd.${MDT_DEV}.lfsck_namespace |
830 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
832 error "(7) unexpected status"
835 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Body of test 8 (function header not visible in this excerpt).
# Walks the namespace LFSCK status through the following expected sequence:
#   init -> scanning-phase1 -> stopped (after $STOP_LFSCK)
#        -> scanning-phase1 -> failed (fail_loc 0x80001609)
#        -> scanning-phase1 -> crashed (MDS restarted, fail_loc 0x160a/0x160b)
#        -> scanning-phase1 -> paused (MDS restarted, fail_loc 0x160b)
#        -> paused (MDS restarted with MOUNT_OPTS_SKIP_LFSCK)
#        -> scanning-phase2 with flags 'scanned-once,inconsistent'
#        -> completed with empty flags.
# NOTE(review): the initialisation and increment of $timer used by the three
# recovery loops are on lines not visible in this excerpt.
840 formatall > /dev/null
846 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
847 [ "$STATUS" == "init" ] ||
848 error "(2) Expect 'init', but got '$STATUS'"
850 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
851 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
852 mkdir $DIR/$tdir/crashed
854 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
855 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
856 for ((i = 0; i < 5; i++)); do
857 touch $DIR/$tdir/dummy${i}
860 umount_client $MOUNT || error "(3) Fail to stop client!"
862 #define OBD_FAIL_LFSCK_DELAY2 0x1601
863 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
864 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
866 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
867 [ "$STATUS" == "scanning-phase1" ] ||
868 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
870 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
872 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
873 [ "$STATUS" == "stopped" ] ||
874 error "(7) Expect 'stopped', but got '$STATUS'"
876 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
878 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
879 [ "$STATUS" == "scanning-phase1" ] ||
880 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
882 #define OBD_FAIL_LFSCK_FATAL2 0x1609
883 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
884 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
885 mdd.${MDT_DEV}.lfsck_namespace |
886 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
888 error "(10) unexpected status"
891 #define OBD_FAIL_LFSCK_DELAY1 0x1600
892 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
893 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
895 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
896 [ "$STATUS" == "scanning-phase1" ] ||
897 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
899 #define OBD_FAIL_LFSCK_CRASH 0x160a
900 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
903 echo "stop $SINGLEMDS"
904 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
906 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
907 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
909 echo "start $SINGLEMDS"
910 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
911 error "(14) Fail to start MDS!"
# Poll the MDT recovery_status until it is no longer "RECOVERING"; it is an
# error if $timer ends up equal to $timeout (max_recovery_time).
913 local timeout=$(max_recovery_time)
916 while [ $timer -lt $timeout ]; do
917 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
918 mdt.${MDT_DEV}.recovery_status |
919 awk '/^status/ { print \\\$2 }'")
920 [ "$STATUS" != "RECOVERING" ] && break;
925 [ $timer != $timeout ] ||
926 error "(14.1) recovery timeout"
928 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
929 [ "$STATUS" == "crashed" ] ||
930 error "(15) Expect 'crashed', but got '$STATUS'"
932 #define OBD_FAIL_LFSCK_DELAY2 0x1601
933 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
934 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
936 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
937 [ "$STATUS" == "scanning-phase1" ] ||
938 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
940 echo "stop $SINGLEMDS"
941 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
943 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
944 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
946 echo "start $SINGLEMDS"
947 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
948 error "(19) Fail to start MDS!"
951 while [ $timer -lt $timeout ]; do
952 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
953 mdt.${MDT_DEV}.recovery_status |
954 awk '/^status/ { print \\\$2 }'")
955 [ "$STATUS" != "RECOVERING" ] && break;
960 [ $timer != $timeout ] ||
961 error "(19.1) recovery timeout"
963 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
964 [ "$STATUS" == "paused" ] ||
965 error "(20) Expect 'paused', but got '$STATUS'"
967 echo "stop $SINGLEMDS"
968 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
970 echo "start $SINGLEMDS without resume LFSCK"
971 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
972 error "(20.2) Fail to start MDS!"
975 while [ $timer -lt $timeout ]; do
976 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
977 mdt.${MDT_DEV}.recovery_status |
978 awk '/^status/ { print \\\$2 }'")
979 [ "$STATUS" != "RECOVERING" ] && break;
984 [ $timer != $timeout ] ||
985 error "(20.3) recovery timeout"
987 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
988 [ "$STATUS" == "paused" ] ||
989 error "(20.4) Expect 'paused', but got '$STATUS'"
991 #define OBD_FAIL_LFSCK_DELAY3 0x1602
992 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
994 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
995 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
996 mdd.${MDT_DEV}.lfsck_namespace |
997 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
999 error "(22) unexpected status"
1002 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1003 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1004 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1006 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1007 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1008 mdd.${MDT_DEV}.lfsck_namespace |
1009 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1011 error "(24) unexpected status"
1014 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1015 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1017 run_test 8 "LFSCK state machine"
# Body of test 9a (function header not visible in this excerpt).
# Skipped when /proc/cpuinfo has no "processor ... : 1" line or when
# $server_version is below 2.7.50. 5000 files are created, layout LFSCK is
# started with "-r -s $BASE_SPEED1", and average_speed_phase1 must stay below
# MAX_SPEED. lfsck_speed_limit is then set to BASE_SPEED2 and the average
# speed must lie between MIN_SPEED and MAX_SPEED. Finally the limit is set to
# 0 and the run must reach 'completed'.
# NOTE(review): $server_version, RUN_TIME1, RUN_TIME2, TIME_DIFF and the
# sleeps between measurements are defined on lines not visible in this
# excerpt.
1020 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1021 skip "Testing on UP system, the speed may be inaccurate."
1025 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1026 { skip "Need MDS version >= 2.7.50"; return; }
1028 check_mount_and_prep
1029 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1030 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1031 createmany -o $DIR/$tdir/lfsck/f 5000
1033 local BASE_SPEED1=100
1035 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
1038 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1039 [ "$STATUS" == "scanning-phase1" ] ||
1040 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1042 local SPEED=$($SHOW_LAYOUT |
1043 awk '/^average_speed_phase1/ { print $2 }')
1045 # There may be a timing error; normally it should be less than 2 seconds.
1046 # We allow another 20% scheduling error.
1048 # MAX_MARGIN = 1.2 = 12 / 10
1049 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1050 RUN_TIME1 * 12 / 10))
1051 [ $SPEED -lt $MAX_SPEED ] ||
1052 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
1054 # Adjust the speed limit
1055 local BASE_SPEED2=300
1057 do_facet $SINGLEMDS \
1058 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1061 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1062 # MIN_MARGIN = 0.8 = 8 / 10
1063 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1064 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1065 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
1066 [ $SPEED -gt $MIN_SPEED ] || {
1067 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1068 error_ignore LU-5624 \
1069 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1072 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1076 # MAX_MARGIN = 1.2 = 12 / 10
1077 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1078 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1079 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1080 [ $SPEED -lt $MAX_SPEED ] ||
1081 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
1083 do_facet $SINGLEMDS \
1084 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1086 wait_update_facet $SINGLEMDS \
1087 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1088 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1089 error "(7) Failed to get expected 'completed'"
1091 run_test 9a "LFSCK speed control (1)"
# Test 9b body (registered below as "LFSCK speed control (2)").
# Same idea as the layout speed test, but for the namespace LFSCK and its
# average_speed_phase2 value.
# NOTE(review): the function header, closing lines, and the statements
# that set RUN_TIME1, RUN_TIME2 and TIME_DIFF are not visible in this
# excerpt - confirm against the full file.
#
# Skip when /proc/cpuinfo has no line matching a processor numbered 1.
1094 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1095 skip "Testing on UP system, the speed may be inaccurate."
1099 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1100 { skip "Need MDS version >= 2.7.50"; return; }
# Create 50 directories, 50 files, and 50 files inside each directory
# while fail_loc 0x1604 is set on the MDS.
1104 echo "Preparing another 50 * 50 files (with error) at $(date)."
1105 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1106 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1107 createmany -d $DIR/$tdir/d 50
1108 createmany -m $DIR/$tdir/f 50
1109 for ((i = 0; i < 50; i++)); do
1110 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# First pass with fail_loc 0x160c set: the scan is expected to end up in
# status stopped rather than completed.
1113 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1114 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1115 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1116 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1117 mdd.${MDT_DEV}.lfsck_namespace |
1118 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1120 error "(5) unexpected status"
1123 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1124 echo "Prepared at $(date)."
1126 local BASE_SPEED1=50
# Start again without -r, this time with a speed limit; the status check
# below expects the scan to be in phase 2.
1128 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1131 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1132 [ "$STATUS" == "scanning-phase2" ] ||
1133 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1135 local SPEED=$($SHOW_NAMESPACE |
1136 awk '/^average_speed_phase2/ { print $2 }')
1137 # There may be time error, normally it should be less than 2 seconds.
1138 # We allow another 20% schedule error.
1140 # MAX_MARGIN = 1.2 = 12 / 10
# Shell arithmetic is integer-only, so every division below truncates.
1141 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1142 RUN_TIME1 * 12 / 10))
1143 [ $SPEED -lt $MAX_SPEED ] ||
1144 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1146 # adjust speed limit
1147 local BASE_SPEED2=150
1149 do_facet $SINGLEMDS \
1150 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-sample the average speed after the limit change and bound it from
# below using both limits weighted by their run times.
1153 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1154 # MIN_MARGIN = 0.8 = 8 / 10
1155 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1156 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1157 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# When the MDS backend is not ldiskfs a too-low speed is reported through
# error_ignore with LU-5624. The command that receives the (9.2) message
# is on a line not visible here.
1158 [ $SPEED -gt $MIN_SPEED ] || {
1159 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1160 error_ignore LU-5624 \
1161 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1164 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1168 # MAX_MARGIN = 1.2 = 12 / 10
1169 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1170 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1171 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1172 [ $SPEED -lt $MAX_SPEED ] ||
1173 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
# Set the speed limit to 0 and wait for the scan to report completed.
1175 do_facet $SINGLEMDS \
1176 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1177 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1178 mdd.${MDT_DEV}.lfsck_namespace |
1179 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1181 error "(11) unexpected status"
1184 run_test 9b "LFSCK speed control (2)"
# Test 10 body (registered below as "System is available during LFSCK
# scanning"). Runs ordinary namespace operations from the client while a
# rate-limited namespace LFSCK is still in phase 1.
# NOTE(review): the function header and the closing lines of the loops and
# brace groups are not visible in this excerpt.
#
# Skipped when the MDS backend is not ldiskfs.
1188 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1189 skip "lookup(..)/linkea on ZFS issue" && return
1193 echo "Preparing more files with error at $(date)."
1194 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1195 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
# Even indices 0..998 are created while fail_loc 0x1603 is set.
1197 for ((i = 0; i < 1000; i = $((i+2)))); do
1198 mkdir -p $DIR/$tdir/d${i}
1199 touch $DIR/$tdir/f${i}
1200 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1203 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1204 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
# Odd indices 1..999 are created while fail_loc 0x1604 is set.
1206 for ((i = 1; i < 1000; i = $((i+2)))); do
1207 mkdir -p $DIR/$tdir/d${i}
1208 touch $DIR/$tdir/f${i}
1209 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1212 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1213 echo "Prepared at $(date)."
# Add one extra hard link to f200; f200 is unlinked later during the scan.
1215 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1217 umount_client $MOUNT
1218 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the namespace LFSCK with -s 100 and confirm it is in phase 1.
1220 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1223 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1224 [ "$STATUS" == "scanning-phase1" ] ||
1225 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Each of the following client operations must succeed during the scan:
# recursive listing, create, mkdir, unlink, recursive remove, rename,
# hard link and symbolic link.
1227 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1229 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1231 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1233 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1235 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1237 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1239 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1241 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1242 error "(14) Fail to softlink!"
# The scan is expected to still be in phase 1 after those operations.
1244 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1245 [ "$STATUS" == "scanning-phase1" ] ||
1246 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set the speed limit to 0 and wait for the scan to report completed.
1248 do_facet $SINGLEMDS \
1249 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1250 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1251 mdd.${MDT_DEV}.lfsck_namespace |
1252 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1254 error "(16) unexpected status"
1257 run_test 10 "System is available during LFSCK scanning"
# Helper: mount the backing filesystem of ost${ost} locally, delete the
# LAST_ID file and the d0/0 entry under O/${idx}, then unmount again.
# Returns 1 when the local mount fails and 2 when the unmount fails.
# NOTE(review): the assignments of ost and idx and the closing brace are
# not visible in this excerpt; the visible caller passes two positional
# arguments (1 0), presumably the OST number and the index - TODO confirm.
1260 ost_remove_lastid() {
1263 local rcmd="do_facet ost${ost}"
1265 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1267 # step 1: local mount
1268 mount_fstype ost${ost} || return 1
1269 # step 2: remove the specified LAST_ID
1270 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1272 unmount_fstype ost${ost} || return 2
# Test 11a body (registered below as "LFSCK can rebuild lost last_id").
# Removes LAST_ID on ost1, restarts the OST, and checks that a layout LFSCK
# on the OST first reports the crashed_lastid flag and then completes with
# no flags left.
# NOTE(review): the function header, closing lines, and the statements
# between the file creation and the LAST_ID removal are not visible here.
1276 check_mount_and_prep
1277 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1278 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1283 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1285 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1286 error "(2) Fail to start ost1"
# fail_loc 0x160e with fail_val=3 is set on ost1 while the scan starts, so
# the crashed_lastid flag can be observed before the scan finishes
# (presumably a delay injection, going by the define name).
1288 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1289 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1291 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1292 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1294 wait_update_facet ost1 "$LCTL get_param -n \
1295 obdfilter.${OST_DEV}.lfsck_layout |
1296 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1298 error "(5) unexpected status"
# Reset the injection and wait for the scan to report completed.
1301 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1303 wait_update_facet ost1 "$LCTL get_param -n \
1304 obdfilter.${OST_DEV}.lfsck_layout |
1305 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1307 error "(6) unexpected status"
1310 echo "the LAST_ID(s) should have been rebuilt"
1311 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1312 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1314 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b body (registered below as "LFSCK can rebuild crashed last_id").
# Creates objects while fail_loc 0x160d is set on ost1, restarts the OST,
# checks that the last_id read back is smaller than the one seen before
# the restart, then runs a layout LFSCK on the OST and checks that the
# original value is reported again after another restart.
# NOTE(review): the function header and the closing lines of the loop and
# brace groups are not visible in this excerpt.
1317 check_mount_and_prep
1318 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1320 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1321 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1322 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create more files than the value returned by precreated_ost_obj_count.
1324 local count=$(precreated_ost_obj_count 0 0)
1326 createmany -o $DIR/$tdir/f $((count + 32))
# Record the last_id reported for the sequence read from
# prealloc_last_seq; the value is the second colon-separated field of the
# matching line.
1328 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1329 local seq=$(do_facet mds1 $LCTL get_param -n \
1330 osp.${proc_path}.prealloc_last_seq)
1331 local lastid1=$(do_facet ost1 "lctl get_param -n \
1332 obdfilter.${ost1_svc}.last_id" | grep $seq |
1333 awk -F: '{ print $2 }')
1335 umount_client $MOUNT
1336 stop ost1 || error "(1) Fail to stop ost1"
# fail_loc 0x215 is set on ost1 before it is started again.
1338 #define OBD_FAIL_OST_ENOSPC 0x215
1339 do_facet ost1 $LCTL set_param fail_loc=0x215
1341 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1342 error "(2) Fail to start ost1"
# Poll up to 60 times until a last_id value for the sequence is reported.
1344 for ((i = 0; i < 60; i++)); do
1345 lastid2=$(do_facet ost1 "lctl get_param -n \
1346 obdfilter.${ost1_svc}.last_id" | grep $seq |
1347 awk -F: '{ print $2 }')
1348 [ ! -z $lastid2 ] && break;
1352 echo "the on-disk LAST_ID should be smaller than the expected one"
1353 [ $lastid1 -gt $lastid2 ] ||
1354 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1356 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1357 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1359 wait_update_facet ost1 "$LCTL get_param -n \
1360 obdfilter.${OST_DEV}.lfsck_layout |
1361 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1363 error "(6) unexpected status"
# Restart ost1 so that last_id is read again after the scan.
1366 stop ost1 || error "(7) Fail to stop ost1"
1368 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1369 error "(8) Fail to start ost1"
1371 echo "the on-disk LAST_ID should have been rebuilt"
# On mismatch the raw last_id output is printed before failing.
1372 wait_update_facet ost1 "$LCTL get_param -n \
1373 obdfilter.${ost1_svc}.last_id | grep $seq |
1374 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1375 do_facet ost1 $LCTL get_param -n \
1376 obdfilter.${ost1_svc}.last_id
1377 error "(9) expect lastid1 $seq:$lastid1"
1380 do_facet ost1 $LCTL set_param fail_loc=0
1381 stopall || error "(10) Fail to stopall"
1383 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12a body (registered below as "single command to trigger LFSCK on
# all devices"). For the namespace and then the layout LFSCK: start on all
# targets with -A and -s 1, check scanning-phase1, stop with -A, check
# stopped, then start again with -s 0 and check completed.
# NOTE(review): the function header and the closing lines of the loops are
# not visible in this excerpt.
#
# Skipped with fewer than 2 MDSes.
1386 [ $MDSCOUNT -lt 2 ] &&
1387 skip "We need at least 2 MDSes for test_12a" && return
1389 check_mount_and_prep
# One directory per MDT index, each populated through createmany with 100.
1390 for k in $(seq $MDSCOUNT); do
1391 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1392 createmany -o $DIR/$tdir/${k}/f 100 ||
1393 error "(0) Fail to create 100 files."
1396 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1397 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1398 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
# The trailing number passed to the wait helpers follows the numbering of
# the error messages in this test (presumably used as the failure tag).
1400 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1401 wait_all_targets namespace scanning-phase1 3
1403 echo "Stop namespace LFSCK on all targets by single lctl command."
1404 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1405 error "(4) Fail to stop LFSCK on all devices!"
1407 echo "All the LFSCK targets should be in 'stopped' status."
1408 wait_all_targets_blocked namespace stopped 5
1410 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1411 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1412 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1414 echo "All the LFSCK targets should be in 'completed' status."
1415 wait_all_targets_blocked namespace completed 7
# Full debug logging is enabled only around the layout half of the test.
1417 start_full_debug_logging
1419 echo "Start layout LFSCK on all targets by single command (-s 1)."
1420 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1421 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1423 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1424 wait_all_targets layout scanning-phase1 9
1426 echo "Stop layout LFSCK on all targets by single lctl command."
1427 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1428 error "(10) Fail to stop LFSCK on all devices!"
1430 echo "All the LFSCK targets should be in 'stopped' status."
1431 wait_all_targets_blocked layout stopped 11
# Additionally read the layout LFSCK status from every OST directly.
1433 for k in $(seq $OSTCOUNT); do
1434 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1435 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1436 awk '/^status/ { print $2 }')
1437 [ "$STATUS" == "stopped" ] ||
1438 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1441 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1442 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1443 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1445 echo "All the LFSCK targets should be in 'completed' status."
1446 wait_all_targets_blocked layout completed 14
1448 stop_full_debug_logging
1450 run_test 12a "single command to trigger LFSCK on all devices"
# Test 12b body (registered below as "auto detect Lustre device").
# Starts LFSCK on mds1 with -A but without -M and waits for both the
# namespace and the layout scan to complete on all targets.
# NOTE(review): the function header and the closing lines of the if block
# are not visible in this excerpt.
1453 check_mount_and_prep
1455 echo "Start LFSCK without '-M' specified."
1456 do_facet mds1 $LCTL lfsck_start -A -r ||
1457 error "(0) Fail to start LFSCK without '-M'"
1459 wait_all_targets_blocked namespace completed 1
1460 wait_all_targets_blocked layout completed 2
# Count the device-list lines on mds1 whose third column contains mdt.
1462 local count=$(do_facet mds1 $LCTL dl |
1463 awk '{ print $3 }' | grep mdt | wc -l)
# With more than one such device, a start without -M and -A must fail;
# the trailing true keeps the expected failure from becoming the exit
# status of the list.
1464 if [ $count -gt 1 ]; then
1466 echo "Start layout LFSCK on the node with multipe targets,"
1467 echo "but not specify '-M'/'-A' option. Should get failure."
1469 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1470 error "(3) Start layout LFSCK should fail" || true
1473 run_test 12b "auto detect Lustre device"
# Test 13 body (registered below as "LFSCK can repair crashed lmm_oi").
# Creates files while fail_loc 0x160f is set on the MDS, runs a layout
# LFSCK, and expects the repaired_others counter to equal 32.
# NOTE(review): the function header and closing lines are not visible in
# this excerpt.
1477 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1478 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1479 echo "MDT-object FID."
1482 check_mount_and_prep
1484 echo "Inject failure stub to simulate bad lmm_oi"
1485 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1486 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
# createmany is asked for 32 entries, matching the counter checked below.
1487 createmany -o $DIR/$tdir/f 32
1488 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1490 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1491 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1493 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1494 mdd.${MDT_DEV}.lfsck_layout |
1495 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1497 error "(2) unexpected status"
1500 local repaired=$($SHOW_LAYOUT |
1501 awk '/^repaired_others/ { print $2 }')
1502 [ $repaired -eq 32 ] ||
1503 error "(3) Fail to repair crashed lmm_oi: $repaired"
1505 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14 body (registered below as "LFSCK can repair MDT-object with
# dangling reference"). Creates files while fail_loc 0x1610 is set on ost1,
# checks that a plain layout LFSCK counts the dangling references but that
# stat still fails, then runs the scan with -c and checks that stat works.
# NOTE(review): the function header and closing lines are not visible in
# this excerpt.
1509 echo "The OST-object referenced by the MDT-object should be there;"
1510 echo "otherwise, the LFSCK should re-create the missing OST-object."
1513 check_mount_and_prep
1514 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1516 echo "Inject failure stub to simulate dangling referenced MDT-object"
1517 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1518 do_facet ost1 $LCTL set_param fail_loc=0x1610
1519 local count=$(precreated_ost_obj_count 0 0)
# Files created while the fail_loc is set: count + 31 through createmany
# plus the guard file.
1521 createmany -o $DIR/$tdir/f $((count + 31))
1522 touch $DIR/$tdir/guard
1523 do_facet ost1 $LCTL set_param fail_loc=0
1525 start_full_debug_logging
1527 # exhaust other pre-created dangling cases
1528 count=$(precreated_ost_obj_count 0 0)
1529 createmany -o $DIR/$tdir/a $count ||
1530 error "(0) Fail to create $count files."
1532 echo "'ls' should fail because of dangling referenced MDT-object"
1533 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1535 echo "Trigger layout LFSCK to find out dangling reference"
1536 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1538 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1539 mdd.${MDT_DEV}.lfsck_layout |
1540 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1542 error "(3) unexpected status"
# At least 32 dangling references must be counted by the first scan.
1545 local repaired=$($SHOW_LAYOUT |
1546 awk '/^repaired_dangling/ { print $2 }')
1547 [ $repaired -ge 32 ] ||
1548 error "(4) Fail to repair dangling reference: $repaired"
1550 echo "'stat' should fail because of not repair dangling by default"
1551 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second scan adds the -c option.
1553 echo "Trigger layout LFSCK to repair dangling reference"
1554 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1556 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1557 mdd.${MDT_DEV}.lfsck_layout |
1558 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1560 error "(7) unexpected status"
1563 # There may be some async LFSCK updates in processing, wait for
1564 # a while until the target reparation has been done. LU-4970.
1566 echo "'stat' should success after layout LFSCK repairing"
# Poll stat on the guard file until the Size field reads 0; on failure
# the full stat output is printed before the error.
1567 wait_update_facet client "stat $DIR/$tdir/guard |
1568 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1569 stat $DIR/$tdir/guard
1571 error "(8) unexpected size"
1574 repaired=$($SHOW_LAYOUT |
1575 awk '/^repaired_dangling/ { print $2 }')
1576 [ $repaired -ge 32 ] ||
1577 error "(9) Fail to repair dangling reference: $repaired"
1579 stop_full_debug_logging
1581 run_test 14 "LFSCK can repair MDT-object with dangling reference"
# Test 15a body (registered below as "LFSCK can repair unmatched
# MDT-object/OST-object pairs (1)"). Writes one file while fail_loc 0x1611
# is set on ost1, runs a layout LFSCK, and expects exactly one
# repaired_unmatched_pair.
# NOTE(review): the function header and closing lines are not visible in
# this excerpt.
1585 echo "If the OST-object referenced by the MDT-object back points"
1586 echo "to some non-exist MDT-object, then the LFSCK should repair"
1587 echo "the OST-object to back point to the right MDT-object."
1590 check_mount_and_prep
1591 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1593 echo "Inject failure stub to make the OST-object to back point to"
1594 echo "non-exist MDT-object."
1595 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1597 do_facet ost1 $LCTL set_param fail_loc=0x1611
# Write 1 MiB and cancel the osc locks before the fail_loc is reset
# (presumably so the data reaches the OST while the injection is active -
# TODO confirm).
1598 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1599 cancel_lru_locks osc
1600 do_facet ost1 $LCTL set_param fail_loc=0
1602 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1603 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1605 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1606 mdd.${MDT_DEV}.lfsck_layout |
1607 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1609 error "(2) unexpected status"
1612 local repaired=$($SHOW_LAYOUT |
1613 awk '/^repaired_unmatched_pair/ { print $2 }')
1614 [ $repaired -eq 1 ] ||
1615 error "(3) Fail to repair unmatched pair: $repaired"
1617 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b body (registered below as "LFSCK can repair unmatched
# MDT-object/OST-object pairs (2)"). Writes a guard file normally, then a
# second file while fail_loc 0x1612 is set on ost1, runs a layout LFSCK,
# and expects exactly one repaired_unmatched_pair.
# NOTE(review): the function header and closing lines are not visible in
# this excerpt.
1621 echo "If the OST-object referenced by the MDT-object back points"
1622 echo "to other MDT-object that doesn't recognize the OST-object,"
1623 echo "then the LFSCK should repair it to back point to the right"
1624 echo "MDT-object (the first one)."
1627 check_mount_and_prep
1628 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# The guard file is written before any fail_loc is set.
1629 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1630 cancel_lru_locks osc
1632 echo "Inject failure stub to make the OST-object to back point to"
1633 echo "other MDT-object"
1635 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1636 do_facet ost1 $LCTL set_param fail_loc=0x1612
1637 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1638 cancel_lru_locks osc
1639 do_facet ost1 $LCTL set_param fail_loc=0
1641 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1642 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1644 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1645 mdd.${MDT_DEV}.lfsck_layout |
1646 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1648 error "(2) unexpected status"
1651 local repaired=$($SHOW_LAYOUT |
1652 awk '/^repaired_unmatched_pair/ { print $2 }')
1653 [ $repaired -eq 1 ] ||
1654 error "(3) Fail to repair unmatched pair: $repaired"
1656 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c body (registered below as "LFSCK can repair unmatched
# MDT-object/OST-object pairs (3)"). Races a layout LFSCK against a delayed
# directory migration and checks the repair counters afterwards.
# NOTE(review): the function header and closing lines are not visible in
# this excerpt, and any statement between the background migrate and the
# LFSCK start is not shown either.
#
# Skipped with fewer than 2 MDSes, and on MDS versions 2.7.55 or later.
1659 [ $MDSCOUNT -lt 2 ] &&
1660 skip "We need at least 2 MDSes for this test" && return
1662 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1663 skip "Skip the test after 2.7.55 see LU-6437" && return
1666 echo "According to current metadata migration implementation,"
1667 echo "before the old MDT-object is removed, both the new MDT-object"
1668 echo "and old MDT-object will reference the same LOV layout. Then if"
1669 echo "the layout LFSCK finds the new MDT-object by race, it will"
1670 echo "regard related OST-object(s) as multiple referenced case, and"
1671 echo "will try to create new OST-object(s) for the new MDT-object."
1672 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1673 echo "MDT-object before confirm the multiple referenced case."
1676 check_mount_and_prep
# Directory a1 is created with MDT index 1 and holds one 1 MiB file.
1677 $LFS mkdir -i 1 $DIR/$tdir/a1
1678 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1679 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1680 cancel_lru_locks osc
1682 echo "Inject failure stub on MDT1 to delay the migration"
1684 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1685 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1686 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
# The migration runs in the background so the scan can overlap with it.
1687 $LFS migrate -m 0 $DIR/$tdir/a1 &
1690 echo "Trigger layout LFSCK to race with the migration"
1691 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1693 wait_all_targets_blocked layout completed 2
1695 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Expected counters: one unmatched pair repaired, no multiple-reference
# repairs.
1696 local repaired=$($SHOW_LAYOUT |
1697 awk '/^repaired_unmatched_pair/ { print $2 }')
1698 [ $repaired -eq 1 ] ||
1699 error "(3) Fail to repair unmatched pair: $repaired"
1701 repaired=$($SHOW_LAYOUT |
1702 awk '/^repaired_multiple_referenced/ { print $2 }')
1703 [ $repaired -eq 0 ] ||
1704 error "(4) Unexpectedly repaird multiple references: $repaired"
1706 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16 body (registered below as "LFSCK can repair inconsistent
# MDT-object/OST-object owner"). Changes the owner of a file while fail_loc
# 0x1613 is set on the MDS, runs a layout LFSCK, and expects exactly one
# repaired_inconsistent_owner.
# NOTE(review): the function header and closing lines are not visible in
# this excerpt.
1710 echo "If the OST-object's owner information does not match the owner"
1711 echo "information stored in the MDT-object, then the LFSCK trust the"
1712 echo "MDT-object and update the OST-object's owner information."
1715 check_mount_and_prep
1716 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1717 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1718 cancel_lru_locks osc
1720 echo "Inject failure stub to skip OST-object owner changing"
1721 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1722 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
# The owner spec uses the dot form of user.group, here uid 1 and gid 1.
1723 chown 1.1 $DIR/$tdir/f0
1724 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1726 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1729 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1731 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1732 mdd.${MDT_DEV}.lfsck_layout |
1733 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1735 error "(2) unexpected status"
1738 local repaired=$($SHOW_LAYOUT |
1739 awk '/^repaired_inconsistent_owner/ { print $2 }')
1740 [ $repaired -eq 1 ] ||
1741 error "(3) Fail to repair inconsistent owner: $repaired"
1743 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17 body (registered below as "LFSCK can repair multiple
# references"). Creates guard and f0 while fail_loc 0x1614 is set on the
# MDS, checks that f0 shows the size written to guard, runs a layout LFSCK,
# and then checks that writing f0 no longer changes the size of guard.
# NOTE(review): the function header and closing lines are not visible in
# this excerpt.
1747 echo "If more than one MDT-objects reference the same OST-object,"
1748 echo "and the OST-object only recognizes one MDT-object, then the"
1749 echo "LFSCK should create new OST-objects for such non-recognized"
1753 check_mount_and_prep
1754 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1756 echo "Inject failure stub to make two MDT-objects to refernce"
1757 echo "the OST-object"
1759 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1760 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# guard receives 1 MiB of data; f0 is only created, never written here.
1762 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1763 cancel_lru_locks osc
1765 createmany -o $DIR/$tdir/f 1
1767 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1769 cancel_lru_locks mdc
1770 cancel_lru_locks osc
1772 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
# Field 5 of the long listing is the file size in bytes.
1773 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1774 [ $size -eq 1048576 ] ||
1775 error "(1) f0 (wrong) size should be 1048576, but got $size"
1777 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1780 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1782 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1783 mdd.${MDT_DEV}.lfsck_layout |
1784 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1786 error "(3) unexpected status"
1789 local repaired=$($SHOW_LAYOUT |
1790 awk '/^repaired_multiple_referenced/ { print $2 }')
1791 [ $repaired -eq 1 ] ||
1792 error "(4) Fail to repair multiple references: $repaired"
1794 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
# Writing 2 MiB to f0 must leave guard at its original 1 MiB size.
1795 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1796 error "(5) Fail to write f0."
1797 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1798 [ $size -eq 1048576 ] ||
1799 error "(6) guard size should be 1048576, but got $size"
1801 run_test 17 "LFSCK can repair multiple references"
# Top-level statement: add cache to the debug parameter through
# set_param, discarding the command output.
1803 $LCTL set_param debug=+cache > /dev/null
# Test 18a body (registered below as "Find out orphan OST-object and repair
# it (1)"). Changes the owner of files while fail_loc 0x1615 is set, checks
# that the file sizes become wrong, runs a layout LFSCK with -o, and checks
# the repaired_orphan counters and the restored file sizes.
# NOTE(review): the function header and the closing lines of the if blocks
# and loops are not visible in this excerpt. LTIME is defined outside the
# visible lines.
1807 echo "The target MDT-object is there, but related stripe information"
1808 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1809 echo "layout EA entries."
1812 check_mount_and_prep
1813 $LFS mkdir -i 0 $DIR/$tdir/a1
1814 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1815 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With -i the inode number is the first column of the listing, so the
# file size is field 6.
1817 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1819 $LFS path2fid $DIR/$tdir/a1/f1
1820 $LFS getstripe $DIR/$tdir/a1/f1
# With at least 2 MDSes a second file is created under a directory with
# MDT index 1 and stripe count 2.
1822 if [ $MDSCOUNT -ge 2 ]; then
1823 $LFS mkdir -i 1 $DIR/$tdir/a2
1824 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1825 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1826 $LFS path2fid $DIR/$tdir/a2/f2
1827 $LFS getstripe $DIR/$tdir/a2/f2
1830 cancel_lru_locks osc
1832 echo "Inject failure, to make the MDT-object lost its layout EA"
1833 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1834 do_facet mds1 $LCTL set_param fail_loc=0x1615
1835 chown 1.1 $DIR/$tdir/a1/f1
1837 if [ $MDSCOUNT -ge 2 ]; then
1838 do_facet mds2 $LCTL set_param fail_loc=0x1615
1839 chown 1.1 $DIR/$tdir/a2/f2
# Reset the injection on every MDS that had it set.
1845 do_facet mds1 $LCTL set_param fail_loc=0
1846 if [ $MDSCOUNT -ge 2 ]; then
1847 do_facet mds2 $LCTL set_param fail_loc=0
1850 cancel_lru_locks mdc
1851 cancel_lru_locks osc
1853 echo "The file size should be incorrect since layout EA is lost"
1854 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1855 [ "$cur_size" != "$saved_size" ] ||
1856 error "(1) Expect incorrect file1 size"
# Both files were written with the same dd parameters, so f2 is compared
# against the size saved for f1.
1858 if [ $MDSCOUNT -ge 2 ]; then
1859 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1860 [ "$cur_size" != "$saved_size" ] ||
1861 error "(2) Expect incorrect file2 size"
1864 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1865 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
1867 for k in $(seq $MDSCOUNT); do
1868 # The LFSCK status query internal is 30 seconds. For the case
1869 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1870 # time to guarantee the status sync up.
1871 wait_update_facet mds${k} "$LCTL get_param -n \
1872 mdd.$(facet_svc mds${k}).lfsck_layout |
1873 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1874 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must also report completed; this is a single read, not a wait.
1877 for k in $(seq $OSTCOUNT); do
1878 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1879 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1880 awk '/^status/ { print $2 }')
1881 [ "$cur_status" == "completed" ] ||
1882 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counts: 1 on mds1 and 2 on mds2 (presumably one
# per stripe of the affected file - TODO confirm).
1885 local repaired=$(do_facet mds1 $LCTL get_param -n \
1886 mdd.$(facet_svc mds1).lfsck_layout |
1887 awk '/^repaired_orphan/ { print $2 }')
1888 [ $repaired -eq 1 ] ||
1889 error "(6.1) Expect 1 fixed on mds1, but got: $repaired"
1891 if [ $MDSCOUNT -ge 2 ]; then
1892 repaired=$(do_facet mds2 $LCTL get_param -n \
1893 mdd.$(facet_svc mds2).lfsck_layout |
1894 awk '/^repaired_orphan/ { print $2 }')
1895 [ $repaired -eq 2 ] ||
1896 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
1899 $LFS path2fid $DIR/$tdir/a1/f1
1900 $LFS getstripe $DIR/$tdir/a1/f1
1902 if [ $MDSCOUNT -ge 2 ]; then
1903 $LFS path2fid $DIR/$tdir/a2/f2
1904 $LFS getstripe $DIR/$tdir/a2/f2
1907 echo "The file size should be correct after layout LFSCK scanning"
1908 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1909 [ "$cur_size" == "$saved_size" ] ||
1910 error "(7) Expect file1 size $saved_size, but got $cur_size"
1912 if [ $MDSCOUNT -ge 2 ]; then
1913 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1914 [ "$cur_size" == "$saved_size" ] ||
1915 error "(8) Expect file2 size $saved_size, but got $cur_size"
1918 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test 18b body (registered below as "Find out orphan OST-object and repair
# it (2)"). Removes files while fail_loc 0x1616 is set, runs a layout LFSCK
# with -o, then moves the entries named after the saved FIDs out of
# .lustre/lost+found back to their original paths and checks their sizes.
# NOTE(review): the function header and the closing lines of the if blocks
# and loops are not visible in this excerpt. LTIME is defined outside the
# visible lines, and fid2 is assigned without a visible local declaration.
1922 echo "The target MDT-object is lost. The LFSCK should re-create the"
1923 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
1924 echo "can move it back to normal namespace manually."
1927 check_mount_and_prep
1928 $LFS mkdir -i 0 $DIR/$tdir/a1
1929 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1930 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With -i the inode number is the first column of the listing, so the
# file size is field 6. The FID is saved to locate the recovered entry.
1931 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1932 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
1934 $LFS getstripe $DIR/$tdir/a1/f1
1936 if [ $MDSCOUNT -ge 2 ]; then
1937 $LFS mkdir -i 1 $DIR/$tdir/a2
1938 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1939 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1940 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
1942 $LFS getstripe $DIR/$tdir/a2/f2
1945 cancel_lru_locks osc
1947 echo "Inject failure, to simulate the case of missing the MDT-object"
1948 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1949 do_facet mds1 $LCTL set_param fail_loc=0x1616
1950 rm -f $DIR/$tdir/a1/f1
1952 if [ $MDSCOUNT -ge 2 ]; then
1953 do_facet mds2 $LCTL set_param fail_loc=0x1616
1954 rm -f $DIR/$tdir/a2/f2
# Reset the injection on every MDS that had it set.
1960 do_facet mds1 $LCTL set_param fail_loc=0
1961 if [ $MDSCOUNT -ge 2 ]; then
1962 do_facet mds2 $LCTL set_param fail_loc=0
1965 cancel_lru_locks mdc
1966 cancel_lru_locks osc
1968 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1969 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1971 for k in $(seq $MDSCOUNT); do
1972 # The LFSCK status query internal is 30 seconds. For the case
1973 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1974 # time to guarantee the status sync up.
1975 wait_update_facet mds${k} "$LCTL get_param -n \
1976 mdd.$(facet_svc mds${k}).lfsck_layout |
1977 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1978 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must also report completed; this is a single read, not a wait.
1981 for k in $(seq $OSTCOUNT); do
1982 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1983 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1984 awk '/^status/ { print $2 }')
1985 [ "$cur_status" == "completed" ] ||
1986 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counts: 1 on mds1 and 2 on mds2 (presumably one
# per stripe of the removed file - TODO confirm).
1989 local repaired=$(do_facet mds1 $LCTL get_param -n \
1990 mdd.$(facet_svc mds1).lfsck_layout |
1991 awk '/^repaired_orphan/ { print $2 }')
1992 [ $repaired -eq 1 ] ||
1993 error "(4.1) Expect 1 fixed on mds1, but got: $repaired"
1995 if [ $MDSCOUNT -ge 2 ]; then
1996 repaired=$(do_facet mds2 $LCTL get_param -n \
1997 mdd.$(facet_svc mds2).lfsck_layout |
1998 awk '/^repaired_orphan/ { print $2 }')
1999 [ $repaired -eq 2 ] ||
2000 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# The recovered entries are looked up by the saved FID plus the suffix
# -R-0 under the lost+found directory of the matching MDT.
2003 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2004 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2005 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2007 if [ $MDSCOUNT -ge 2 ]; then
2008 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2009 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
2012 $LFS path2fid $DIR/$tdir/a1/f1
2013 $LFS getstripe $DIR/$tdir/a1/f1
2015 if [ $MDSCOUNT -ge 2 ]; then
2016 $LFS path2fid $DIR/$tdir/a2/f2
2017 $LFS getstripe $DIR/$tdir/a2/f2
2020 echo "The file size should be correct after layout LFSCK scanning"
2021 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2022 [ "$cur_size" == "$saved_size" ] ||
2023 error "(7) Expect file1 size $saved_size, but got $cur_size"
# Both files were written with the same dd parameters, so f2 is compared
# against the size saved for f1.
2025 if [ $MDSCOUNT -ge 2 ]; then
2026 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2027 [ "$cur_size" == "$saved_size" ] ||
2028 error "(8) Expect file2 size $saved_size, but got $cur_size"
2031 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test 18c, registered by the run_test call that closes this block.
# Scenario: files are created while ost1 has fail_loc=0x1617 set (missing
# parent FID, per the message below) and are then removed while the MDTs have
# fail_loc=0x1616 set (lost MDT-object).  A layout LFSCK started with "-r -o"
# must finish as 'completed' everywhere, report the repaired orphans on mds1
# only, and leave "*-N-*" stubs under .lustre/lost+found/MDT0000.
2035 echo "The target MDT-object is lost, and the OST-object FID is missing."
2036 echo "The LFSCK should re-create the MDT-object with new FID under the "
2037 echo "directory .lustre/lost+found/MDTxxxx."
2040 check_mount_and_prep
# a1 is created with "lfs mkdir -i 0"; files under it get one 1M stripe
# starting at OST index 0.
2041 $LFS mkdir -i 0 $DIR/$tdir/a1
2042 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2044 echo "Inject failure, to simulate the case of missing parent FID"
2045 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2046 do_facet ost1 $LCTL set_param fail_loc=0x1617
# The data is written while the ost1 fail_loc above is still in effect.
2048 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2049 $LFS getstripe $DIR/$tdir/a1/f1
# With at least two MDTs, repeat with a directory created via "-i 1"; its
# file uses the same OST index 0.
2051 if [ $MDSCOUNT -ge 2 ]; then
2052 $LFS mkdir -i 1 $DIR/$tdir/a2
2053 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a2
2054 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2055 $LFS getstripe $DIR/$tdir/a2/f2
2058 cancel_lru_locks osc
2060 echo "Inject failure, to simulate the case of missing the MDT-object"
2061 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2062 do_facet mds1 $LCTL set_param fail_loc=0x1616
2063 rm -f $DIR/$tdir/a1/f1
2065 if [ $MDSCOUNT -ge 2 ]; then
2066 do_facet mds2 $LCTL set_param fail_loc=0x1616
2067 rm -f $DIR/$tdir/a2/f2
# Clear the fail_loc values set on the MDTs before starting LFSCK.
2073 do_facet mds1 $LCTL set_param fail_loc=0
2074 if [ $MDSCOUNT -ge 2 ]; then
2075 do_facet mds2 $LCTL set_param fail_loc=0
2078 cancel_lru_locks mdc
2079 cancel_lru_locks osc
2081 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
# NOTE(review): "-r -o" presumably mean reset + orphan handling; confirm
# against the lctl lfsck_start documentation.
2082 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2084 for k in $(seq $MDSCOUNT); do
2085 # The LFSCK status query interval is 30 seconds. For the case
2086 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2087 # time to guarantee the status sync up.
2088 wait_update_facet mds${k} "$LCTL get_param -n \
2089 mdd.$(facet_svc mds${k}).lfsck_layout |
2090 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2091 error "(2) MDS${k} is not the expected 'completed'"
# Unlike the MDTs, each OST status is read once, without polling.
2094 for k in $(seq $OSTCOUNT); do
2095 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2096 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2097 awk '/^status/ { print $2 }')
2098 [ "$cur_status" == "completed" ] ||
2099 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): 'expected' (compared below) is not assigned on any line
# visible here; presumably it is set inside this MDSCOUNT branch -- confirm.
2102 if [ $MDSCOUNT -ge 2 ]; then
2108 local repaired=$(do_facet mds1 $LCTL get_param -n \
2109 mdd.$(facet_svc mds1).lfsck_layout |
2110 awk '/^repaired_orphan/ { print $2 }')
2111 [ $repaired -eq $expected ] ||
2112 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# mds2 must report zero repaired orphans even though a2/f2 was removed too.
2114 if [ $MDSCOUNT -ge 2 ]; then
2115 repaired=$(do_facet mds2 $LCTL get_param -n \
2116 mdd.$(facet_svc mds2).lfsck_layout |
2117 awk '/^repaired_orphan/ { print $2 }')
2118 [ $repaired -eq 0 ] ||
2119 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2122 ls -ail $MOUNT/.lustre/lost+found/
2124 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
# NOTE(review): the -name pattern is unquoted, so the shell may glob-expand
# it against the current directory before find sees it; 'cname' is also not
# declared local on any visible line -- confirm both.
2125 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2126 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name *-N-*)
2128 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2131 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2132 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2133 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2135 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-N-*)
2136 [ ! -z "$cname" ] ||
2137 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2139 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d, registered by the run_test call that closes this block.
# Scenario: f2 is made to reference f1's OST-object (fail_loc=0x1618 during
# chown), then f1 is removed, so f2 becomes a dangling reference while its
# original OST-object is still on disk.  After a restart, a layout LFSCK run
# with "-r -o -c" must report one repaired orphan and f2 must get its
# original size back.
2143 echo "The target MDT-object layout EA slot is occpuied by some new"
2144 echo "created OST-object when repair dangling reference case. Such"
2145 echo "conflict OST-object has never been modified. Then when found"
2146 echo "the orphan OST-object, LFSCK will replace it with the orphan"
2150 check_mount_and_prep
2152 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2153 echo "guard" > $DIR/$tdir/a1/f1
2154 echo "foo" > $DIR/$tdir/a1/f2
# With "ls -i" the inode number is column 1, so the size is column 6.
2155 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2156 $LFS path2fid $DIR/$tdir/a1/f1
2157 $LFS getstripe $DIR/$tdir/a1/f1
2158 $LFS path2fid $DIR/$tdir/a1/f2
2159 $LFS getstripe $DIR/$tdir/a1/f2
2160 cancel_lru_locks osc
2162 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2163 echo "to reference the same OST-object (which is f1's OST-obejct)."
2164 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2165 echo "dangling reference case, but f2's old OST-object is there."
2168 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2169 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
# The chown runs while fail_loc=0x1618 is set on the MDS.
# NOTE(review): "1.1" uses the legacy '.' owner/group separator; ':' is the
# form documented for chown -- confirm it is accepted everywhere this runs.
2170 chown 1.1 $DIR/$tdir/a1/f2
2171 rm -f $DIR/$tdir/a1/f1
2174 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2176 echo "stopall to cleanup object cache"
2179 setupall > /dev/null
2181 echo "The file size should be incorrect since dangling referenced"
2182 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2183 [ "$cur_size" != "$saved_size" ] ||
2184 error "(1) Expect incorrect file2 size"
# The delay failure (fail_val=5) is armed before LFSCK starts and is cleared
# only after mds1 has been seen in 'scanning-phase2'.
2186 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2187 do_facet $SINGLEMDS $LCTL set_param fail_val=5 fail_loc=0x1602
2189 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
# NOTE(review): "-c" presumably asks LFSCK to create the missing OST-object
# for the dangling reference (see the messages above) -- confirm.
2190 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2192 wait_update_facet mds1 "$LCTL get_param -n \
2193 mdd.$(facet_svc mds1).lfsck_layout |
2194 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2195 error "(3.0) MDS1 is not the expected 'scanning-phase2'"
2197 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2199 for k in $(seq $MDSCOUNT); do
2200 # The LFSCK status query interval is 30 seconds. For the case
2201 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2202 # time to guarantee the status sync up.
2203 wait_update_facet mds${k} "$LCTL get_param -n \
2204 mdd.$(facet_svc mds${k}).lfsck_layout |
2205 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2206 error "(3) MDS${k} is not the expected 'completed'"
# Each OST status is read once, without polling.
2209 for k in $(seq $OSTCOUNT); do
2210 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2211 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2212 awk '/^status/ { print $2 }')
2213 [ "$cur_status" == "completed" ] ||
2214 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
2217 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2218 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2219 awk '/^repaired_orphan/ { print $2 }')
2220 [ $repaired -eq 1 ] ||
2221 error "(5) Expect 1 orphan has been fixed, but got: $repaired"
2223 echo "The file size should be correct after layout LFSCK scanning"
2224 cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2225 [ "$cur_size" == "$saved_size" ] ||
2226 error "(6) Expect file2 size $saved_size, but got $cur_size"
# The remaining commands only print f2's content, FID and layout to the log;
# their results are not checked.
2228 echo "The LFSCK should find back the original data."
2229 cat $DIR/$tdir/a1/f2
2230 $LFS path2fid $DIR/$tdir/a1/f2
2231 $LFS getstripe $DIR/$tdir/a1/f2
2233 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e, registered by the run_test call that closes this block.
# Same setup as the preceding test (f2 left dangling, its old OST-object
# still present), but here f2 is appended to while LFSCK sits in
# 'scanning-phase2'.  The test then expects one repaired orphan and a
# "*-C-*" stub under .lustre/lost+found/MDT0000 whose size equals f2's
# original size.
2237 echo "The target MDT-object layout EA slot is occpuied by some new"
2238 echo "created OST-object when repair dangling reference case. Such"
2239 echo "conflict OST-object has been modified by others. To keep the"
2240 echo "new data, the LFSCK will create a new file to refernece this"
2241 echo "old orphan OST-object."
2244 check_mount_and_prep
2246 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2247 echo "guard" > $DIR/$tdir/a1/f1
2248 echo "foo" > $DIR/$tdir/a1/f2
# With "ls -i" the inode number is column 1, so the size is column 6.
2249 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2250 $LFS path2fid $DIR/$tdir/a1/f1
2251 $LFS getstripe $DIR/$tdir/a1/f1
2252 $LFS path2fid $DIR/$tdir/a1/f2
2253 $LFS getstripe $DIR/$tdir/a1/f2
2254 cancel_lru_locks osc
2256 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2257 echo "to reference the same OST-object (which is f1's OST-obejct)."
2258 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2259 echo "dangling reference case, but f2's old OST-object is there."
2262 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2263 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
# NOTE(review): "1.1" uses the legacy '.' owner/group separator; ':' is the
# form documented for chown -- confirm it is accepted everywhere this runs.
2264 chown 1.1 $DIR/$tdir/a1/f2
2265 rm -f $DIR/$tdir/a1/f1
2268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2270 echo "stopall to cleanup object cache"
2273 setupall > /dev/null
2275 echo "The file size should be incorrect since dangling referenced"
2276 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2277 [ "$cur_size" != "$saved_size" ] ||
2278 error "(1) Expect incorrect file2 size"
# The delay failure (fail_val=10) stays armed until after f2 has been
# appended to below.
2280 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2281 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2283 start_full_debug_logging
2285 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2286 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2288 wait_update_facet mds1 "$LCTL get_param -n \
2289 mdd.$(facet_svc mds1).lfsck_layout |
2290 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2291 error "(3) MDS1 is not the expected 'scanning-phase2'"
2293 # to guarantee all updates are synced.
2297 echo "Write new data to f2 to modify the new created OST-object."
2298 echo "dummy" >> $DIR/$tdir/a1/f2
2300 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2302 for k in $(seq $MDSCOUNT); do
2303 # The LFSCK status query interval is 30 seconds. For the case
2304 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2305 # time to guarantee the status sync up.
2306 wait_update_facet mds${k} "$LCTL get_param -n \
2307 mdd.$(facet_svc mds${k}).lfsck_layout |
2308 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2309 error "(4) MDS${k} is not the expected 'completed'"
# Each OST status is read once, without polling.
2312 for k in $(seq $OSTCOUNT); do
2313 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2314 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2315 awk '/^status/ { print $2 }')
2316 [ "$cur_status" == "completed" ] ||
2317 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2320 stop_full_debug_logging
2322 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2323 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2324 awk '/^repaired_orphan/ { print $2 }')
2325 [ $repaired -eq 1 ] ||
2326 error "(6) Expect 1 orphan has been fixed, but got: $repaired"
2328 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2329 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2330 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# NOTE(review): the -name pattern is unquoted (the shell may glob-expand it
# against the current directory), 'cname' is not declared local on any
# visible line, and the "ls -il $cname" below assumes find printed exactly
# one path -- confirm.
2332 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-C-*)
2333 [ ! -z "$cname" ] ||
2334 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2336 echo "The stub file should keep the original f2 data"
2337 cur_size=$(ls -il $cname | awk '{ print $6 }')
2338 [ "$cur_size" == "$saved_size" ] ||
2339 error "(9) Expect file2 size $saved_size, but got $cur_size"
2342 $LFS path2fid $cname
2343 $LFS getstripe $cname
# f2's content is only printed to the log; it is not compared to anything.
2345 echo "The f2 should contains new data."
2346 cat $DIR/$tdir/a1/f2
2347 $LFS path2fid $DIR/$tdir/a1/f2
2348 $LFS getstripe $DIR/$tdir/a1/f2
2350 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f, registered by the run_test call that closes this block.
# Needs at least two OSTs.  Files with 1-stripe and 2-stripe layouts are
# created and then removed while the MDTs have fail_loc=0x1616 set.  The
# first layout LFSCK runs with fail_loc=0x161c set on mds1 and is expected to
# end as 'partial' on every MDT and on ost1, 'completed' on ost2, with one
# repaired orphan per MDT.  A second run, with the failure cleared, must end
# as 'completed' everywhere with two repaired orphans per MDT.
2353 [ $OSTCOUNT -lt 2 ] &&
2354 skip "The test needs at least 2 OSTs" && return
2357 echo "The target MDT-object is lost. The LFSCK should re-create the"
2358 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2359 echo "to verify some OST-object(s) during the first stage-scanning,"
2360 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2361 echo "should not be affected."
2364 check_mount_and_prep
# a1: single-stripe files starting at OST index 0.
# a2: two-stripe files starting at OST index 0.
2365 $LFS mkdir -i 0 $DIR/$tdir/a1
2366 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2367 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2368 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2369 $LFS mkdir -i 0 $DIR/$tdir/a2
2370 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2371 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2372 $LFS getstripe $DIR/$tdir/a1/f1
2373 $LFS getstripe $DIR/$tdir/a2/f2
# With a second MDT, mirror the same layouts under directories created with
# "lfs mkdir -i 1".
2375 if [ $MDSCOUNT -ge 2 ]; then
2376 $LFS mkdir -i 1 $DIR/$tdir/a3
2377 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a3
2378 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2379 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2380 $LFS mkdir -i 1 $DIR/$tdir/a4
2381 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2382 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2383 $LFS getstripe $DIR/$tdir/a3/f3
2384 $LFS getstripe $DIR/$tdir/a4/f4
2387 cancel_lru_locks osc
2389 echo "Inject failure, to simulate the case of missing the MDT-object"
2390 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2391 do_facet mds1 $LCTL set_param fail_loc=0x1616
2392 rm -f $DIR/$tdir/a1/f1
2393 rm -f $DIR/$tdir/a2/f2
2395 if [ $MDSCOUNT -ge 2 ]; then
2396 do_facet mds2 $LCTL set_param fail_loc=0x1616
2397 rm -f $DIR/$tdir/a3/f3
2398 rm -f $DIR/$tdir/a4/f4
# Clear the fail_loc values set on the MDTs before starting LFSCK.
2404 do_facet mds1 $LCTL set_param fail_loc=0
2405 if [ $MDSCOUNT -ge 2 ]; then
2406 do_facet mds2 $LCTL set_param fail_loc=0
2409 cancel_lru_locks mdc
2410 cancel_lru_locks osc
2412 echo "Inject failure, to simulate the OST0 fail to handle"
2413 echo "MDT0 LFSCK request during the first-stage scanning."
2414 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2415 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2417 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2418 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# First pass: every MDT is expected to finish as 'partial'.
2420 for k in $(seq $MDSCOUNT); do
2421 # The LFSCK status query interval is 30 seconds. For the case
2422 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2423 # time to guarantee the status sync up.
2424 wait_update_facet mds${k} "$LCTL get_param -n \
2425 mdd.$(facet_svc mds${k}).lfsck_layout |
2426 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2427 error "(2) MDS${k} is not the expected 'partial'"
# ost1 must also end as 'partial', while ost2 must reach 'completed'.
2430 wait_update_facet ost1 "$LCTL get_param -n \
2431 obdfilter.$(facet_svc ost1).lfsck_layout |
2432 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2433 error "(3) OST1 is not the expected 'partial'"
2436 wait_update_facet ost2 "$LCTL get_param -n \
2437 obdfilter.$(facet_svc ost2).lfsck_layout |
2438 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2439 error "(4) OST2 is not the expected 'completed'"
2442 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# NOTE(review): "mds{1}" / "mds{2}" in the error messages below have no '$',
# so they are printed literally.
2444 local repaired=$(do_facet mds1 $LCTL get_param -n \
2445 mdd.$(facet_svc mds1).lfsck_layout |
2446 awk '/^repaired_orphan/ { print $2 }')
2447 [ $repaired -eq 1 ] ||
2448 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2450 if [ $MDSCOUNT -ge 2 ]; then
2451 repaired=$(do_facet mds2 $LCTL get_param -n \
2452 mdd.$(facet_svc mds2).lfsck_layout |
2453 awk '/^repaired_orphan/ { print $2 }')
2454 [ $repaired -eq 1 ] ||
2455 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second pass, now without the injected failure.
2458 echo "Trigger layout LFSCK on all devices again to cleanup"
2459 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2461 for k in $(seq $MDSCOUNT); do
2462 # The LFSCK status query interval is 30 seconds. For the case
2463 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2464 # time to guarantee the status sync up.
2465 wait_update_facet mds${k} "$LCTL get_param -n \
2466 mdd.$(facet_svc mds${k}).lfsck_layout |
2467 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2468 error "(8) MDS${k} is not the expected 'completed'"
# NOTE(review): 'cur_status' is assigned here without a visible 'local'
# declaration in this test -- confirm it is not meant to be local.
2471 for k in $(seq $OSTCOUNT); do
2472 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2473 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2474 awk '/^status/ { print $2 }')
2475 [ "$cur_status" == "completed" ] ||
2476 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# 'repaired' was already declared local above; repeating 'local' here is
# redundant.
2480 local repaired=$(do_facet mds1 $LCTL get_param -n \
2481 mdd.$(facet_svc mds1).lfsck_layout |
2482 awk '/^repaired_orphan/ { print $2 }')
2483 [ $repaired -eq 2 ] ||
2484 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2486 if [ $MDSCOUNT -ge 2 ]; then
2487 repaired=$(do_facet mds2 $LCTL get_param -n \
2488 mdd.$(facet_svc mds2).lfsck_layout |
2489 awk '/^repaired_orphan/ { print $2 }')
2490 [ $repaired -eq 2 ] ||
2491 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2494 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Test 18g, registered by the run_test call that closes this block.
# Scenario: a1/f1 is removed while mds1 has fail_loc=0x162e set (MDT-object
# lost but OI mapping kept, per the message below).  A layout LFSCK with
# "-r -o" must end as 'completed' everywhere, report $OSTCOUNT repaired
# orphans on mds1, and leave "<fid1>-R-0" under .lustre/lost+found/MDT0000,
# which is then moved back to a1/f1.
2498 echo "The target MDT-object is lost, but related OI mapping is there"
2499 echo "The LFSCK should recreate the lost MDT-object without affected"
2500 echo "by the stale OI mapping."
2503 check_mount_and_prep
2504 $LFS mkdir -i 0 $DIR/$tdir/a1
# bs=1M with count=$OSTCOUNT matches the 1M stripe size; with "-c -1" that
# presumably puts one 1M chunk on every OST -- confirm.
2505 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2506 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
# The FID is saved now because it is part of the lost+found entry name used
# by the mv near the end of the test.
2507 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2509 $LFS getstripe $DIR/$tdir/a1/f1
2510 cancel_lru_locks osc
2512 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2513 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2514 do_facet mds1 $LCTL set_param fail_loc=0x162e
2515 rm -f $DIR/$tdir/a1/f1
2517 do_facet mds1 $LCTL set_param fail_loc=0
2518 cancel_lru_locks mdc
2519 cancel_lru_locks osc
2521 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2522 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2524 for k in $(seq $MDSCOUNT); do
2525 # The LFSCK status query interval is 30 seconds. For the case
2526 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2527 # time to guarantee the status sync up.
2528 wait_update_facet mds${k} "$LCTL get_param -n \
2529 mdd.$(facet_svc mds${k}).lfsck_layout |
2530 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2531 error "(2) MDS${k} is not the expected 'completed'"
# Each OST status is read once, without polling.
2534 for k in $(seq $OSTCOUNT); do
2535 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2536 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2537 awk '/^status/ { print $2 }')
2538 [ "$cur_status" == "completed" ] ||
2539 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# One repaired orphan is expected per OST.
2542 local repaired=$(do_facet mds1 $LCTL get_param -n \
2543 mdd.$(facet_svc mds1).lfsck_layout |
2544 awk '/^repaired_orphan/ { print $2 }')
2545 [ $repaired -eq $OSTCOUNT ] ||
2546 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
2548 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2549 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2550 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
# Print the restored file's FID and layout to the log; results not checked.
2552 $LFS path2fid $DIR/$tdir/a1/f1
2553 $LFS getstripe $DIR/$tdir/a1/f1
2555 run_test 18g "Find out orphan OST-object and repair it (7)"
# Runs on the local node (plain $LCTL, not do_facet) and discards stdout.
# NOTE(review): "debug=-cache" presumably removes the 'cache' flag from the
# Lustre debug mask for the tests that follow -- confirm.
2557 $LCTL set_param debug=-cache > /dev/null
# Test 19a, registered by the run_test call that closes this block.
# Sets lfsck_verify_pfid to 1 on ost1, then sets fail_loc=0x1619 on the
# local node so that (per the message below) the client offers a wrong
# parent FID; reading a0 is then expected to fail.
2560 check_mount_and_prep
2561 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2563 echo "foo" > $DIR/$tdir/a0
2564 echo "guard" > $DIR/$tdir/a1
2565 cancel_lru_locks osc
2567 echo "Inject failure, then client will offer wrong parent FID when read"
2568 do_facet ost1 $LCTL set_param -n \
2569 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
# This fail_loc is set with plain $LCTL, i.e. on the node running the
# script, not on a server facet.
2570 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2571 $LCTL set_param fail_loc=0x1619
2573 echo "Read RPC with wrong parent FID should be denied"
# A successful cat is the failure case here.
# NOTE(review): no visible line resets lfsck_verify_pfid before the test ends.
2574 cat $DIR/$tdir/a0 && error "(3) Read should be denied!"
2575 $LCTL set_param fail_loc=0
2577 run_test 19a "OST-object inconsistency self detect"
# Test 19b, registered by the run_test call that closes this block.
# f0 is written while ost1 has fail_loc=0x1611 set (OST-object pointing back
# to a non-existent MDT-object, per the message below).  The 'repaired'
# counter read from lfsck_verify_pfid must be 0 before f0 is read and 1
# after lfsck_verify_pfid is set to 1 and f0 has been read successfully.
2580 check_mount_and_prep
2581 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2583 echo "Inject failure stub to make the OST-object to back point to"
2584 echo "non-exist MDT-object"
2586 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2587 do_facet ost1 $LCTL set_param fail_loc=0x1611
2588 echo "foo" > $DIR/$tdir/f0
2589 cancel_lru_locks osc
2590 do_facet ost1 $LCTL set_param fail_loc=0
# NOTE(review): no visible line turns lfsck_verify_pfid off before this
# first check; the expected 0 presumably relies on f0 not having been read
# yet -- confirm.
2592 echo "Nothing should be fixed since self detect and repair is disabled"
2593 local repaired=$(do_facet ost1 $LCTL get_param -n \
2594 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2595 awk '/^repaired/ { print $2 }')
2596 [ $repaired -eq 0 ] ||
2597 error "(1) Expected 0 repaired, but got $repaired"
2599 echo "Read RPC with right parent FID should be accepted,"
2600 echo "and cause parent FID on OST to be fixed"
2602 do_facet ost1 $LCTL set_param -n \
2603 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
# Unlike the previous test, this read must succeed.
2604 cat $DIR/$tdir/f0 || error "(2) Read should not be denied!"
2606 repaired=$(do_facet ost1 $LCTL get_param -n \
2607 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2608 awk '/^repaired/ { print $2 }')
2609 [ $repaired -eq 1 ] ||
2610 error "(3) Expected 1 repaired, but got $repaired"
2612 run_test 19b "OST-object inconsistency self repair"
# Test 20, registered by the run_test call that closes this block.
# Needs at least two OSTs.  Files f0..f2 (plus f3 with more than two OSTs)
# are striped over 2 or 3 OSTs, then removed under failure injection so
# that f0 loses only its MDT-object and each other file also loses one
# OST-object (index 0, 1, 2 respectively, per the messages below).  After a
# layout LFSCK, each "<fid>-R-0" entry under .lustre/lost+found/MDT0000 is
# checked for: LOV hole flag, stripe count, size, read/write/append
# behaviour, chown, touch and unlink.
# NOTE(review): 'bcount' is used throughout but assigned on lines not
# visible here; 'fid3' and 'LOV_PATTERN_F_HOLE' are assigned without a
# visible 'local' -- confirm.
2615 [ $OSTCOUNT -lt 2 ] &&
2616 skip "The test needs at least 2 OSTs" && return
2619 echo "The target MDT-object and some of its OST-object are lost."
2620 echo "The LFSCK should find out the left OST-objects and re-create"
2621 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2622 echo "with the partial OST-objects (LOV EA hole)."
2624 echo "New client can access the file with LOV EA hole via normal"
2625 echo "system tools or commands without crash the system."
2627 echo "For old client, even though it cannot access the file with"
2628 echo "LOV EA hole, it should not cause the system crash."
2631 check_mount_and_prep
2632 $LFS mkdir -i 0 $DIR/$tdir/a1
# Three stripes when more than two OSTs exist, otherwise two.
2633 if [ $OSTCOUNT -gt 2 ]; then
2634 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
2637 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
2641 # 256 blocks on the stripe0.
2642 # 1 block on the stripe1 for 2 OSTs case.
2643 # 256 blocks on the stripe1 for other cases.
2644 # 1 block on the stripe2 if OSTs > 2
2645 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2646 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2647 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# FIDs are saved now because they form the lost+found entry names checked
# later ("<fid>-R-0").
2649 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2650 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2651 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2654 $LFS getstripe $DIR/$tdir/a1/f0
2656 $LFS getstripe $DIR/$tdir/a1/f1
2658 $LFS getstripe $DIR/$tdir/a1/f2
2660 if [ $OSTCOUNT -gt 2 ]; then
2661 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
2662 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2664 $LFS getstripe $DIR/$tdir/a1/f3
2667 cancel_lru_locks osc
2669 echo "Inject failure..."
2670 echo "To simulate f0 lost MDT-object"
2671 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2672 do_facet mds1 $LCTL set_param fail_loc=0x1616
2673 rm -f $DIR/$tdir/a1/f0
# From here fail_loc stays 0x161a and only fail_val changes between the
# removals; judging by the messages, fail_val selects which OST-object
# index is lost (f1 is removed with fail_val left untouched) -- confirm.
2675 echo "To simulate f1 lost MDT-object and OST-object0"
2676 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
2677 do_facet mds1 $LCTL set_param fail_loc=0x161a
2678 rm -f $DIR/$tdir/a1/f1
2680 echo "To simulate f2 lost MDT-object and OST-object1"
2681 do_facet mds1 $LCTL set_param fail_val=1
2682 rm -f $DIR/$tdir/a1/f2
2684 if [ $OSTCOUNT -gt 2 ]; then
2685 echo "To simulate f3 lost MDT-object and OST-object2"
2686 do_facet mds1 $LCTL set_param fail_val=2
2687 rm -f $DIR/$tdir/a1/f3
# The client stays unmounted while LFSCK runs and is mounted again before
# the lost+found entries are inspected.
2690 umount_client $MOUNT
2693 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2695 echo "Inject failure to slow down the LFSCK on OST0"
2696 #define OBD_FAIL_LFSCK_DELAY5 0x161b
2697 do_facet ost1 $LCTL set_param fail_loc=0x161b
2699 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2700 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2703 do_facet ost1 $LCTL set_param fail_loc=0
# Note: this wait uses a literal 32 second timeout rather than $LTIME.
2705 for k in $(seq $MDSCOUNT); do
2706 # The LFSCK status query interval is 30 seconds. For the case
2707 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2708 # time to guarantee the status sync up.
2709 wait_update_facet mds${k} "$LCTL get_param -n \
2710 mdd.$(facet_svc mds${k}).lfsck_layout |
2711 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2712 error "(2) MDS${k} is not the expected 'completed'"
# Each OST status is read once, without polling.
2715 for k in $(seq $OSTCOUNT); do
2716 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2717 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2718 awk '/^status/ { print $2 }')
2719 [ "$cur_status" == "completed" ] ||
2720 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected counts are consistent with one repair per surviving OST-object:
# 3 + 2 + 2 + 2 = 9 with three stripes (f0..f3), 2 + 1 + 1 = 4 with two
# stripes (f0..f2).
2723 local repaired=$(do_facet mds1 $LCTL get_param -n \
2724 mdd.$(facet_svc mds1).lfsck_layout |
2725 awk '/^repaired_orphan/ { print $2 }')
2726 if [ $OSTCOUNT -gt 2 ]; then
2727 [ $repaired -eq 9 ] ||
2728 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
2730 [ $repaired -eq 4 ] ||
2731 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
2734 mount_client $MOUNT || error "(5.0) Fail to start client!"
# Bit tested against the value printed by "lfs getstripe -L" below.
2736 LOV_PATTERN_F_HOLE=0x40000000
2739 # ${fid0}-R-0 is the old f0
2741 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
2742 echo "Check $name, which is the old f0"
2744 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
# The getstripe output is prefixed with "0x" so that shell arithmetic
# treats it as hexadecimal.
2746 local pattern=0x$($LFS getstripe -L $name)
2747 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2748 error "(5.2) NOT expect pattern flag hole, but got $pattern"
2750 local stripes=$($LFS getstripe -c $name)
2751 if [ $OSTCOUNT -gt 2 ]; then
2752 [ $stripes -eq 3 ] ||
2753 error "(5.3.1) expect the stripe count is 3, but got $stripes"
2755 [ $stripes -eq 2 ] ||
2756 error "(5.3.2) expect the stripe count is 2, but got $stripes"
2759 local size=$(stat $name | awk '/Size:/ { print $2 }')
2760 [ $size -eq $((4096 * $bcount)) ] ||
2761 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
# f0 lost no OST-object, so every normal operation must succeed.
2763 cat $name > /dev/null || error "(5.5) cannot read $name"
2765 echo "dummy" >> $name || error "(5.6) cannot write $name"
2767 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
2769 touch $name || error "(5.8) cannot touch $name"
2771 rm -f $name || error "(5.9) cannot unlink $name"
2774 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
2776 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2777 if [ $OSTCOUNT -gt 2 ]; then
2778 echo "Check $name, it contains the old f1's stripe1 and stripe2"
2780 echo "Check $name, it contains the old f1's stripe1"
2783 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
2785 pattern=0x$($LFS getstripe -L $name)
2786 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2787 error "(6.2) expect pattern flag hole, but got $pattern"
2789 stripes=$($LFS getstripe -c $name)
2790 if [ $OSTCOUNT -gt 2 ]; then
2791 [ $stripes -eq 3 ] ||
2792 error "(6.3.1) expect the stripe count is 3, but got $stripes"
2794 [ $stripes -eq 2 ] ||
2795 error "(6.3.2) expect the stripe count is 2, but got $stripes"
2798 size=$(stat $name | awk '/Size:/ { print $2 }')
2799 [ $size -eq $((4096 * $bcount)) ] ||
2800 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
2802 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# One 1M stripe is 256 blocks of 4096 bytes; with conv=noerror dd keeps
# going, so the lost stripe0 should produce 256 "Input/output error" lines.
2804 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2805 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2808 [ $failures -eq 256 ] ||
2809 error "(6.6) expect 256 IO failures, but get $failures"
2811 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2812 [ $size -eq $((4096 * $bcount)) ] ||
2813 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Block 0 is in the lost stripe0; block 300 (300 * 4096 bytes is past the
# first 1M) falls in stripe1, which still exists.
2815 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
2816 error "(6.8) write to the LOV EA hole should fail"
2818 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
2819 error "(6.9) write to normal stripe should NOT fail"
2821 echo "foo" >> $name && error "(6.10) append write $name should fail"
2823 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
2825 touch $name || error "(6.12) cannot touch $name"
2827 rm -f $name || error "(6.13) cannot unlink $name"
2830 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
2832 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
2833 if [ $OSTCOUNT -gt 2 ]; then
2834 echo "Check $name, it contains the old f2's stripe0 and stripe2"
2836 echo "Check $name, it contains the old f2's stripe0"
2839 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
2841 pattern=0x$($LFS getstripe -L $name)
2842 stripes=$($LFS getstripe -c $name)
2843 size=$(stat $name | awk '/Size:/ { print $2 }')
# More than two OSTs: stripe1 is missing in the middle, so a hole is
# expected and the roles of block 0 and block 300 are swapped relative to
# the f1 checks above.
2844 if [ $OSTCOUNT -gt 2 ]; then
2845 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2846 error "(7.2.1) expect pattern flag hole, but got $pattern"
2848 [ $stripes -eq 3 ] ||
2849 error "(7.3.1) expect the stripe count is 3, but got $stripes"
2851 [ $size -eq $((4096 * $bcount)) ] ||
2852 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
2854 cat $name > /dev/null &&
2855 error "(7.5.1) normal read $name should fail"
2857 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2858 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2860 [ $failures -eq 256 ] ||
2861 error "(7.6) expect 256 IO failures, but get $failures"
2863 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2864 [ $size -eq $((4096 * $bcount)) ] ||
2865 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
2867 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
2868 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
2870 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
2871 error "(7.8.1) write to normal stripe should NOT fail"
2873 echo "foo" >> $name &&
2874 error "(7.8.3) append write $name should fail"
2876 chown $RUNAS_ID:$RUNAS_GID $name ||
2877 error "(7.9.1) cannot chown on $name"
2879 touch $name || error "(7.10.1) cannot touch $name"
# Two OSTs: the lost stripe1 was the last one, so the file is expected to
# come back with a single stripe, no hole, and 256 blocks.
2881 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2882 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
2884 [ $stripes -eq 1 ] ||
2885 error "(7.3.2) expect the stripe count is 1, but got $stripes"
2888 [ $size -eq $((4096 * (256 + 0))) ] ||
2889 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
2891 cat $name > /dev/null || error "(7.5.2) cannot read $name"
2893 echo "dummy" >> $name || error "(7.8.2) cannot write $name"
2895 chown $RUNAS_ID:$RUNAS_GID $name ||
2896 error "(7.9.2) cannot chown on $name"
2898 touch $name || error "(7.10.2) cannot touch $name"
2901 rm -f $name || error "(7.11) cannot unlink $name"
# f3 only exists when there are more than two OSTs.
2903 [ $OSTCOUNT -le 2 ] && return
2906 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
2908 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2909 echo "Check $name, which contains the old f3's stripe0 and stripe1"
2911 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
2913 pattern=0x$($LFS getstripe -L $name)
2914 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2915 error "(8.2) NOT expect pattern flag hole, but got $pattern"
2917 stripes=$($LFS getstripe -c $name)
2918 # LFSCK does not know the old f3 had 3 stripes.
2919 # It only tries to find as much as possible.
2920 # The stripe count depends on the last stripe's offset.
2921 [ $stripes -eq 2 ] ||
2922 error "(8.3) expect the stripe count is 2, but got $stripes"
2924 size=$(stat $name | awk '/Size:/ { print $2 }')
2926 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
2927 error "(8.4) expect the size $((4096 * 512)), but got $size"
2929 cat $name > /dev/null || error "(8.5) cannot read $name"
2931 echo "dummy" >> $name || error "(8.6) cannot write $name"
2933 chown $RUNAS_ID:$RUNAS_GID $name ||
2934 error "(8.7) cannot chown on $name"
2936 touch $name || error "(8.8) cannot touch $name"
2938 rm -f $name || error "(8.9) cannot unlink $name"
2940 run_test 20 "Handle the orphan with dummy LOV EA slot properly"
# Test 21, registered by the run_test call that closes this block.
# Skipped when the MDS is older than 2.5.59.  Starts LFSCK on MDT0000
# without a "-t" type option, checks that both the namespace and the layout
# components report 'scanning-phase1', then stops LFSCK.
2943 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
2944 skip "ignore the test if MDS is older than 2.5.59" && return
2946 check_mount_and_prep
2947 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
# NOTE(review): "-s 1" presumably throttles the scan so that it is still in
# phase 1 when the status is sampled below -- confirm.
2949 echo "Start all LFSCK components by default (-s 1)"
2950 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
2951 error "Fail to start LFSCK"
# $SHOW_NAMESPACE and $SHOW_LAYOUT are defined outside this block; their
# output is expected to contain a "status: <value>" style line.
2953 echo "namespace LFSCK should be in 'scanning-phase1' status"
2954 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
2955 [ "$STATUS" == "scanning-phase1" ] ||
2956 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
2958 echo "layout LFSCK should be in 'scanning-phase1' status"
2959 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
2960 [ "$STATUS" == "scanning-phase1" ] ||
2961 error "Expect layout 'scanning-phase1', but got '$STATUS'"
2963 echo "Stop all LFSCK components by default"
2964 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
2965 error "Fail to stop LFSCK"
2967 run_test 21 "run all LFSCK components by default"
# Test 22a, registered by the run_test call that closes this block.
# Needs at least two MDTs.  foo/dummy is created while the MDS has
# fail_loc=0x161e set, so that (per the messages below) its ".." entry
# references 'guard'; 'guard' is then removed.  A namespace LFSCK started
# with "-A -r" must report one repaired unmatched pair, after which listing
# foo/dummy must succeed.
2970 [ $MDSCOUNT -lt 2 ] &&
2971 skip "We need at least 2 MDSes for this test" && return
# In the two messages below, the adjacent quotes ("".."") close and reopen
# the string, so ".." is printed without quote characters around it.
2974 echo "The parent_A references the child directory via some name entry,"
2975 echo "but the child directory back references another parent_B via its"
2976 echo "".." name entry. The parent_B does not exist. Then the namespace"
2977 echo "LFSCK will repair the child directory's ".." name entry."
2980 check_mount_and_prep
2982 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2983 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2985 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2986 echo "The dummy's dotdot name entry references the guard."
2987 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2988 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2989 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2990 error "(3) Fail to mkdir on MDT0"
2991 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing 'guard' leaves dummy's ".." pointing at a directory that no
# longer exists.
2993 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
2995 echo "Trigger namespace LFSCK to repair unmatched pairs"
2996 $START_NAMESPACE -A -r ||
2997 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 is presumably the step number used in the
# helper's error message (steps here go 5, then 7) -- confirm.
2999 wait_all_targets_blocked namespace completed 6
3001 local repaired=$($SHOW_NAMESPACE |
3002 awk '/^unmatched_pairs_repaired/ { print $2 }')
3003 [ $repaired -eq 1 ] ||
3004 error "(7) Fail to repair unmatched pairs: $repaired"
3006 echo "'ls' should success after namespace LFSCK repairing"
3007 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3008 error "(8) ls should success."
3010 run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b: namespace LFSCK repairs a bad ".." entry together with the
# child's linkEA.
#
# Same injection as test_22a (fail_loc 0x161e while creating "foo/dummy"
# on MDT0), but "guard" is left in place.  Before LFSCK, fid2path on the
# child's FID must NOT resolve to $DIR/$tdir/foo/dummy; after LFSCK has
# reported unmatched_pairs_repaired == 1 it must.
# Skipped with fewer than two MDTs.
test_22b() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "We need at least 2 MDSes for this test" && return

	# The quotes around .. are escaped: an unescaped inner "" ends the
	# string early and the quotes never reach the log.
	echo "The parent_A references the child directory via the name entry_B,"
	echo "but the child directory back references another parent_C via its"
	echo "\"..\" name entry. The parent_C exists, but there is no the name"
	echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
	echo "the child directory's \"..\" name entry and its linkEA."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "and bad linkEA. The dummy's dotdot name entry references the"
	echo "guard. The dummy's linkEA references n non-exist name entry."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
	echo "fid2path should NOT work on the dummy's FID $dummyfid"
	local dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
		error "(4) fid2path works unexpectedly."

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
	# dummyname is already local; just refresh it after the repair.
	dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
		error "(8) fid2path does not work"
}
run_test 22b "LFSCK can repair unmatched pairs (2)"
# test_23a: dangling name entry whose MDT-object is gone.
#
# d0 lives on MDT0 and d0/d1 on MDT1.  d1 is removed while fail_loc 0x1620
# (OBD_FAIL_LFSCK_DANGLING2) is armed on mds2, leaving (per the messages
# below) a name entry without an object.  A first LFSCK run must count one
# dangling entry but leave 'ls' failing; a second run with -C must count
# one again and make 'ls' succeed.
# Skipped with fewer than two MDTs.
test_23a() {
	local fixed

	if [ $MDSCOUNT -lt 2 ]; then
		skip "We need at least 2 MDSes for this test" && return
	fi

	printf '%s\n' \
		"The name entry is there, but the MDT-object for such name " \
		"entry does not exist. The namespace LFSCK should find out " \
		"and repair the inconsistency as required."

	check_mount_and_prep

	if ! $LFS mkdir -i 0 $DIR/$tdir/d0; then
		error "(1) Fail to mkdir d0 on MDT0"
	fi
	if ! $LFS mkdir -i 1 $DIR/$tdir/d0/d1; then
		error "(2) Fail to mkdir d1 on MDT1"
	fi

	echo "Inject failure stub on MDT1 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING2	0x1620
	do_facet mds2 $LCTL set_param fail_loc=0x1620
	if ! rmdir $DIR/$tdir/d0/d1; then
		error "(3) Fail to rmdir d1"
	fi
	do_facet mds2 $LCTL set_param fail_loc=0

	echo "'ls' should fail because of dangling name entry"
	if ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1; then
		error "(4) ls should fail."
	fi

	# First pass: plain run, the entry is counted but not re-created.
	echo "Trigger namespace LFSCK to find out dangling name entry"
	if ! $START_NAMESPACE -A -r; then
		error "(5) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 6

	fixed=$($SHOW_NAMESPACE | awk '/^dangling_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(7) Fail to repair dangling name entry: $fixed"
	fi

	echo "'ls' should fail because not re-create MDT-object by default"
	if ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1; then
		error "(8) ls should fail."
	fi

	# Second pass: same run with -C added.
	echo "Trigger namespace LFSCK again to repair dangling name entry"
	if ! $START_NAMESPACE -A -r -C; then
		error "(9) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 10

	fixed=$($SHOW_NAMESPACE | awk '/^dangling_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(11) Fail to repair dangling name entry: $fixed"
	fi

	echo "'ls' should success after namespace LFSCK repairing"
	ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
}
run_test 23a "LFSCK can repair dangling name entry (1)"
# test_23b: dangling entry that really belongs to a multi-linked object.
#
# f0 and f1 are created under d0 on MDT0.  A hard link "foo" to f0 is made
# while fail_loc 0x1621 (OBD_FAIL_LFSCK_DANGLING3) is armed with fail_val
# set to f1's OID, then f1 is removed so that 'ls foo' fails.  After an
# LFSCK run with -C the test expects dangling_repaired == 1,
# multiple_linked_repaired == 1 and "foo" to read back f0's content.
test_23b() {
	local oid fixed content
	local status_cmd

	printf '%s\n' \
		"The objectA has multiple hard links, one of them corresponding" \
		"to the name entry_B. But there is something wrong for the name" \
		"entry_B and cause entry_B to references non-exist object_C." \
		"In the first-stage scanning, the LFSCK will think the entry_B" \
		"as dangling, and re-create the lost object_C. When the LFSCK" \
		"comes to the second-stage scanning, it will find that the" \
		"former re-creating object_C is not proper, and will try to" \
		"replace the object_C with the real object_A."

	check_mount_and_prep

	if [[ ! -d $MOUNT/.lustre/lost+found/MDT0000 ]]; then
		# Run LFSCK once up front so .lustre/lost+found/MDTxxxx is
		# generated in advance; this avoids reusing the local object
		# for the dangling name.
		if ! $START_NAMESPACE -r; then
			error "(0) Fail to start LFSCK for namespace"
		fi

		wait_all_targets_blocked namespace completed 0.1
	fi

	if ! $LFS mkdir -i 0 $DIR/$tdir/d0; then
		error "(1) Fail to mkdir d0 on MDT0"
	fi
	$LFS path2fid $DIR/$tdir/d0

	if ! echo "dummy" > $DIR/$tdir/d0/f0; then
		error "(2) Fail to touch on MDT0"
	fi
	$LFS path2fid $DIR/$tdir/d0/f0

	if ! echo "dead" > $DIR/$tdir/d0/f1; then
		error "(3) Fail to touch on MDT0"
	fi
	$LFS path2fid $DIR/$tdir/d0/f1

	# Second ':'-separated field of f1's FID, converted to decimal.
	oid=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
	oid=$(printf %d $oid)

	if [ $oid -eq 1 ]; then
		# To guarantee that the f0 and f1 are in the same FID seq
		if ! rm -f $DIR/$tdir/d0/f0; then
			error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
		fi
		if ! echo "dummy" > $DIR/$tdir/d0/f0; then
			error "(3.2) Fail to touch on MDT0"
		fi
		$LFS path2fid $DIR/$tdir/d0/f0
	fi

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
	if ! ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo; then
		error "(4) Fail to hard link"
	fi
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	if ! rm -f $DIR/$tdir/d0/f1; then
		error "(5) Fail to unlink $DIR/$tdir/d0/f1"
	fi

	echo "'ls' should fail because of dangling name entry"
	if ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1; then
		error "(6) ls should fail."
	fi

	echo "Trigger namespace LFSCK to find out dangling name entry"
	if ! $START_NAMESPACE -r -C; then
		error "(7) Fail to start LFSCK for namespace"
	fi

	# Command string handed to wait_update_facet for polling the status.
	status_cmd="$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'"
	if ! wait_update_facet $SINGLEMDS "$status_cmd" "completed" 32; then
		error "(8) unexpected status"
	fi

	fixed=$($SHOW_NAMESPACE | awk '/^dangling_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(9) Fail to repair dangling name entry: $fixed"
	fi

	fixed=$($SHOW_NAMESPACE |
		awk '/^multiple_linked_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(10) Fail to drop the former created object: $fixed"
	fi

	content=$(cat $DIR/$tdir/d0/foo)
	if [ "$content" != "dummy" ]; then
		error "(11) The $DIR/$tdir/d0/foo is not recovered: $content"
	fi
}
run_test 23b "LFSCK can repair dangling name entry (2)"
# test_23c: dangling entry whose re-created object is modified before the
# second LFSCK stage.
#
# Setup is the same as test_23b (hard link "foo" created under fail_loc
# 0x1621 with fail_val = f1's OID, then f1 removed).  LFSCK is started with
# -C while fail_loc 0x1602 (OBD_FAIL_LFSCK_DELAY3, fail_val=10) is armed.
# The test polls until "foo" reports size 0, appends to it, clears the
# fail_loc and waits for 'completed'.  It then expects
# dangling_repaired == 1 and that "foo" does NOT read back "dummy".
# Full debug logging is enabled for the duration of the run.
test_23c() {
	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. And then others"
	echo "modified the re-created object_C. When the LFSCK comes to the"
	echo "second-stage scanning, it will find that the former re-creating"
	echo "object_C maybe wrong and try to replace the object_C with the"
	echo "real object_A. But because object_C has been modified, so the"
	echo "LFSCK cannot replace it."

	start_full_debug_logging

	check_mount_and_prep

	[[ -d $MOUNT/.lustre/lost+found/MDT0000 ]] || {
		# Trigger LFSCK firstly, that will generate the
		# .lustre/lost+found/MDTxxxx in advance to avoid
		# reusing the local object for the dangling name
		$START_NAMESPACE -r ||
			error "(0) Fail to start LFSCK for namespace"

		wait_all_targets_blocked namespace completed 0.1
	}

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
	$LFS path2fid $DIR/$tdir/d0

	echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
	$LFS path2fid $DIR/$tdir/d0/f0

	echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
	$LFS path2fid $DIR/$tdir/d0/f1

	# Second ':'-separated field of f1's FID, converted to decimal.
	local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
	OID=$(printf %d $OID)

	if [ $OID -eq 1 ]; then
		# To guarantee that the f0 and f1 are in the same FID seq
		rm -f $DIR/$tdir/d0/f0 ||
			error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
		echo "dummy" > $DIR/$tdir/d0/f0 ||
			error "(3.2) Fail to touch on MDT0"
		$LFS path2fid $DIR/$tdir/d0/f0
	fi

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
	ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"

	echo "'ls' should fail because of dangling name entry"
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(6) ls should fail."

	#define OBD_FAIL_LFSCK_DELAY3		0x1602
	do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -r -C ||
		error "(7) Fail to start LFSCK for namespace"

	wait_update_facet client "stat $DIR/$tdir/d0/foo |
		awk '/Size/ { print \\\$2 }'" "0" $LTIME || {
		# Show the file that was actually being polled; this test
		# never creates a "guard" entry, so stat'ing that path only
		# produced an unrelated ENOENT in the failure log.
		stat $DIR/$tdir/d0/foo
		error "(8) unexpected size"
	}

	echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
	cancel_lru_locks osc

	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(10) unexpected status"
	}

	stop_full_debug_logging

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^dangling_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(11) Fail to repair dangling name entry: $repaired"

	local data=$(cat $DIR/$tdir/d0/foo)
	[ "$data" != "dummy" ] ||
		error "(12) The $DIR/$tdir/d0/foo should not be recovered"
}
run_test 23c "LFSCK can repair dangling name entry (3)"
# test_24: two MDT-objects claim the same name entry through their linkEA.
#
# d0 is created on MDT1 with sub-directories "guard" and "dummy";
# guard/foo is a regular file.  dummy/foo is created on MDT0 and removed
# again while fail_loc 0x1622 (OBD_FAIL_LFSCK_MUL_REF) is armed, which
# (per the messages below) leaves an object whose linkEA references
# guard/foo.  After LFSCK the test expects
# multiple_referenced_repaired == 1 and an orphan named
# "<child fid>-<parent fid>-D-0" under .lustre/lost+found/MDT0000/.
# Skipped with fewer than two MDTs.
test_24() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "We need at least 2 MDSes for this test" && return

	echo "Two MDT-objects back reference the same name entry via their"
	echo "each own linkEA entry, but the name entry only references one"
	echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
	echo "for the MDT-object that is not recognized. If such MDT-object"
	echo "has no other linkEA entry after the removing, then the LFSCK"
	echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"

	mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
	$LFS path2fid $DIR/$tdir/d0/guard

	mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
	$LFS path2fid $DIR/$tdir/d0/dummy

	# Parent FID expected in the orphan's name: guard's FID on a
	# non-ldiskfs backend, dummy's FID on ldiskfs.
	# NOTE(review): the reason for the backend split is not visible here.
	# Declared local so it does not leak into later tests.
	local pfid
	if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
		pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
	else
		pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
	fi

	touch $DIR/$tdir/d0/guard/foo ||
		error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
	echo "that references $DIR/$tdir/d0/guard/foo."
	echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
	echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
	echo "there with the same linkEA entry as another MDT-object"
	echo "$DIR/$tdir/d0/guard/foo has"

	#define OBD_FAIL_LFSCK_MUL_REF		0x1622
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
	$LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
		error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
	$LFS path2fid $DIR/$tdir/d0/dummy/foo
	local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
	rmdir $DIR/$tdir/d0/dummy/foo ||
		error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "stat $DIR/$tdir/d0/dummy/foo should fail"
	stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
		error "(6) stat successfully unexpectedly"

	echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
	$START_NAMESPACE -A -r ||
		error "(7) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 8

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^multiple_referenced_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(9) Fail to repair multiple referenced name entry: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	# Quoted: the name is built from path2fid output, and any '[' ']'
	# in it would otherwise be treated as a glob bracket expression.
	local cname="$cfid-$pfid-D-0"
	ls -ail "$MOUNT/.lustre/lost+found/MDT0000/$cname" ||
		error "(11) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 24 "LFSCK can repair multiple-referenced name entry"
# test_25: wrong file type recorded in a name entry.
#
# d0/foo is created on MDT0 while fail_loc 0x1623
# (OBD_FAIL_LFSCK_BAD_TYPE) is armed.  After a namespace LFSCK run reaches
# 'completed' the test expects bad_file_type_repaired == 1 and a working
# 'ls' on d0.  Only runs when the MDS backend is ldiskfs.
test_25() {
	local fixed
	local status_cmd

	if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
		skip "Only support to inject failure on ldiskfs" && return
	fi

	printf '%s\n' \
		"The file type in the name entry does not match the file type" \
		"claimed by the referenced object. Then the LFSCK will update" \
		"the file type in the name entry."

	check_mount_and_prep

	if ! $LFS mkdir -i 0 $DIR/$tdir/d0; then
		error "(1) Fail to mkdir d0"
	fi

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that" \
		"the file type stored in the name entry is wrong."

	#define OBD_FAIL_LFSCK_BAD_TYPE		0x1623
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
	if ! touch $DIR/$tdir/d0/foo; then
		error "(2) Fail to touch $DIR/$tdir/d0/foo"
	fi
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to repair bad file type in the name entry"
	if ! $START_NAMESPACE -r; then
		error "(3) Fail to start LFSCK for namespace"
	fi

	# Command string handed to wait_update_facet for polling the status.
	status_cmd="$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'"
	if ! wait_update_facet $SINGLEMDS "$status_cmd" "completed" 32; then
		error "(4) unexpected status"
	fi

	fixed=$($SHOW_NAMESPACE |
		awk '/^bad_file_type_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(5) Fail to repair bad file type in name entry: $fixed"
	fi

	ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
}
run_test 25 "LFSCK can repair bad file type in the name entry"
# test_26a: lost local name entry.
#
# d0/foo is created on MDT0 together with a hard link d0/dummy.  foo is
# unlinked while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed.
# After LFSCK the test expects lost_dirent_repaired == 1, a working 'ls'
# on d0/foo, and the same FID for foo as before the unlink.
test_26a() {
	local fid_before fid_after fixed

	printf '%s\n' \
		"The local name entry back referenced by the MDT-object is lost." \
		"The namespace LFSCK will add the missing local name entry back" \
		"to the normal namespace."

	check_mount_and_prep

	if ! $LFS mkdir -i 0 $DIR/$tdir/d0; then
		error "(1) Fail to mkdir d0"
	fi
	if ! touch $DIR/$tdir/d0/foo; then
		error "(2) Fail to create foo"
	fi
	fid_before=$($LFS path2fid $DIR/$tdir/d0/foo)

	if ! ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy; then
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	fi

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that" \
		"foo's name entry will be removed, but the foo's object" \
		"and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	if ! rm -f $DIR/$tdir/d0/foo; then
		error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	fi
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	if ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1; then
		error "(5) 'ls' should fail"
	fi

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	if ! $START_NAMESPACE -r -A; then
		error "(6) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 7

	fixed=$($SHOW_NAMESPACE | awk '/^lost_dirent_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(8) Fail to repair lost dirent: $fixed"
	fi

	if ! ls -ail $DIR/$tdir/d0/foo; then
		error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
	fi

	# The restored entry must point at the very same object.
	fid_after=$($LFS path2fid $DIR/$tdir/d0/foo)
	if [ "$fid_before" != "$fid_after" ]; then
		error "(10) foo's FID changed: $fid_before, $fid_after"
	fi
}
run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# test_26b: lost remote name entry.
#
# d0 is created on MDT1 and d0/foo on MDT0.  foo is removed while fail_loc
# 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed.  After LFSCK the test
# expects lost_dirent_repaired == 1, a working 'ls' on d0/foo, and the
# same FID for foo as before the rmdir.
# Skipped with fewer than two MDTs.
test_26b() {
	local fid_before fid_after fixed

	if [ $MDSCOUNT -lt 2 ]; then
		skip "We need at least 2 MDSes for this test" && return
	fi

	printf '%s\n' \
		"The remote name entry back referenced by the MDT-object is lost." \
		"The namespace LFSCK will add the missing remote name entry back" \
		"to the normal namespace."

	check_mount_and_prep

	if ! $LFS mkdir -i 1 $DIR/$tdir/d0; then
		error "(1) Fail to mkdir d0"
	fi
	if ! $LFS mkdir -i 0 $DIR/$tdir/d0/foo; then
		error "(2) Fail to mkdir foo"
	fi
	fid_before=$($LFS path2fid $DIR/$tdir/d0/foo)

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that" \
		"foo's name entry will be removed, but the foo's object" \
		"and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	if ! rmdir $DIR/$tdir/d0/foo; then
		error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	fi
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	if ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1; then
		error "(4) 'ls' should fail"
	fi

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	if ! $START_NAMESPACE -r -A; then
		error "(5) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 6

	fixed=$($SHOW_NAMESPACE | awk '/^lost_dirent_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(7) Fail to repair lost dirent: $fixed"
	fi

	if ! ls -ail $DIR/$tdir/d0/foo; then
		error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
	fi

	# The restored entry must point at the very same object.
	fid_after=$($LFS path2fid $DIR/$tdir/d0/foo)
	if [ "$fid_before" != "$fid_after" ]; then
		error "(9) foo's FID changed: $fid_before, $fid_after"
	fi
}
run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# test_27a: lost local parent directory.
#
# d0/foo and its hard link d0/dummy are created on MDT0, then both names
# are unlinked while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is
# armed, and d0 itself is removed.  After LFSCK the test expects
# lost_dirent_repaired == 1 and at least one entry matching "*-P-*" under
# .lustre/lost+found/MDT0000/.
test_27a() {
	echo "The local parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "another hard link and the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo ||
		error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	rm -f $DIR/$tdir/d0/dummy ||
		error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the lost parent"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	# The pattern is quoted so that find, not the calling shell, expands
	# it; unquoted, any "*-P-*" entry in the current directory would
	# replace the pattern (or break the find command line altogether).
	local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
	[ ! -z "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# test_27b: lost remote parent directory.
#
# d0 is created on MDT1 and d0/foo on MDT0.  foo's name is removed while
# fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed, then d0 itself
# is removed.  After LFSCK the test expects lost_dirent_repaired == 1 and
# at least one entry matching "*-P-*" under .lustre/lost+found/MDT0001/.
# Skipped with fewer than two MDTs.
test_27b() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "We need at least 2 MDSes for this test" && return

	echo "The remote parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"

	$LFS path2fid $DIR/$tdir/d0

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "There should be an orphan under .lustre/lost+found/MDT0001/"
	[ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0001/

	# The pattern is quoted so that find, not the calling shell, expands
	# it; unquoted, any "*-P-*" entry in the current directory would
	# replace the pattern (or break the find command line altogether).
	local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
	[ ! -z "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0001/ should not be empty"
}
run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# test_28: orphan handling when one MDT failed during first-stage scanning.
#
# d1 (MDT0) gets children a1/a2 on MDT1, d2 (MDT1) gets children a1/a2 on
# MDT0.  d1/a1 and d2/a2 are removed while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed on both MDSes.  With fail_loc
# 0x161c (OBD_FAIL_LFSCK_BAD_NETWORK) armed on mds2, a first LFSCK run is
# expected to end 'partial' on mds1 (0 lost dirents repaired) and
# 'completed' on mds2 (1 repaired).  A second run without the fail_loc is
# expected to repair 1 on mds1 and 0 on mds2.
# Skipped with fewer than two MDTs.
test_28() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "The target name entry is lost. The LFSCK should insert the"
	echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
	echo "the MDT (on which the orphan MDT-object resides) has ever"
	echo "failed to respond some name entry verification during the"
	echo "first stage-scanning, then the LFSCK should skip to handle"
	echo "orphan MDT-object on this MDT. But other MDTs should not"

	check_mount_and_prep

	# Check every setup mkdir: a silent failure here would otherwise
	# only surface later as a confusing repair-count mismatch.
	$LFS mkdir -i 0 $DIR/$tdir/d1 ||
		error "(0.1) Fail to mkdir d1 on MDT0"
	$LFS mkdir -i 1 $DIR/$tdir/d1/a1 ||
		error "(0.2) Fail to mkdir d1/a1 on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/d1/a2 ||
		error "(0.3) Fail to mkdir d1/a2 on MDT1"

	$LFS mkdir -i 1 $DIR/$tdir/d2 ||
		error "(0.4) Fail to mkdir d2 on MDT1"
	$LFS mkdir -i 0 $DIR/$tdir/d2/a1 ||
		error "(0.5) Fail to mkdir d2/a1 on MDT0"
	$LFS mkdir -i 0 $DIR/$tdir/d2/a2 ||
		error "(0.6) Fail to mkdir d2/a2 on MDT0"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "d1/a1's name entry will be removed, but the d1/a1's object"
	echo "and its linkEA are kept in the system. And the case that"
	echo "d2/a2's name entry will be removed, but the d2/a2's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet mds1 $LCTL set_param fail_loc=0x1624
	do_facet mds2 $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
	rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
	do_facet mds1 $LCTL set_param fail_loc=0
	do_facet mds2 $LCTL set_param fail_loc=0

	cancel_lru_locks mdc
	cancel_lru_locks osc

	echo "Inject failure, to simulate the MDT0 fail to handle"
	echo "MDT1 LFSCK request during the first-stage scanning."
	#define OBD_FAIL_LFSCK_BAD_NETWORK	0x161c
	do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0

	echo "Trigger namespace LFSCK on all devices to find out orphan object"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_update_facet mds1 "$LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "partial" 32 || {
		error "(4) mds1 is not the expected 'partial'"
	}

	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(5) mds2 is not the expected 'completed'"
	}

	do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0

	# First run: per-MDS repair counters.
	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_namespace |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(6) Expect 0 fixed on mds1, but got: $repaired"

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Expect 1 fixed on mds2, but got: $repaired"

	echo "Trigger namespace LFSCK on all devices again to cleanup"
	$START_NAMESPACE -r -A ||
		error "(8) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 9

	# Second run: the counters are expected to swap.
	repaired=$(do_facet mds1 $LCTL get_param -n \
		   mdd.$(facet_svc mds1).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(10) Expect 1 fixed on mds1, but got: $repaired"

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(11) Expect 0 fixed on mds2, but got: $repaired"
}
run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# test_29a: nlink larger than the number of name entries.
#
# d0/foo is created on MDT0 and hard-linked as d0/h1 while fail_loc 0x1625
# (OBD_FAIL_LFSCK_MORE_NLINK) is armed; the test first checks that stat
# then reports 3 links.  After LFSCK it expects nlinks_repaired == 1 and a
# link count of 2.
test_29a() {
	local links fixed

	printf '%s\n' \
		"The object's nlink attribute is larger than the object's known" \
		"name entries count. The LFSCK will repair the object's nlink" \
		"attribute to match the known name entries count"

	check_mount_and_prep

	if ! $LFS mkdir -i 0 $DIR/$tdir/d0; then
		error "(1) Fail to mkdir d0"
	fi
	if ! touch $DIR/$tdir/d0/foo; then
		error "(2) Fail to create foo"
	fi

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that foo's" \
		"nlink attribute is larger than its name entries count."

	#define OBD_FAIL_LFSCK_MORE_NLINK	0x1625
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
	if ! ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1; then
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	fi
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Confirm the injection took effect before running LFSCK.
	cancel_lru_locks mdc
	links=$(stat --format=%h $DIR/$tdir/d0/foo)
	if ! [ $links -eq 3 ]; then
		error "(4) Cannot inject error: $links"
	fi

	echo "Trigger namespace LFSCK to repair the nlink count"
	if ! $START_NAMESPACE -r -A; then
		error "(5) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 6

	fixed=$($SHOW_NAMESPACE | awk '/^nlinks_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(7) Fail to repair nlink count: $fixed"
	fi

	cancel_lru_locks mdc
	links=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ $links -eq 2 ] || error "(8) Fail to repair nlink count: $links"
}
run_test 29a "LFSCK can repair bad nlink count (1)"
# test_29b: nlink smaller than the number of name entries.
#
# d0/foo is created on MDT0 and hard-linked as d0/h1 while fail_loc 0x1626
# (OBD_FAIL_LFSCK_LESS_NLINK) is armed; the test first checks that stat
# then reports 1 link.  After LFSCK it expects nlinks_repaired == 1 and a
# link count of 2.
test_29b() {
	local links fixed

	printf '%s\n' \
		"The object's nlink attribute is smaller than the object's known" \
		"name entries count. The LFSCK will repair the object's nlink" \
		"attribute to match the known name entries count"

	check_mount_and_prep

	if ! $LFS mkdir -i 0 $DIR/$tdir/d0; then
		error "(1) Fail to mkdir d0"
	fi
	if ! touch $DIR/$tdir/d0/foo; then
		error "(2) Fail to create foo"
	fi

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that foo's" \
		"nlink attribute is smaller than its name entries count."

	#define OBD_FAIL_LFSCK_LESS_NLINK	0x1626
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
	if ! ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1; then
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	fi
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Confirm the injection took effect before running LFSCK.
	cancel_lru_locks mdc
	links=$(stat --format=%h $DIR/$tdir/d0/foo)
	if ! [ $links -eq 1 ]; then
		error "(4) Cannot inject error: $links"
	fi

	echo "Trigger namespace LFSCK to repair the nlink count"
	if ! $START_NAMESPACE -r -A; then
		error "(5) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 6

	fixed=$($SHOW_NAMESPACE | awk '/^nlinks_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(7) Fail to repair nlink count: $fixed"
	fi

	cancel_lru_locks mdc
	links=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ $links -eq 2 ] || error "(8) Fail to repair nlink count: $links"
}
run_test 29b "LFSCK can repair bad nlink count (2)"
# test_29c: nlink verification is skipped when linkEA cannot hold all links.
#
# d0/foo gets a normal hard link h1, then a second link h2 while fail_loc
# 0x1627 (OBD_FAIL_LFSCK_LINKEA_OVERFLOW) is armed.  The test checks that
# stat reports 3 links while fid2path lists only 2 paths, runs LFSCK with
# the fail_loc still armed, and expects nlinks_repaired == 0 with both
# numbers unchanged afterwards.
test_29c() {
	local nlink npaths fid fixed

	printf '%s\n' \
		"There are too many hard links to the object, and exceeds the" \
		"object's linkEA limitation, as to NOT all the known name entries" \
		"will be recorded in the linkEA. Under such case, LFSCK should" \
		"skip the nlink verification for this object."

	check_mount_and_prep

	if ! $LFS mkdir -i 0 $DIR/$tdir/d0; then
		error "(1) Fail to mkdir d0"
	fi
	if ! touch $DIR/$tdir/d0/foo; then
		error "(2) Fail to create foo"
	fi
	if ! ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1; then
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	fi

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that" \
		"foo's hard links exceed the object's linkEA limitation."

	#define OBD_FAIL_LFSCK_LINKEA_OVERFLOW	0x1627
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1627
	if ! ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h2; then
		error "(4) Fail to hard link to $DIR/$tdir/d0/foo"
	fi

	cancel_lru_locks mdc

	nlink=$(stat --format=%h $DIR/$tdir/d0/foo)
	if ! [ $nlink -eq 3 ]; then
		error "(5) Stat failure: $nlink"
	fi

	fid=$($LFS path2fid $DIR/$tdir/d0/foo)
	$LFS fid2path $DIR $fid
	npaths=$($LFS fid2path $DIR $fid | wc -l)
	if ! [ $npaths -eq 2 ]; then
		error "(6) Fail to inject error: $npaths"
	fi

	echo "Trigger namespace LFSCK to repair the nlink count"
	if ! $START_NAMESPACE -r -A; then
		error "(7) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 8

	# The fail_loc is only cleared once LFSCK has finished.
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0
	fixed=$($SHOW_NAMESPACE | awk '/^nlinks_repaired/ { print $2 }')
	if ! [ $fixed -eq 0 ]; then
		error "(9) Repair nlink count unexpcetedly: $fixed"
	fi

	cancel_lru_locks mdc

	nlink=$(stat --format=%h $DIR/$tdir/d0/foo)
	if ! [ $nlink -eq 3 ]; then
		error "(10) Stat failure: $nlink"
	fi

	npaths=$($LFS fid2path $DIR $fid | wc -l)
	[ $npaths -eq 2 ] ||
		error "(11) Repaired something unexpectedly: $npaths"
}
run_test 29c "Not verify nlink attr if hark links exceed linkEA limitation"
# test_30: after e2fsck has moved orphaned MDT objects into the backend
# /lost+found directory, the namespace LFSCK must move them back into the
# client-visible namespace or into .lustre/lost+found/MDTxxxx/.
# Skipped unless the MDS backend is ldiskfs.
#
# Globals read: DIR, tdir, MOUNT, LFS, LCTL, SINGLEMDS, MDT_DEVNAME,
#               MOUNT_OPTS_NOSCRUB, START_NAMESPACE, SHOW_NAMESPACE
test_30() {
	[ "$(facet_fstype $SINGLEMDS)" != ldiskfs ] &&
		skip "Only support backend /lost+found for ldiskfs" && return

	echo "The namespace LFSCK will move the orphans from backend"
	echo "/lost+found directory to normal client visible namespace"
	echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
	touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "directory d0 has no linkEA entry, then the LFSCK will"
	echo "move it into .lustre/lost+found/MDTxxxx/ later."

	#define OBD_FAIL_LFSCK_NO_LINKEA	0x161d
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
	mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
	mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "object's name entry will be removed, but not destroy the"
	echo "object. Then backend e2fsck will handle it as orphan and"
	echo "add them into the backend /lost+found directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
	rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
	rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
	rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Take the MDT offline so that e2fsck can run against its device.
	umount_client $MOUNT || error "(10) Fail to stop client!"

	stop $SINGLEMDS || error "(11) Fail to stop MDT0"

	run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
		error "(12) Fail to run e2fsck"

	start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
		error "(13) Fail to start MDT0"

	echo "Trigger namespace LFSCK to recover backend orphans"
	$START_NAMESPACE -r -A ||
		error "(14) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 15

	# Four objects (f0, d0, f1, d1) lost their name entries above.
	local repaired=$($SHOW_NAMESPACE |
			 awk '/^local_lost_found_moved/ { print $2 }')
	[ "$repaired" -ge 4 ] ||
		error "(16) Fail to recover backend orphans: $repaired"

	mount_client $MOUNT || error "(17) Fail to start client!"

	stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	# The pattern must be quoted: unquoted, the shell would expand it
	# against the current directory before find ever sees it.
	local cname
	cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-*-D-*")
	[ -n "$cname" ] || error "(20) d0 is not recovered"

	stat "${cname}/d1" || error "(21) d0 is not recovered"
	stat "${cname}/f1" || error "(22) f1 is not recovered"
}
run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test_31a: name entries of a striped directory that were inserted into the
# wrong shard (here: forced into the first shard, fail_val=0) must be
# detected and repaired by the namespace LFSCK.
# Skipped when fewer than two MDTs are configured.
#
# Globals read: MDSCOUNT, DIR, tdir, MOUNT, LFS, LCTL, START_NAMESPACE,
#               SHOW_NAMESPACE
test_31a() {
	[ "$MDSCOUNT" -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-first"
	echo "shard, but inserted into the first shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	# The fail_loc is set on the local (client) node, not via do_facet.
	$LCTL set_param fail_loc=0x1628 fail_val=0
	createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 4

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^name_hash_repaired/ { print $2 }')
	[ "$repaired" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	# Remount the client before verifying the repaired entries.
	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	local i
	for ((i = 0; i < $MDSCOUNT; i++)); do
		stat $DIR/$tdir/striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $DIR/$tdir/striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $DIR/$tdir/striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test_31b: same scenario as the "bad name hash (1)" test, but the name
# entries are forced into the second shard (fail_val=1), so the repair
# counter is read from mds2 instead of the first MDS.
# Skipped when fewer than two MDTs are configured.
#
# Globals read: MDSCOUNT, DIR, tdir, MOUNT, LFS, LCTL, START_NAMESPACE
test_31b() {
	[ "$MDSCOUNT" -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-second"
	echo "shard, but inserted into the second shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	# The fail_loc is set on the local (client) node, not via do_facet.
	$LCTL set_param fail_loc=0x1628 fail_val=1
	createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 4

	# The repair is counted on the MDT holding the second shard.
	local repaired=$(do_facet mds2 $LCTL get_param -n \
			 mdd.$(facet_svc mds2).lfsck_namespace |
			 awk '/^name_hash_repaired/ { print $2 }')
	[ "$repaired" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	# Remount the client before verifying the repaired entries.
	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	local i
	for ((i = 0; i < $MDSCOUNT; i++)); do
		stat $DIR/$tdir/striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $DIR/$tdir/striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $DIR/$tdir/striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test_31c: if the master MDT-object of a striped directory lost its master
# LMV EA and nothing was created under it afterwards, the namespace LFSCK
# must re-generate the master LMV EA.
# Skipped when fewer than two MDTs are configured.
#
# Globals read: MDSCOUNT, DIR, tdir, MOUNT, LFS, LCTL, SINGLEMDS,
#               START_NAMESPACE, SHOW_NAMESPACE
test_31c() {
	[ "$MDSCOUNT" -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If nobody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should re-generate the master LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to re-generate master LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^striped_dirs_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate master LMV EA: $repaired"

	# Remount the client before inspecting the repaired directory.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	# The directory was never populated, so its listing must be empty.
	local empty=$(ls $DIR/$tdir/striped_dir/)
	[ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"

	rmdir $DIR/$tdir/striped_dir ||
		error "(8) Fail to remove the striped directory after LFSCK"
}
run_test 31c "Re-generate the lost master LMV EA for striped directory"
# test_31d: if the master MDT-object of a striped directory lost its master
# LMV EA and something was created under it afterwards, the namespace LFSCK
# must NOT re-generate the EA; the broken directory is expected to become
# read-only instead.
# Skipped when fewer than two MDTs are configured.
#
# Globals read: MDSCOUNT, DIR, tdir, MOUNT, LFS, LCTL, SINGLEMDS,
#               START_NAMESPACE, SHOW_NAMESPACE
test_31d() {
	[ "$MDSCOUNT" -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If somebody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should NOT re-generate the master LMV EA, instead, it should"
	echo "change the broken striped directory as read-only to prevent"
	echo "further damage"

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0

	umount_client $MOUNT || error "(2) umount failed"
	mount_client $MOUNT || error "(3) mount failed"

	# Modify the directory after the EA was lost; this is what must stop
	# the LFSCK from re-generating the master LMV EA.
	touch $DIR/$tdir/striped_dir/dummy ||
		error "(4) Fail to touch under broken striped directory"

	echo "Trigger namespace LFSCK to find out the inconsistency"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^striped_dirs_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(7) Re-generate master LMV EA unexpected: $repaired"

	stat $DIR/$tdir/striped_dir/dummy ||
		error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"

	# Creating a new file must fail now; success is the error case.
	touch $DIR/$tdir/striped_dir/foo &&
		error "(9) The broken striped directory should be read-only"

	# Clear the immutable flag so the directory can be removed.
	chattr -i $DIR/$tdir/striped_dir ||
		error "(10) Fail to chattr on the broken striped directory"

	rmdir $DIR/$tdir/striped_dir ||
		error "(11) Fail to remove the striped directory after LFSCK"
}
run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test_31e: the slave MDT-object that lives on the same MDT as the master
# lost its slave LMV EA (fail_val=0); the namespace LFSCK must re-generate
# it.
# Skipped when fewer than two MDTs are configured.
#
# Globals read: MDSCOUNT, DIR, tdir, LFS, LCTL, SINGLEMDS, START_NAMESPACE,
#               SHOW_NAMESPACE
test_31e() {
	[ "$MDSCOUNT" -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on the same MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir $DIR/$tdir/striped_dir ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test_31f: the slave MDT-object that lives on a different MDT than the
# master lost its slave LMV EA (fail_val=1); the namespace LFSCK must
# re-generate it. The repair counter is therefore read from mds2.
# Skipped when fewer than two MDTs are configured.
#
# Globals read: MDSCOUNT, DIR, tdir, LFS, LCTL, SINGLEMDS, START_NAMESPACE
test_31f() {
	[ "$MDSCOUNT" -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on different MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	# The affected shard is on the second MDT, so query mds2.
	local repaired=$(do_facet mds2 $LCTL get_param -n \
			 mdd.$(facet_svc mds2).lfsck_namespace |
			 awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir $DIR/$tdir/striped_dir ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test_31g: the stripe index stored in a slave LMV EA is corrupted (the
# first shard claims the second shard's index); the namespace LFSCK must
# repair the slave LMV EA so the directory is usable again.
# Skipped when fewer than two MDTs are configured.
#
# Globals read: MDSCOUNT, DIR, tdir, MOUNT, LFS, LCTL, SINGLEMDS,
#               START_NAMESPACE, SHOW_NAMESPACE
test_31g() {
	[ "$MDSCOUNT" -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the stripe index in the slave LMV EA is"
	echo "corrupted. The LFSCK should repair the slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave LMV EA on the first shard of the striped directory"
	echo "claims the same index as the second shard claims"

	#define OBD_FAIL_LFSCK_BAD_SLAVE_LMV	0x162b
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair slave LMV EA: $repaired"

	# Remount the client, then prove the directory accepts create/unlink.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch $DIR/$tdir/striped_dir/foo ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f $DIR/$tdir/striped_dir/foo ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir $DIR/$tdir/striped_dir ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31g "Repair the corrupted slave LMV EA"
# test_31h: a shard's name entry in a striped directory is corrupted (the
# first shard's entry claims the second shard's index); the namespace LFSCK
# must repair the name entry so the directory is usable again.
# Skipped when fewer than two MDTs are configured.
#
# Globals read: MDSCOUNT, DIR, tdir, MOUNT, LFS, LCTL, SINGLEMDS,
#               START_NAMESPACE, SHOW_NAMESPACE
test_31h() {
	[ "$MDSCOUNT" -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the shard's name entry in the striped"
	echo "directory may be corrupted. The LFSCK should repair the"
	echo "bad shard's name entry."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "first shard's name entry in the striped directory claims"
	echo "the same index as the second shard's name entry claims."

	#define OBD_FAIL_LFSCK_BAD_SLAVE_NAME	0x162c
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the shard's name entry"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair shard's name entry: $repaired"

	# Remount the client, then prove the directory accepts create/unlink.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch $DIR/$tdir/striped_dir/foo ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f $DIR/$tdir/striped_dir/foo ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir $DIR/$tdir/striped_dir ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31h "Repair the corrupted shard's name entry"
# test_32: a layout LFSCK that is still in scanning-phase1 must be stoppable
# even after an OST has been stopped underneath it.
#
# Globals read: MOUNT, LCTL, SINGLEMDS, START_LAYOUT, SHOW_LAYOUT, STOP_LFSCK
#
# NOTE(review): the error numbering starts at (2), which suggests a
# preparation step precedes umount_client -- confirm against the upstream
# test before relying on this body.
test_32() {
	umount_client $MOUNT

	#define OBD_FAIL_LFSCK_ASSISTANT_DIRECT	0x162d
	do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
	$START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"

	# With the fail_loc set, the scan is expected to still be in phase 1.
	local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
	[ "$STATUS" == "scanning-phase1" ] ||
		error "(3) Expect 'scanning-phase1', but got '$STATUS'"

	stop ost1 > /dev/null || error "(4) Fail to stop OST1!"

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0

	$STOP_LFSCK || error "(5) Fail to stop LFSCK!"
}
run_test 32 "stop LFSCK when some OST failed"
# Restore the MDS/OST sizes and the OST count that were saved at the top of
# this script before the tests switched to a smaller configuration.
MDSSIZE=${SAVED_MDSSIZE}
OSTSIZE=${SAVED_OSTSIZE}
OSTCOUNT=${SAVED_OSTCOUNT}
# finally, clean up the system