3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 #Bug number for excepting test
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
23 require_dsh_mds || exit 0
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
32 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
33 skip "Need MDS version at least 2.3.60" && exit 0
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # use small MDS + OST size to speed formatting time
41 # do not make MDSSIZE/OSTSIZE too small, since that affects the default journal size
44 # no need for too many OSTs; fewer OSTs reduce the format/start/stop overhead
45 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
47 # build up a clean test environment.
# Version-gated exclusions: each statement below appends test names to
# ALWAYS_EXCEPT when the version reported by lustre_version_code for the
# given facet is older than the version the tests require.
#
# MDS version at or below 2.4.90: additionally skip test 2c.
51 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
52 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
# ost1 version below 2.5.55: additionally skip tests 11 through 21.
54 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
55 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
# MDS version below 2.6.50: additionally skip 2d, 2e, 3 and 22 through 31.
57 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
58 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
# Backend-gated exclusion: test 31 is skipped whenever the MDS backend
# reported by facet_fstype is anything other than ldiskfs.
60 # DNE does not support striped directory on zfs-based backend yet.
61 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
62 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Shared names and pre-built command strings used by the tests in this file.
#
# Target names of the first MDT and the first OST, derived from FSNAME.
66 MDT_DEV="${FSNAME}-MDT0000"
67 OST_DEV="${FSNAME}-OST0000"
# Device for the MDS: every "mds" substring is removed from SINGLEMDS and the
# remainder is handed to mdsdevname. The tests pass this value to `start`.
68 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Command strings that start or stop LFSCK through `lctl lfsck_start` /
# `lctl lfsck_stop`, wrapped in do_facet. The tests expand them unquoted so
# that extra flags can be appended at the call site (e.g. `$START_NAMESPACE -r`).
69 START_NAMESPACE="do_facet $SINGLEMDS \
70 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
71 START_LAYOUT="do_facet $SINGLEMDS \
72 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
73 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
74 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Command strings that print the LFSCK status parameters (`lctl get_param -n`);
# the tests pipe their output through awk to pick out single fields such as
# "status" or "flags".
75 SHOW_NAMESPACE="do_facet $SINGLEMDS \
76 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
77 SHOW_LAYOUT="do_facet $SINGLEMDS \
78 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
79 SHOW_LAYOUT_ON_OST="do_facet ost1 \
80 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount option sets passed as the last argument to `start` when the tests
# (re)mount the MDS: default, with "noscrub", and with "skip_lfsck".
81 MOUNT_OPTS_SCRUB="-o user_xattr"
82 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
83 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
92 echo "preparing... $nfiles * $ndirs files will be created $(date)."
93 if [ ! -z $igif ]; then
94 #define OBD_FAIL_FID_IGIF 0x1504
95 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
98 cp $LUSTRE/tests/*.sh $DIR/$tdir/
99 if [ $ndirs -gt 0 ]; then
100 createmany -d $DIR/$tdir/d $ndirs
101 createmany -m $DIR/$tdir/f $ndirs
102 if [ $nfiles -gt 0 ]; then
103 for ((i = 0; i < $ndirs; i++)); do
104 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
105 /dev/null || error "createmany $nfiles"
108 createmany -d $DIR/$tdir/e $ndirs
111 if [ ! -z $igif ]; then
112 touch $DIR/$tdir/dummy
113 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
116 echo "prepared $(date)."
119 run_e2fsck_on_mdt0() {
120 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
122 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
123 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
125 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
126 error "(2) Detected inconsistency on MDT0"
128 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
129 error "(3) Fail to start MDT0"
132 wait_all_targets_blocked() {
137 local count=$(do_facet mds1 \
138 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
139 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
140 [[ $count -eq $MDSCOUNT ]] || {
141 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
142 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
151 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
152 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
153 "$MDSCOUNT" $LTIME || {
154 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
155 error "($err) some MDTs are not in ${status}"
162 #define OBD_FAIL_LFSCK_DELAY1 0x1600
163 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
164 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
166 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
168 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
169 [ "$STATUS" == "scanning-phase1" ] ||
170 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
172 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
174 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
175 [ "$STATUS" == "stopped" ] ||
176 error "(6) Expect 'stopped', but got '$STATUS'"
178 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
180 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
181 [ "$STATUS" == "scanning-phase1" ] ||
182 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
184 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
185 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
186 mdd.${MDT_DEV}.lfsck_namespace |
187 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
189 error "(9) unexpected status"
192 local repaired=$($SHOW_NAMESPACE |
193 awk '/^updated_phase1/ { print $2 }')
194 [ $repaired -eq 0 ] ||
195 error "(10) Expect nothing to be repaired, but got: $repaired"
197 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
198 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
199 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
200 mdd.${MDT_DEV}.lfsck_namespace |
201 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
203 error "(12) unexpected status"
206 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
207 [ $((scanned1 + 1)) -eq $scanned2 ] ||
208 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
210 echo "stopall, should NOT crash LU-3649"
211 stopall || error "(14) Fail to stopall"
213 run_test 0 "Control LFSCK manually"
216 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
217 skip "OI Scrub not implemented for ZFS" && return
221 #define OBD_FAIL_FID_INDIR 0x1501
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
223 touch $DIR/$tdir/dummy
225 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
227 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
229 mdd.${MDT_DEV}.lfsck_namespace |
230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
232 error "(4) unexpected status"
235 local repaired=$($SHOW_NAMESPACE |
236 awk '/^dirent_repaired/ { print $2 }')
237 # for interop with old server
238 [ -z "$repaired" ] &&
239 repaired=$($SHOW_NAMESPACE |
240 awk '/^updated_phase1/ { print $2 }')
242 [ $repaired -eq 1 ] ||
243 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
247 mount_client $MOUNT || error "(6) Fail to start client!"
249 #define OBD_FAIL_FID_LOOKUP 0x1505
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
251 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
255 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
259 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
260 skip "OI Scrub not implemented for ZFS" && return
264 #define OBD_FAIL_FID_INLMA 0x1502
265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
266 touch $DIR/$tdir/dummy
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
270 #define OBD_FAIL_FID_NOLMA 0x1506
271 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
272 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
273 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
274 mdd.${MDT_DEV}.lfsck_namespace |
275 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
277 error "(4) unexpected status"
280 local repaired=$($SHOW_NAMESPACE |
281 awk '/^dirent_repaired/ { print $2 }')
282 # for interop with old server
283 [ -z "$repaired" ] &&
284 repaired=$($SHOW_NAMESPACE |
285 awk '/^updated_phase1/ { print $2 }')
287 [ $repaired -eq 1 ] ||
288 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
290 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
293 mount_client $MOUNT || error "(6) Fail to start client!"
295 #define OBD_FAIL_FID_LOOKUP 0x1505
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
297 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
301 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
306 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
308 touch $DIR/$tdir/dummy
310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
312 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
313 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
314 mdd.${MDT_DEV}.lfsck_namespace |
315 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
317 error "(4) unexpected status"
320 local repaired=$($SHOW_NAMESPACE |
321 awk '/^linkea_repaired/ { print $2 }')
322 # for interop with old server
323 [ -z "$repaired" ] &&
324 repaired=$($SHOW_NAMESPACE |
325 awk '/^updated_phase2/ { print $2 }')
327 [ $repaired -eq 1 ] ||
328 error "(5) Fail to repair crashed linkEA: $repaired"
332 mount_client $MOUNT || error "(6) Fail to start client!"
334 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
335 error "(7) Fail to stat $DIR/$tdir/dummy"
337 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
338 local dummyname=$($LFS fid2path $DIR $dummyfid)
339 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
340 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
342 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
348 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
349 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
350 touch $DIR/$tdir/dummy
352 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
354 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
355 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
356 mdd.${MDT_DEV}.lfsck_namespace |
357 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
359 error "(4) unexpected status"
362 local repaired=$($SHOW_NAMESPACE |
363 awk '/^updated_phase2/ { print $2 }')
364 [ $repaired -eq 1 ] ||
365 error "(5) Fail to repair crashed linkEA: $repaired"
369 mount_client $MOUNT || error "(6) Fail to start client!"
371 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
372 error "(7) Fail to stat $DIR/$tdir/dummy"
374 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
375 local dummyname=$($LFS fid2path $DIR $dummyfid)
376 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
377 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
379 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
385 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
386 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
387 touch $DIR/$tdir/dummy
389 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
391 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
392 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
393 mdd.${MDT_DEV}.lfsck_namespace |
394 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
396 error "(4) unexpected status"
399 local repaired=$($SHOW_NAMESPACE |
400 awk '/^updated_phase2/ { print $2 }')
401 [ $repaired -eq 1 ] ||
402 error "(5) Fail to repair crashed linkEA: $repaired"
406 mount_client $MOUNT || error "(6) Fail to start client!"
408 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
409 error "(7) Fail to stat $DIR/$tdir/dummy"
411 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
412 local dummyname=$($LFS fid2path $DIR $dummyfid)
413 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
414 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
416 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
422 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
423 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
424 touch $DIR/$tdir/dummy
426 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
428 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
429 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
430 mdd.${MDT_DEV}.lfsck_namespace |
431 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
433 error "(4) unexpected status"
436 local repaired=$($SHOW_NAMESPACE |
437 awk '/^linkea_repaired/ { print $2 }')
438 [ $repaired -eq 1 ] ||
439 error "(5) Fail to repair crashed linkEA: $repaired"
443 mount_client $MOUNT || error "(6) Fail to start client!"
445 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
446 error "(7) Fail to stat $DIR/$tdir/dummy"
448 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
449 local dummyname=$($LFS fid2path $DIR $dummyfid)
450 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
451 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
453 run_test 2d "LFSCK can recover the missing linkEA entry"
457 [ $MDSCOUNT -lt 2 ] &&
458 skip "We need at least 2 MDSes for this test" && return
462 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
464 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
465 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
466 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
467 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
469 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
471 wait_all_targets_blocked namespace completed 4
473 local repaired=$($SHOW_NAMESPACE |
474 awk '/^linkea_repaired/ { print $2 }')
475 [ $repaired -eq 1 ] ||
476 error "(5) Fail to repair crashed linkEA: $repaired"
478 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
479 local name=$($LFS fid2path $DIR $fid)
480 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
481 error "(6) Fail to repair linkEA: $fid $name"
483 run_test 2e "namespace LFSCK can verify remote object linkEA"
489 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
490 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
491 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
493 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
494 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
495 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
497 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
498 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
499 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
501 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
502 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
503 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
505 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
507 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
508 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
509 mdd.${MDT_DEV}.lfsck_namespace |
510 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
512 error "(10) unexpected status"
515 local checked=$($SHOW_NAMESPACE |
516 awk '/^checked_phase2/ { print $2 }')
517 [ $checked -ge 4 ] ||
518 error "(11) Fail to check multiple-linked object: $checked"
520 local repaired=$($SHOW_NAMESPACE |
521 awk '/^multiple_linked_repaired/ { print $2 }')
522 [ $repaired -ge 2 ] ||
523 error "(12) Fail to repair multiple-linked object: $repaired"
525 run_test 3 "LFSCK can verify multiple-linked objects"
529 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
530 skip "OI Scrub not implemented for ZFS" && return
533 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
534 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
536 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
537 echo "start $SINGLEMDS with disabling OI scrub"
538 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
539 error "(2) Fail to start MDS!"
541 #define OBD_FAIL_LFSCK_DELAY2 0x1601
542 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
543 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
544 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
545 mdd.${MDT_DEV}.lfsck_namespace |
546 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
548 error "(5) unexpected status"
551 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
552 [ "$STATUS" == "scanning-phase1" ] ||
553 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
555 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
556 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
557 mdd.${MDT_DEV}.lfsck_namespace |
558 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
560 error "(7) unexpected status"
563 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
564 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
566 local repaired=$($SHOW_NAMESPACE |
567 awk '/^dirent_repaired/ { print $2 }')
568 # for interop with old server
569 [ -z "$repaired" ] &&
570 repaired=$($SHOW_NAMESPACE |
571 awk '/^updated_phase1/ { print $2 }')
573 [ $repaired -ge 9 ] ||
574 error "(9) Fail to re-generate FID-in-dirent: $repaired"
578 mount_client $MOUNT || error "(10) Fail to start client!"
580 #define OBD_FAIL_FID_LOOKUP 0x1505
581 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
582 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
583 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
585 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
589 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
590 skip "OI Scrub not implemented for ZFS" && return
593 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
594 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
596 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
597 echo "start $SINGLEMDS with disabling OI scrub"
598 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
599 error "(2) Fail to start MDS!"
601 #define OBD_FAIL_LFSCK_DELAY2 0x1601
602 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
603 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
604 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
605 mdd.${MDT_DEV}.lfsck_namespace |
606 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
608 error "(5) unexpected status"
611 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
612 [ "$STATUS" == "scanning-phase1" ] ||
613 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
615 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
616 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
617 mdd.${MDT_DEV}.lfsck_namespace |
618 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
620 error "(7) unexpected status"
623 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
624 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
626 local repaired=$($SHOW_NAMESPACE |
627 awk '/^dirent_repaired/ { print $2 }')
628 # for interop with old server
629 [ -z "$repaired" ] &&
630 repaired=$($SHOW_NAMESPACE |
631 awk '/^updated_phase1/ { print $2 }')
633 [ $repaired -ge 2 ] ||
634 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
638 mount_client $MOUNT || error "(10) Fail to start client!"
640 #define OBD_FAIL_FID_LOOKUP 0x1505
641 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
642 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
644 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
646 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
647 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
648 local dummyname=$($LFS fid2path $DIR $dummyfid)
649 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
650 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
652 run_test 5 "LFSCK can handle IGIF object upgrading"
657 #define OBD_FAIL_LFSCK_DELAY1 0x1600
658 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
659 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
661 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
662 [ "$STATUS" == "scanning-phase1" ] ||
663 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
665 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
667 # Fail the LFSCK to guarantee there is at least one checkpoint
668 #define OBD_FAIL_LFSCK_FATAL1 0x1608
669 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
670 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
671 mdd.${MDT_DEV}.lfsck_namespace |
672 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
674 error "(4) unexpected status"
677 local POS0=$($SHOW_NAMESPACE |
678 awk '/^last_checkpoint_position/ { print $2 }' |
681 #define OBD_FAIL_LFSCK_DELAY1 0x1600
682 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
683 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
685 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
686 [ "$STATUS" == "scanning-phase1" ] ||
687 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
689 local POS1=$($SHOW_NAMESPACE |
690 awk '/^latest_start_position/ { print $2 }' |
692 [[ $POS0 -lt $POS1 ]] ||
693 error "(7) Expect larger than: $POS0, but got $POS1"
695 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
696 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
697 mdd.${MDT_DEV}.lfsck_namespace |
698 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
700 error "(8) unexpected status"
703 run_test 6a "LFSCK resumes from last checkpoint (1)"
708 #define OBD_FAIL_LFSCK_DELAY2 0x1601
709 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
710 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
712 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
713 [ "$STATUS" == "scanning-phase1" ] ||
714 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
716 # Sleep 5 sec to guarantee that we are in the directory scanning
718 # Fail the LFSCK to guarantee there is at least one checkpoint
719 #define OBD_FAIL_LFSCK_FATAL2 0x1609
720 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
721 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
722 mdd.${MDT_DEV}.lfsck_namespace |
723 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
725 error "(4) unexpected status"
728 local O_POS0=$($SHOW_NAMESPACE |
729 awk '/^last_checkpoint_position/ { print $2 }' |
732 local D_POS0=$($SHOW_NAMESPACE |
733 awk '/^last_checkpoint_position/ { print $4 }')
735 #define OBD_FAIL_LFSCK_DELAY2 0x1601
736 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
737 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
739 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
740 [ "$STATUS" == "scanning-phase1" ] ||
741 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
743 local O_POS1=$($SHOW_NAMESPACE |
744 awk '/^latest_start_position/ { print $2 }' |
746 local D_POS1=$($SHOW_NAMESPACE |
747 awk '/^latest_start_position/ { print $4 }')
749 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
750 [[ $O_POS0 -lt $O_POS1 ]] ||
751 error "(7.1) $O_POS1 is not larger than $O_POS0"
753 [[ $D_POS0 -lt $D_POS1 ]] ||
754 error "(7.2) $D_POS1 is not larger than $D_POS0"
757 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
758 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
759 mdd.${MDT_DEV}.lfsck_namespace |
760 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
762 error "(8) unexpected status"
765 run_test 6b "LFSCK resumes from last checkpoint (2)"
772 #define OBD_FAIL_LFSCK_DELAY2 0x1601
773 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
774 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
776 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
777 [ "$STATUS" == "scanning-phase1" ] ||
778 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
780 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
782 echo "stop $SINGLEMDS"
783 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
785 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
786 echo "start $SINGLEMDS"
787 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
788 error "(5) Fail to start MDS!"
790 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
791 mdd.${MDT_DEV}.lfsck_namespace |
792 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
794 error "(6) unexpected status"
797 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
803 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
804 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
805 for ((i = 0; i < 20; i++)); do
806 touch $DIR/$tdir/dummy${i}
809 #define OBD_FAIL_LFSCK_DELAY3 0x1602
810 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
811 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
812 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
813 mdd.${MDT_DEV}.lfsck_namespace |
814 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
816 error "(4) unexpected status"
820 echo "stop $SINGLEMDS"
821 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
823 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
824 echo "start $SINGLEMDS"
825 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
826 error "(6) Fail to start MDS!"
828 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
829 mdd.${MDT_DEV}.lfsck_namespace |
830 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
832 error "(7) unexpected status"
835 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
840 formatall > /dev/null
846 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
847 [ "$STATUS" == "init" ] ||
848 error "(2) Expect 'init', but got '$STATUS'"
850 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
851 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
852 mkdir $DIR/$tdir/crashed
854 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
855 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
856 for ((i = 0; i < 5; i++)); do
857 touch $DIR/$tdir/dummy${i}
860 umount_client $MOUNT || error "(3) Fail to stop client!"
862 #define OBD_FAIL_LFSCK_DELAY2 0x1601
863 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
864 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
866 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
867 [ "$STATUS" == "scanning-phase1" ] ||
868 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
870 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
872 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
873 [ "$STATUS" == "stopped" ] ||
874 error "(7) Expect 'stopped', but got '$STATUS'"
876 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
878 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
879 [ "$STATUS" == "scanning-phase1" ] ||
880 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
882 #define OBD_FAIL_LFSCK_FATAL2 0x1609
883 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
884 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
885 mdd.${MDT_DEV}.lfsck_namespace |
886 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
888 error "(10) unexpected status"
891 #define OBD_FAIL_LFSCK_DELAY1 0x1600
892 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
893 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
895 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
896 [ "$STATUS" == "scanning-phase1" ] ||
897 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
899 #define OBD_FAIL_LFSCK_CRASH 0x160a
900 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
903 echo "stop $SINGLEMDS"
904 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
906 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
907 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
909 echo "start $SINGLEMDS"
910 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
911 error "(14) Fail to start MDS!"
913 local timeout=$(max_recovery_time)
916 while [ $timer -lt $timeout ]; do
917 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
918 mdt.${MDT_DEV}.recovery_status |
919 awk '/^status/ { print \\\$2 }'")
920 [ "$STATUS" != "RECOVERING" ] && break;
925 [ $timer != $timeout ] ||
926 error "(14.1) recovery timeout"
928 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
929 [ "$STATUS" == "crashed" ] ||
930 error "(15) Expect 'crashed', but got '$STATUS'"
932 #define OBD_FAIL_LFSCK_DELAY2 0x1601
933 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
934 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
936 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
937 [ "$STATUS" == "scanning-phase1" ] ||
938 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
940 echo "stop $SINGLEMDS"
941 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
943 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
944 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
946 echo "start $SINGLEMDS"
947 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
948 error "(19) Fail to start MDS!"
951 while [ $timer -lt $timeout ]; do
952 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
953 mdt.${MDT_DEV}.recovery_status |
954 awk '/^status/ { print \\\$2 }'")
955 [ "$STATUS" != "RECOVERING" ] && break;
960 [ $timer != $timeout ] ||
961 error "(19.1) recovery timeout"
963 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
964 [ "$STATUS" == "paused" ] ||
965 error "(20) Expect 'paused', but got '$STATUS'"
967 echo "stop $SINGLEMDS"
968 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
970 echo "start $SINGLEMDS without resume LFSCK"
971 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
972 error "(20.2) Fail to start MDS!"
975 while [ $timer -lt $timeout ]; do
976 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
977 mdt.${MDT_DEV}.recovery_status |
978 awk '/^status/ { print \\\$2 }'")
979 [ "$STATUS" != "RECOVERING" ] && break;
984 [ $timer != $timeout ] ||
985 error "(20.3) recovery timeout"
987 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
988 [ "$STATUS" == "paused" ] ||
989 error "(20.4) Expect 'paused', but got '$STATUS'"
991 #define OBD_FAIL_LFSCK_DELAY3 0x1602
992 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
994 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
995 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
996 mdd.${MDT_DEV}.lfsck_namespace |
997 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
999 error "(22) unexpected status"
1002 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1003 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1004 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1006 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1007 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1008 mdd.${MDT_DEV}.lfsck_namespace |
1009 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1011 error "(24) unexpected status"
1014 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1015 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1017 run_test 8 "LFSCK state machine"
1020 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1021 skip "Testing on UP system, the speed may be inaccurate."
1025 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1026 { skip "Need MDS version >= 2.7.50"; return; }
1028 check_mount_and_prep
1029 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1030 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1031 createmany -o $DIR/$tdir/lfsck/f 5000
1033 local BASE_SPEED1=100
1035 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
1038 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1039 [ "$STATUS" == "scanning-phase1" ] ||
1040 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1042 local SPEED=$($SHOW_LAYOUT |
1043 awk '/^average_speed_phase1/ { print $2 }')
1045 # There may be a timing error; normally it should be less than 2 seconds.
1046 # We also allow another 20% of scheduling error.
1048 # MAX_MARGIN = 1.2 = 12 / 10
1049 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1050 RUN_TIME1 * 12 / 10))
1051 [ $SPEED -lt $MAX_SPEED ] ||
1052 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
1054 # adjust speed limit
1055 local BASE_SPEED2=300
1057 do_facet $SINGLEMDS \
1058 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1061 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1062 # MIN_MARGIN = 0.8 = 8 / 10
1063 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1064 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1065 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
1066 [ $SPEED -gt $MIN_SPEED ] || {
1067 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1068 error_ignore LU-5624 \
1069 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1072 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1076 # MAX_MARGIN = 1.2 = 12 / 10
1077 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1078 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1079 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1080 [ $SPEED -lt $MAX_SPEED ] ||
1081 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
1083 do_facet $SINGLEMDS \
1084 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1086 wait_update_facet $SINGLEMDS \
1087 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1088 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1089 error "(7) Failed to get expected 'completed'"
1091 run_test 9a "LFSCK speed control (1)"
# Test 9b: speed control for the namespace LFSCK (phase 2).
# Prepares files while a linkEA fail_loc is active, runs a first namespace
# scan that is expected to end as "stopped", then restarts the scan with a
# speed limit and checks the reported phase-2 average speed against bounds,
# before and after raising the limit.
#
# Skipped when /proc/cpuinfo has no line matching "processor.*: 1".
1094 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1095 skip "Testing on UP system, the speed may be inaccurate."
# Also skipped when $server_version is older than 2.7.50.
1099 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1100 { skip "Need MDS version >= 2.7.50"; return; }
# Create 50 directories, 50 files, and 50 files inside each directory while
# fail_loc 0x1604 is set on the MDS.
1104 echo "Preparing another 50 * 50 files (with error) at $(date)."
1105 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1106 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1107 createmany -d $DIR/$tdir/d 50
1108 createmany -m $DIR/$tdir/f 50
1109 for ((i = 0; i < 50; i++)); do
1110 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# With fail_loc 0x160c set, the first namespace scan (started with -r) is
# expected to reach status "stopped" within the wait limit of 10.
1113 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1114 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1115 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1116 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1117 mdd.${MDT_DEV}.lfsck_namespace |
1118 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1120 error "(5) unexpected status"
1123 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1124 echo "Prepared at $(date)."
# First speed limit.  The scan is started again without -r, and the status
# is then expected to be "scanning-phase2".
1126 local BASE_SPEED1=50
1128 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1131 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1132 [ "$STATUS" == "scanning-phase2" ] ||
1133 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1135 local SPEED=$($SHOW_NAMESPACE |
1136 awk '/^average_speed_phase2/ { print $2 }')
1137 # There may be time error, normally it should be less than 2 seconds.
1138 # We allow another 20% schedule error.
# Upper bound for the first interval; integer arithmetic, so the 1.2 margin
# is written as "* 12 / 10".  RUN_TIME1 and TIME_DIFF are presumably set
# elsewhere in this test -- TODO confirm.
1140 # MAX_MARGIN = 1.2 = 12 / 10
1141 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1142 RUN_TIME1 * 12 / 10))
1143 [ $SPEED -lt $MAX_SPEED ] ||
1144 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1146 # adjust speed limit
1147 local BASE_SPEED2=150
1149 do_facet $SINGLEMDS \
1150 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1153 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
# Lower bound across both intervals: run-time-weighted limits (each run time
# reduced by TIME_DIFF) over the total run time, scaled by 0.8.
1154 # MIN_MARGIN = 0.8 = 8 / 10
1155 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1156 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1157 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# A too-low speed on a non-ldiskfs MDS is reported through error_ignore with
# ticket LU-5624; the (9.2) message belongs to the remaining (ldiskfs) case.
1158 [ $SPEED -gt $MIN_SPEED ] || {
1159 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1160 error_ignore LU-5624 \
1161 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1164 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
# Upper bound across both intervals, same weighting with a 1.2 margin.
1168 # MAX_MARGIN = 1.2 = 12 / 10
1169 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1170 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1171 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1172 [ $SPEED -lt $MAX_SPEED ] ||
1173 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
# Set the speed limit to 0 (presumably "no limit" -- TODO confirm) and wait,
# with a limit of 32, for the namespace LFSCK status to read "completed".
1175 do_facet $SINGLEMDS \
1176 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1177 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1178 mdd.${MDT_DEV}.lfsck_namespace |
1179 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1181 error "(11) unexpected status"
1184 run_test 9b "LFSCK speed control (2)"
# Test 10: the filesystem stays usable while a namespace LFSCK is scanning.
# Creates 1000 directory/file pairs under two different linkEA fail_loc
# values, starts a rate-limited namespace scan, performs ordinary namespace
# operations while the scan is in phase 1, and finally lets the scan finish.
#
# Skipped unless the MDS backend is ldiskfs.
1188 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1189 skip "lookup(..)/linkea on ZFS issue" && return
# Even-numbered entries (0, 2, ..., 998) are created with fail_loc 0x1603.
1193 echo "Preparing more files with error at $(date)."
1194 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1195 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1197 for ((i = 0; i < 1000; i = $((i+2)))); do
1198 mkdir -p $DIR/$tdir/d${i}
1199 touch $DIR/$tdir/f${i}
1200 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd-numbered entries (1, 3, ..., 999) are created with fail_loc 0x1604.
1203 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1204 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1206 for ((i = 1; i < 1000; i = $((i+2)))); do
1207 mkdir -p $DIR/$tdir/d${i}
1208 touch $DIR/$tdir/f${i}
1209 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1212 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1213 echo "Prepared at $(date)."
# Add a hard link to f200; f200 itself is unlinked later, during the scan.
1215 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1217 umount_client $MOUNT
1218 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the namespace scan with -r and a speed limit of 100 so that it is
# still in phase 1 while the operations below run.
1220 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1223 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1224 [ "$STATUS" == "scanning-phase1" ] ||
1225 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Exercise listing, create, mkdir, unlink, recursive remove, rename,
# hard link and symlink while the scan is running; each must succeed.
1227 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1229 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1231 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1233 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1235 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1237 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1239 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1241 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1242 error "(14) Fail to softlink!"
# The scan is expected to still be in phase 1 after those operations.
1244 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1245 [ "$STATUS" == "scanning-phase1" ] ||
1246 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set the speed limit to 0 (presumably "no limit" -- TODO confirm) and wait,
# with a limit of 32, for the status to read "completed".
1248 do_facet $SINGLEMDS \
1249 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1250 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1251 mdd.${MDT_DEV}.lfsck_namespace |
1252 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1254 error "(16) unexpected status"
1257 run_test 10 "System is available during LFSCK scanning"
# Remove the LAST_ID file (and the d0/0 entry) of one object directory on an
# OST by mounting the OST backend locally and deleting the files.
# Uses ${ost} (OST number, as in "ost${ost}") and ${idx} (directory name
# under O/); these are presumably taken from $1 and $2 -- TODO confirm.
# The caller in this file invokes it as "ost_remove_lastid 1 0".
# Returns: 1 if mount_fstype fails, 2 if unmount_fstype fails.
1260 ost_remove_lastid() {
# Prefix used to run the removal on the OST's node.
1263 local rcmd="do_facet ost${ost}"
1265 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1267 # step 1: local mount
1268 mount_fstype ost${ost} || return 1
1269 # step 2: remove the specified LAST_ID
# Brace expansion yields .../O/${idx}/LAST_ID and .../O/${idx}/d0/0; the
# exit status of rm is not checked.
1270 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1272 unmount_fstype ost${ost} || return 2
# Test 11a: the layout LFSCK on an OST rebuilds a lost LAST_ID file.
# Creates objects on OST index 0, removes that OST's LAST_ID, restarts the
# OST, runs the layout LFSCK on it, and checks that the "crashed_lastid"
# flag shows up during the scan and is gone once the scan has completed.
1276 check_mount_and_prep
# Place all 64 files on OST index 0 with a single stripe.
1277 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1278 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Delete LAST_ID for directory O/0 on ost1.
1283 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1285 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1286 error "(2) Fail to start ost1"
# fail_loc 0x160e with fail_val=3 is set before the scan so that the flags
# can be observed while the scan is in progress (presumably it delays the
# scan -- TODO confirm).
1288 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1289 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1291 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1292 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# Wait, with a limit of 60, for the flags field to read "crashed_lastid".
1294 wait_update_facet ost1 "$LCTL get_param -n \
1295 obdfilter.${OST_DEV}.lfsck_layout |
1296 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1298 error "(5) unexpected status"
# Clear the fail_loc and wait, with a limit of 32, for "completed".
1301 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1303 wait_update_facet ost1 "$LCTL get_param -n \
1304 obdfilter.${OST_DEV}.lfsck_layout |
1305 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1307 error "(6) unexpected status"
# After completion the flags field must be empty.
1310 echo "the LAST_ID(s) should have been rebuilt"
1311 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1312 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1314 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b: the layout LFSCK on an OST rebuilds a stale on-disk LAST_ID.
# Creates objects while fail_loc 0x160d is set on ost1, records the in-memory
# last_id, restarts the servers, verifies that the last_id read back is
# smaller, runs the layout LFSCK on ost1, restarts ost1 again and checks that
# last_id equals the recorded value.
1317 check_mount_and_prep
1318 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1320 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1321 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1322 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 files more than the value returned by precreated_ost_obj_count.
1324 local count=$(precreated_ost_obj_count 0 0)
1326 createmany -o $DIR/$tdir/f $((count + 32))
# Read the sequence from the MDS-side osp device, then pick the matching
# line of ost1's last_id output and keep the field after the ':'.
1328 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1329 local seq=$(do_facet mds1 $LCTL get_param -n \
1330 osp.${proc_path}.prealloc_last_seq)
1331 local lastid1=$(do_facet ost1 "lctl get_param -n \
1332 obdfilter.${ost1_svc}.last_id" | grep $seq |
1333 awk -F: '{ print $2 }')
1335 umount_client $MOUNT
1336 stop ost1 || error "(1) Fail to stop ost1"
1338 # stop MDS to forget last precreated object
1339 echo "stop $SINGLEMDS"
1340 stop $SINGLEMDS > /dev/null || error "(11) Fail to stop MDS!"
1341 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1342 echo "start $SINGLEMDS"
1343 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
1344 error "(12) Fail to start MDS!"
# fail_loc 0x215 is set on ost1 before it is started again.
1346 #define OBD_FAIL_OST_ENOSPC 0x215
1347 do_facet ost1 $LCTL set_param fail_loc=0x215
1349 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1350 error "(2) Fail to start ost1"
# Poll up to 60 times until last_id for $seq can be read back from ost1.
1352 for ((i = 0; i < 60; i++)); do
1353 lastid2=$(do_facet ost1 "lctl get_param -n \
1354 obdfilter.${ost1_svc}.last_id" | grep $seq |
1355 awk -F: '{ print $2 }')
1356 [ ! -z $lastid2 ] && break;
1360 echo "the on-disk LAST_ID should be smaller than the expected one"
1361 [ $lastid1 -gt $lastid2 ] ||
1362 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1364 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1365 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1367 wait_update_facet ost1 "$LCTL get_param -n \
1368 obdfilter.${OST_DEV}.lfsck_layout |
1369 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1371 error "(6) unexpected status"
# Restart ost1 so that last_id is read again after the repair.
1374 stop ost1 || error "(7) Fail to stop ost1"
1376 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1377 error "(8) Fail to start ost1"
# Wait, with a limit of 60, for last_id to equal $lastid1; on failure the
# raw last_id output is printed before the error is raised.
1379 echo "the on-disk LAST_ID should have been rebuilt"
1380 wait_update_facet ost1 "$LCTL get_param -n \
1381 obdfilter.${ost1_svc}.last_id | grep $seq |
1382 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1383 do_facet ost1 $LCTL get_param -n \
1384 obdfilter.${ost1_svc}.last_id
1385 error "(9) expect lastid1 $seq:$lastid1"
# Clear the fail_loc and stop all targets.
1388 do_facet ost1 $LCTL set_param fail_loc=0
1389 stopall || error "(10) Fail to stopall"
1391 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12a: one lctl command (-A) drives LFSCK on all targets.
# For the namespace type and then the layout type: start on all targets with
# "-s 1", expect 'scanning-phase1', stop with one command, expect 'stopped',
# restart with "-s 0" and expect 'completed'.
#
# Skipped when fewer than 2 MDSes are configured.
# The trailing number passed to wait_all_targets / wait_all_targets_blocked
# follows the numbering of the error steps; it appears to be the step id
# used in failure messages -- TODO confirm.
1394 [ $MDSCOUNT -lt 2 ] &&
1395 skip "We need at least 2 MDSes for test_12a" && return
1397 check_mount_and_prep
# One directory per MDT (index k - 1), each holding 100 files.
1398 for k in $(seq $MDSCOUNT); do
1399 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1400 createmany -o $DIR/$tdir/${k}/f 100 ||
1401 error "(0) Fail to create 100 files."
1404 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1405 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1406 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1408 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1409 wait_all_targets namespace scanning-phase1 3
1411 echo "Stop namespace LFSCK on all targets by single lctl command."
1412 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1413 error "(4) Fail to stop LFSCK on all devices!"
1415 echo "All the LFSCK targets should be in 'stopped' status."
1416 wait_all_targets_blocked namespace stopped 5
1418 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1419 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1420 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1422 echo "All the LFSCK targets should be in 'completed' status."
1423 wait_all_targets_blocked namespace completed 7
# The layout half of the test runs between start_full_debug_logging and
# stop_full_debug_logging.
1425 start_full_debug_logging
1427 echo "Start layout LFSCK on all targets by single command (-s 1)."
1428 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1429 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1431 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1432 wait_all_targets layout scanning-phase1 9
1434 echo "Stop layout LFSCK on all targets by single lctl command."
1435 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1436 error "(10) Fail to stop LFSCK on all devices!"
1438 echo "All the LFSCK targets should be in 'stopped' status."
1439 wait_all_targets_blocked layout stopped 11
# Additionally check the layout LFSCK status on every OST directly.
1441 for k in $(seq $OSTCOUNT); do
1442 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1443 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1444 awk '/^status/ { print $2 }')
1445 [ "$STATUS" == "stopped" ] ||
1446 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1449 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1450 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1451 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1453 echo "All the LFSCK targets should be in 'completed' status."
1454 wait_all_targets_blocked layout completed 14
1456 stop_full_debug_logging
1458 run_test 12a "single command to trigger LFSCK on all devices"
# Test 12b: lfsck_start without an explicit '-M' device.
# Starts LFSCK on mds1 with only "-A -r" and expects both the namespace and
# the layout scans to reach 'completed'.  If mds1 hosts more than one MDT
# device, a start with neither '-M' nor '-A' is expected to fail.
1461 check_mount_and_prep
1463 echo "Start LFSCK without '-M' specified."
1464 do_facet mds1 $LCTL lfsck_start -A -r ||
1465 error "(0) Fail to start LFSCK without '-M'"
1467 wait_all_targets_blocked namespace completed 1
1468 wait_all_targets_blocked layout completed 2
# Count the entries of "lctl dl" on mds1 whose third column contains "mdt".
1470 local count=$(do_facet mds1 $LCTL dl |
1471 awk '{ print $3 }' | grep mdt | wc -l)
1472 if [ $count -gt 1 ]; then
1474 echo "Start layout LFSCK on the node with multipe targets,"
1475 echo "but not specify '-M'/'-A' option. Should get failure."
# A successful start is the error case here; the trailing "|| true" makes
# the command list end with status 0 when the start fails as expected.
1477 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1478 error "(3) Start layout LFSCK should fail" || true
1481 run_test 12b "auto detect Lustre device"
# Test 13: the layout LFSCK repairs a bad lmm_oi in the layout EA.
# Creates 32 files while fail_loc 0x160f is set on the MDS, runs the layout
# LFSCK, and expects "repaired_others" to equal 32.
1485 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1486 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1487 echo "MDT-object FID."
1490 check_mount_and_prep
# The fail_loc is active only while the 32 files are created.
1492 echo "Inject failure stub to simulate bad lmm_oi"
1493 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1494 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1495 createmany -o $DIR/$tdir/f 32
1496 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1498 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1499 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
# Wait, with a limit of 32, for the layout LFSCK status "completed".
1501 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1502 mdd.${MDT_DEV}.lfsck_layout |
1503 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1505 error "(2) unexpected status"
# One repair is expected per file created under the fail_loc.
1508 local repaired=$($SHOW_LAYOUT |
1509 awk '/^repaired_others/ { print $2 }')
1510 [ $repaired -eq 32 ] ||
1511 error "(3) Fail to repair crashed lmm_oi: $repaired"
1513 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14: the layout LFSCK handles MDT-objects with dangling references.
# Creates files while fail_loc 0x1610 is set on ost1, checks that listing the
# directory fails, runs a first layout LFSCK (which counts the dangling
# references but leaves 'stat' failing), then runs it again with -c and
# expects 'stat' on the guard file to succeed with size 0.
1517 echo "The OST-object referenced by the MDT-object should be there;"
1518 echo "otherwise, the LFSCK should re-create the missing OST-object."
1521 check_mount_and_prep
1522 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# While the fail_loc is set, create (precreated count + 31) files plus the
# "guard" file.
1524 echo "Inject failure stub to simulate dangling referenced MDT-object"
1525 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1526 do_facet ost1 $LCTL set_param fail_loc=0x1610
1527 local count=$(precreated_ost_obj_count 0 0)
1529 createmany -o $DIR/$tdir/f $((count + 31))
1530 touch $DIR/$tdir/guard
1531 do_facet ost1 $LCTL set_param fail_loc=0
1533 start_full_debug_logging
1535 # exhaust other pre-created dangling cases
1536 count=$(precreated_ost_obj_count 0 0)
1537 createmany -o $DIR/$tdir/a $count ||
1538 error "(0) Fail to create $count files."
# A successful 'ls' is the error case here.
1540 echo "'ls' should fail because of dangling referenced MDT-object"
1541 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1543 echo "Trigger layout LFSCK to find out dangling reference"
1544 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1546 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1547 mdd.${MDT_DEV}.lfsck_layout |
1548 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1550 error "(3) unexpected status"
# At least 32 dangling references must be reported after the first scan.
1553 local repaired=$($SHOW_LAYOUT |
1554 awk '/^repaired_dangling/ { print $2 }')
1555 [ $repaired -ge 32 ] ||
1556 error "(4) Fail to repair dangling reference: $repaired"
# A successful 'stat' is the error case here: the first scan ran without -c.
1558 echo "'stat' should fail because of not repair dangling by default"
1559 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second scan, this time with -c added.
1561 echo "Trigger layout LFSCK to repair dangling reference"
1562 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1564 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1565 mdd.${MDT_DEV}.lfsck_layout |
1566 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1568 error "(7) unexpected status"
1571 # There may be some async LFSCK updates in processing, wait for
1572 # a while until the target reparation has been done. LU-4970.
# Poll 'stat' on the client, with a limit of 32, until the value printed
# after "Size" is 0; on failure the raw stat output is printed first.
1574 echo "'stat' should success after layout LFSCK repairing"
1575 wait_update_facet client "stat $DIR/$tdir/guard |
1576 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1577 stat $DIR/$tdir/guard
1579 error "(8) unexpected size"
1582 repaired=$($SHOW_LAYOUT |
1583 awk '/^repaired_dangling/ { print $2 }')
1584 [ $repaired -ge 32 ] ||
1585 error "(9) Fail to repair dangling reference: $repaired"
1587 stop_full_debug_logging
1589 run_test 14 "LFSCK can repair MDT-object with dangling reference"
# Test 15a: the layout LFSCK repairs an OST-object whose back pointer refers
# to a non-existent MDT-object.
# Writes one file while fail_loc 0x1611 is set on ost1, runs the layout
# LFSCK, and expects "repaired_unmatched_pair" to equal 1.
1593 echo "If the OST-object referenced by the MDT-object back points"
1594 echo "to some non-exist MDT-object, then the LFSCK should repair"
1595 echo "the OST-object to back point to the right MDT-object."
1598 check_mount_and_prep
1599 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1601 echo "Inject failure stub to make the OST-object to back point to"
1602 echo "non-exist MDT-object."
1603 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
# Write 1 MiB to f0 and cancel the osc locks while the fail_loc is still
# set, then clear it.
1605 do_facet ost1 $LCTL set_param fail_loc=0x1611
1606 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1607 cancel_lru_locks osc
1608 do_facet ost1 $LCTL set_param fail_loc=0
1610 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1611 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1613 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1614 mdd.${MDT_DEV}.lfsck_layout |
1615 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1617 error "(2) unexpected status"
# Exactly one repair is expected, for the single file written above.
1620 local repaired=$($SHOW_LAYOUT |
1621 awk '/^repaired_unmatched_pair/ { print $2 }')
1622 [ $repaired -eq 1 ] ||
1623 error "(3) Fail to repair unmatched pair: $repaired"
1625 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b: the layout LFSCK repairs an OST-object whose back pointer refers
# to a different MDT-object.
# Writes a "guard" file normally, then writes f0 while fail_loc 0x1612 is set
# on ost1, runs the layout LFSCK, and expects "repaired_unmatched_pair" to
# equal 1.
1629 echo "If the OST-object referenced by the MDT-object back points"
1630 echo "to other MDT-object that doesn't recognize the OST-object,"
1631 echo "then the LFSCK should repair it to back point to the right"
1632 echo "MDT-object (the first one)."
1635 check_mount_and_prep
1636 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# The guard file is written before the fail_loc is set.
1637 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1638 cancel_lru_locks osc
1640 echo "Inject failure stub to make the OST-object to back point to"
1641 echo "other MDT-object"
1643 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1644 do_facet ost1 $LCTL set_param fail_loc=0x1612
1645 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1646 cancel_lru_locks osc
1647 do_facet ost1 $LCTL set_param fail_loc=0
1649 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1650 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1652 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1653 mdd.${MDT_DEV}.lfsck_layout |
1654 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1656 error "(2) unexpected status"
# Exactly one repair is expected, for f0.
1659 local repaired=$($SHOW_LAYOUT |
1660 awk '/^repaired_unmatched_pair/ { print $2 }')
1661 [ $repaired -eq 1 ] ||
1662 error "(3) Fail to repair unmatched pair: $repaired"
1664 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c: the layout LFSCK racing with a delayed directory migration.
# Starts a migration of a1 to MDT index 0 in the background while
# fail_loc 0x1803 (fail_val=5) is set on mds2, runs the layout LFSCK on all
# targets, and checks the repair counters afterwards.
#
# Skipped with fewer than 2 MDSes, and on MDS versions >= 2.7.55 (LU-6437).
1667 [ $MDSCOUNT -lt 2 ] &&
1668 skip "We need at least 2 MDSes for this test" && return
1670 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1671 skip "Skip the test after 2.7.55 see LU-6437" && return
1674 echo "According to current metadata migration implementation,"
1675 echo "before the old MDT-object is removed, both the new MDT-object"
1676 echo "and old MDT-object will reference the same LOV layout. Then if"
1677 echo "the layout LFSCK finds the new MDT-object by race, it will"
1678 echo "regard related OST-object(s) as multiple referenced case, and"
1679 echo "will try to create new OST-object(s) for the new MDT-object."
1680 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1681 echo "MDT-object before confirm the multiple referenced case."
1684 check_mount_and_prep
# Directory a1 is created with "-i 1"; its file f1 is a 1 MiB single-stripe
# file on OST index 0.
1685 $LFS mkdir -i 1 $DIR/$tdir/a1
1686 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1687 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1688 cancel_lru_locks osc
1690 echo "Inject failure stub on MDT1 to delay the migration"
1692 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1693 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1694 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
# The migration runs as a background job so the scan below can overlap it.
1695 $LFS migrate -m 0 $DIR/$tdir/a1 &
1698 echo "Trigger layout LFSCK to race with the migration"
1699 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1701 wait_all_targets_blocked layout completed 2
1703 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Expected counters: one unmatched pair repaired, no multiple-reference
# repairs.
1704 local repaired=$($SHOW_LAYOUT |
1705 awk '/^repaired_unmatched_pair/ { print $2 }')
1706 [ $repaired -eq 1 ] ||
1707 error "(3) Fail to repair unmatched pair: $repaired"
1709 repaired=$($SHOW_LAYOUT |
1710 awk '/^repaired_multiple_referenced/ { print $2 }')
1711 [ $repaired -eq 0 ] ||
1712 error "(4) Unexpectedly repaird multiple references: $repaired"
1714 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16: the layout LFSCK repairs an OST-object owner that differs from
# the MDT-object owner.
# Changes the owner of f0 while fail_loc 0x1613 is set on the MDS, runs the
# layout LFSCK, and expects "repaired_inconsistent_owner" to equal 1.
1718 echo "If the OST-object's owner information does not match the owner"
1719 echo "information stored in the MDT-object, then the LFSCK trust the"
1720 echo "MDT-object and update the OST-object's owner information."
1723 check_mount_and_prep
1724 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1725 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1726 cancel_lru_locks osc
# The fail_loc is active only for the chown to uid 1 / gid 1.
1728 echo "Inject failure stub to skip OST-object owner changing"
1729 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1730 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1731 chown 1.1 $DIR/$tdir/f0
1732 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1734 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1737 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1739 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1740 mdd.${MDT_DEV}.lfsck_layout |
1741 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1743 error "(2) unexpected status"
# Exactly one repair is expected, for f0.
1746 local repaired=$($SHOW_LAYOUT |
1747 awk '/^repaired_inconsistent_owner/ { print $2 }')
1748 [ $repaired -eq 1 ] ||
1749 error "(3) Fail to repair inconsistent owner: $repaired"
1751 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17: the layout LFSCK repairs two MDT-objects that reference the same
# OST-object.
# With fail_loc 0x1614 set on the MDS, writes a 1 MiB "guard" file and then
# creates f0; f0 is expected to show guard's size.  After the layout LFSCK,
# "repaired_multiple_referenced" must be 1 and writing 2 MiB to f0 must leave
# guard's size unchanged.
1755 echo "If more than one MDT-objects reference the same OST-object,"
1756 echo "and the OST-object only recognizes one MDT-object, then the"
1757 echo "LFSCK should create new OST-objects for such non-recognized"
1761 check_mount_and_prep
1762 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1764 echo "Inject failure stub to make two MDT-objects to refernce"
1765 echo "the OST-object"
1767 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1768 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1770 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1771 cancel_lru_locks osc
# "createmany -o .../f 1" creates the single file f0.
1773 createmany -o $DIR/$tdir/f 1
1775 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Drop cached locks on both mdc and osc before the size is read.
1777 cancel_lru_locks mdc
1778 cancel_lru_locks osc
# f0 was created empty, so a size of 1048576 means it shares guard's object.
1780 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1781 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1782 [ $size -eq 1048576 ] ||
1783 error "(1) f0 (wrong) size should be 1048576, but got $size"
1785 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1788 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1790 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1791 mdd.${MDT_DEV}.lfsck_layout |
1792 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1794 error "(3) unexpected status"
1797 local repaired=$($SHOW_LAYOUT |
1798 awk '/^repaired_multiple_referenced/ { print $2 }')
1799 [ $repaired -eq 1 ] ||
1800 error "(4) Fail to repair multiple references: $repaired"
# Writing 2 MiB to f0 must not change guard, which has to stay at 1 MiB.
1802 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1803 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1804 error "(5) Fail to write f0."
1805 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1806 [ $size -eq 1048576 ] ||
1807 error "(6) guard size should be 1048576, but got $size"
1809 run_test 17 "LFSCK can repair multiple references"
# Set the local lctl "debug" parameter to "+cache" (presumably adds the
# cache flag to the debug mask -- TODO confirm); command output is discarded.
1811 $LCTL set_param debug=+cache > /dev/null
# Test 18a: the layout LFSCK regenerates lost layout EA entries from orphan
# OST-objects.
# Writes f1 (and f2 under a second directory when at least 2 MDSes exist),
# changes their owner while fail_loc 0x1615 is set, verifies the visible
# sizes are wrong, runs the layout LFSCK with -o, and verifies the repair
# counters and the restored sizes.
1815 echo "The target MDT-object is there, but related stripe information"
1816 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1817 echo "layout EA entries."
1820 check_mount_and_prep
# f1: 2 MiB, single stripe on OST index 0, in a directory created with -i 0.
1821 $LFS mkdir -i 0 $DIR/$tdir/a1
1822 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1823 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With "ls -il" the inode number is the first column, so the size is $6.
1825 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1827 $LFS path2fid $DIR/$tdir/a1/f1
1828 $LFS getstripe $DIR/$tdir/a1/f1
# f2: 2 MiB, two stripes starting at OST index 1, in a directory created
# with -i 1.
1830 if [ $MDSCOUNT -ge 2 ]; then
1831 $LFS mkdir -i 1 $DIR/$tdir/a2
1832 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1833 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1834 $LFS path2fid $DIR/$tdir/a2/f2
1835 $LFS getstripe $DIR/$tdir/a2/f2
1838 cancel_lru_locks osc
# The chown calls run while fail_loc 0x1615 is set on the respective MDS.
1840 echo "Inject failure, to make the MDT-object lost its layout EA"
1841 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1842 do_facet mds1 $LCTL set_param fail_loc=0x1615
1843 chown 1.1 $DIR/$tdir/a1/f1
1845 if [ $MDSCOUNT -ge 2 ]; then
1846 do_facet mds2 $LCTL set_param fail_loc=0x1615
1847 chown 1.1 $DIR/$tdir/a2/f2
1853 do_facet mds1 $LCTL set_param fail_loc=0
1854 if [ $MDSCOUNT -ge 2 ]; then
1855 do_facet mds2 $LCTL set_param fail_loc=0
1858 cancel_lru_locks mdc
1859 cancel_lru_locks osc
# Both files were written with the same dd arguments, so f2 is compared
# against f1's saved size as well.
1861 echo "The file size should be incorrect since layout EA is lost"
1862 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1863 [ "$cur_size" != "$saved_size" ] ||
1864 error "(1) Expect incorrect file1 size"
1866 if [ $MDSCOUNT -ge 2 ]; then
1867 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1868 [ "$cur_size" != "$saved_size" ] ||
1869 error "(2) Expect incorrect file2 size"
# -o is added to the start command here (presumably enables orphan
# handling -- TODO confirm against the lctl lfsck_start documentation).
1872 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1873 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
1875 for k in $(seq $MDSCOUNT); do
1876 # The LFSCK status query internal is 30 seconds. For the case
1877 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1878 # time to guarantee the status sync up.
1879 wait_update_facet mds${k} "$LCTL get_param -n \
1880 mdd.$(facet_svc mds${k}).lfsck_layout |
1881 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1882 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well; this is a single read, not a
# wait loop.
1885 for k in $(seq $OSTCOUNT); do
1886 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1887 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1888 awk '/^status/ { print $2 }')
1889 [ "$cur_status" == "completed" ] ||
1890 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected "repaired_orphan" counts: 1 on mds1 and 2 on mds2; f1 has one
# stripe and f2 has two, so presumably one repair per stripe.
1893 local repaired=$(do_facet mds1 $LCTL get_param -n \
1894 mdd.$(facet_svc mds1).lfsck_layout |
1895 awk '/^repaired_orphan/ { print $2 }')
1896 [ $repaired -eq 1 ] ||
1897 error "(6.1) Expect 1 fixed on mds1, but got: $repaired"
1899 if [ $MDSCOUNT -ge 2 ]; then
1900 repaired=$(do_facet mds2 $LCTL get_param -n \
1901 mdd.$(facet_svc mds2).lfsck_layout |
1902 awk '/^repaired_orphan/ { print $2 }')
1903 [ $repaired -eq 2 ] ||
1904 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print FID and layout again for the log.
1907 $LFS path2fid $DIR/$tdir/a1/f1
1908 $LFS getstripe $DIR/$tdir/a1/f1
1910 if [ $MDSCOUNT -ge 2 ]; then
1911 $LFS path2fid $DIR/$tdir/a2/f2
1912 $LFS getstripe $DIR/$tdir/a2/f2
1915 echo "The file size should be correct after layout LFSCK scanning"
1916 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1917 [ "$cur_size" == "$saved_size" ] ||
1918 error "(7) Expect file1 size $saved_size, but got $cur_size"
1920 if [ $MDSCOUNT -ge 2 ]; then
1921 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1922 [ "$cur_size" == "$saved_size" ] ||
1923 error "(8) Expect file2 size $saved_size, but got $cur_size"
1926 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test 18b: the layout LFSCK re-creates a lost MDT-object under
# .lustre/lost+found/MDTxxxx.
# Writes f1 (and f2 when at least 2 MDSes exist), removes them while
# fail_loc 0x1616 is set, runs the layout LFSCK with -o, moves the
# re-created "<fid>-R-0" entries back to their original paths, and verifies
# the file sizes.
1930 echo "The target MDT-object is lost. The LFSCK should re-create the"
1931 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
1932 echo "can move it back to normal namespace manually."
1935 check_mount_and_prep
1936 $LFS mkdir -i 0 $DIR/$tdir/a1
1937 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1938 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With "ls -il" the inode number is the first column, so the size is $6.
# fid1 is used later to build the lost+found entry name.
1939 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1940 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
1942 $LFS getstripe $DIR/$tdir/a1/f1
# NOTE(review): fid2 is assigned without a visible "local" declaration, so
# it may leak into the caller's scope -- confirm.
1944 if [ $MDSCOUNT -ge 2 ]; then
1945 $LFS mkdir -i 1 $DIR/$tdir/a2
1946 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1947 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1948 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
1950 $LFS getstripe $DIR/$tdir/a2/f2
1953 cancel_lru_locks osc
# The files are removed while fail_loc 0x1616 is set on the respective MDS.
1955 echo "Inject failure, to simulate the case of missing the MDT-object"
1956 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1957 do_facet mds1 $LCTL set_param fail_loc=0x1616
1958 rm -f $DIR/$tdir/a1/f1
1960 if [ $MDSCOUNT -ge 2 ]; then
1961 do_facet mds2 $LCTL set_param fail_loc=0x1616
1962 rm -f $DIR/$tdir/a2/f2
1968 do_facet mds1 $LCTL set_param fail_loc=0
1969 if [ $MDSCOUNT -ge 2 ]; then
1970 do_facet mds2 $LCTL set_param fail_loc=0
1973 cancel_lru_locks mdc
1974 cancel_lru_locks osc
# -o is added to the start command here (presumably enables orphan
# handling -- TODO confirm against the lctl lfsck_start documentation).
1976 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1977 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1979 for k in $(seq $MDSCOUNT); do
1980 # The LFSCK status query internal is 30 seconds. For the case
1981 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1982 # time to guarantee the status sync up.
1983 wait_update_facet mds${k} "$LCTL get_param -n \
1984 mdd.$(facet_svc mds${k}).lfsck_layout |
1985 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1986 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well; this is a single read, not a
# wait loop.
1989 for k in $(seq $OSTCOUNT); do
1990 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1991 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1992 awk '/^status/ { print $2 }')
1993 [ "$cur_status" == "completed" ] ||
1994 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected "repaired_orphan" counts: 1 on mds1 and 2 on mds2; f1 has one
# stripe and f2 has two, so presumably one repair per stripe.
1997 local repaired=$(do_facet mds1 $LCTL get_param -n \
1998 mdd.$(facet_svc mds1).lfsck_layout |
1999 awk '/^repaired_orphan/ { print $2 }')
2000 [ $repaired -eq 1 ] ||
2001 error "(4.1) Expect 1 fixed on mds1, but got: $repaired"
2003 if [ $MDSCOUNT -ge 2 ]; then
2004 repaired=$(do_facet mds2 $LCTL get_param -n \
2005 mdd.$(facet_svc mds2).lfsck_layout |
2006 awk '/^repaired_orphan/ { print $2 }')
2007 [ $repaired -eq 2 ] ||
2008 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# The re-created entries are named "<original FID>-R-0" and live under the
# per-MDT lost+found directory (MDT0000 for f1, MDT0001 for f2).
2011 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2012 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2013 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2015 if [ $MDSCOUNT -ge 2 ]; then
2016 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2017 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Print FID and layout of the restored files for the log.
2020 $LFS path2fid $DIR/$tdir/a1/f1
2021 $LFS getstripe $DIR/$tdir/a1/f1
2023 if [ $MDSCOUNT -ge 2 ]; then
2024 $LFS path2fid $DIR/$tdir/a2/f2
2025 $LFS getstripe $DIR/$tdir/a2/f2
# Both files were written with the same dd arguments, so f2 is compared
# against f1's saved size as well.
2028 echo "The file size should be correct after layout LFSCK scanning"
2029 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2030 [ "$cur_size" == "$saved_size" ] ||
2031 error "(7) Expect file1 size $saved_size, but got $cur_size"
2033 if [ $MDSCOUNT -ge 2 ]; then
2034 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2035 [ "$cur_size" == "$saved_size" ] ||
2036 error "(8) Expect file2 size $saved_size, but got $cur_size"
2039 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test 18c: orphan OST-object repair when the MDT-object is lost and the
# OST-object was written while fail_loc 0x1617 (OBD_FAIL_LFSCK_NOPFID) was
# set on ost1.
# Flow: create the files, unlink them with fail_loc 0x1616 set on the
# MDS(es), run layout LFSCK with "-r -o", then check the repaired_orphan
# counters and look for "*-N-*" stubs under .lustre/lost+found/MDTxxxx.
2043 echo "The target MDT-object is lost, and the OST-object FID is missing."
2044 echo "The LFSCK should re-create the MDT-object with new FID under the "
2045 echo "directory .lustre/lost+found/MDTxxxx."
2048 check_mount_and_prep
# a1 lives on MDT index 0; new files below it get one 1M stripe on OST index 0.
2049 $LFS mkdir -i 0 $DIR/$tdir/a1
2050 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2052 echo "Inject failure, to simulate the case of missing parent FID"
2053 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2054 do_facet ost1 $LCTL set_param fail_loc=0x1617
2056 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2057 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDTs, repeat the setup for a directory on MDT index 1.
2059 if [ $MDSCOUNT -ge 2 ]; then
2060 $LFS mkdir -i 1 $DIR/$tdir/a2
2061 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a2
2062 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2063 $LFS getstripe $DIR/$tdir/a2/f2
2066 cancel_lru_locks osc
2068 echo "Inject failure, to simulate the case of missing the MDT-object"
2069 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2070 do_facet mds1 $LCTL set_param fail_loc=0x1616
2071 rm -f $DIR/$tdir/a1/f1
2073 if [ $MDSCOUNT -ge 2 ]; then
2074 do_facet mds2 $LCTL set_param fail_loc=0x1616
2075 rm -f $DIR/$tdir/a2/f2
# Clear the MDS-side failure injection before starting LFSCK.
2081 do_facet mds1 $LCTL set_param fail_loc=0
2082 if [ $MDSCOUNT -ge 2 ]; then
2083 do_facet mds2 $LCTL set_param fail_loc=0
2086 cancel_lru_locks mdc
2087 cancel_lru_locks osc
2089 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2090 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2092 for k in $(seq $MDSCOUNT); do
2093 # The LFSCK status query interval is 30 seconds. For the case
2094 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2095 # time to guarantee the status sync up.
2096 wait_update_facet mds${k} "$LCTL get_param -n \
2097 mdd.$(facet_svc mds${k}).lfsck_layout |
2098 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2099 error "(2) MDS${k} is not the expected 'completed'"
# Each OST is checked once (no waiting) for the 'completed' status.
2102 for k in $(seq $OSTCOUNT); do
2103 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2104 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2105 awk '/^status/ { print $2 }')
2106 [ "$cur_status" == "completed" ] ||
2107 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): $expected is presumably assigned inside this MDSCOUNT
# branch; the assignment is not visible here - confirm its values.
2110 if [ $MDSCOUNT -ge 2 ]; then
2116 local repaired=$(do_facet mds1 $LCTL get_param -n \
2117 mdd.$(facet_svc mds1).lfsck_layout |
2118 awk '/^repaired_orphan/ { print $2 }')
2119 [ $repaired -eq $expected ] ||
2120 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# mds2 is expected to report zero repaired orphans.
2122 if [ $MDSCOUNT -ge 2 ]; then
2123 repaired=$(do_facet mds2 $LCTL get_param -n \
2124 mdd.$(facet_svc mds2).lfsck_layout |
2125 awk '/^repaired_orphan/ { print $2 }')
2126 [ $repaired -eq 0 ] ||
2127 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2130 ls -ail $MOUNT/.lustre/lost+found/
2132 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2133 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
# The -name pattern is quoted so the local shell cannot glob-expand it
# against the current directory before find receives it.
2134 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-N-*")
2136 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2139 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2140 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2141 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Same quoting as above: find must see the literal "*-N-*" pattern.
2143 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-N-*")
2144 [ ! -z "$cname" ] ||
2145 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2147 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d: an orphan OST-object competes with a newly created, never
# modified OST-object for the same layout EA slot.
# Flow: make f2 reference f1's OST-object (fail_loc 0x1618 + chown), remove
# f1 so that f2 dangles, restart the filesystem, run layout LFSCK with
# "-r -o -c", then expect repaired_orphan == 1 and f2's size to equal the
# size saved before the corruption.
2151 echo "The target MDT-object layout EA slot is occpuied by some new"
2152 echo "created OST-object when repair dangling reference case. Such"
2153 echo "conflict OST-object has never been modified. Then when found"
2154 echo "the orphan OST-object, LFSCK will replace it with the orphan"
2158 check_mount_and_prep
2160 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2161 echo "guard" > $DIR/$tdir/a1/f1
2162 echo "foo" > $DIR/$tdir/a1/f2
# Field 6 of "ls -il" is the file size; it is compared again after LFSCK.
2163 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2164 $LFS path2fid $DIR/$tdir/a1/f1
2165 $LFS getstripe $DIR/$tdir/a1/f1
2166 $LFS path2fid $DIR/$tdir/a1/f2
2167 $LFS getstripe $DIR/$tdir/a1/f2
2168 cancel_lru_locks osc
2170 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2171 echo "to reference the same OST-object (which is f1's OST-obejct)."
2172 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2173 echo "dangling reference case, but f2's old OST-object is there."
2176 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2177 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
# Use the POSIX owner:group separator; the legacy "1.1" dot form is a
# deprecated extension that newer chown implementations warn about.
2178 chown 1:1 $DIR/$tdir/a1/f2
2179 rm -f $DIR/$tdir/a1/f1
2182 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2184 echo "stopall to cleanup object cache"
2187 setupall > /dev/null
2189 echo "The file size should be incorrect since dangling referenced"
2190 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2191 [ "$cur_size" != "$saved_size" ] ||
2192 error "(1) Expect incorrect file2 size"
2194 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2195 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2197 for k in $(seq $MDSCOUNT); do
2198 # The LFSCK status query interval is 30 seconds. For the case
2199 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2200 # time to guarantee the status sync up.
2201 wait_update_facet mds${k} "$LCTL get_param -n \
2202 mdd.$(facet_svc mds${k}).lfsck_layout |
2203 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2204 error "(3) MDS${k} is not the expected 'completed'"
# Each OST is checked once (no waiting) for the 'completed' status.
2207 for k in $(seq $OSTCOUNT); do
2208 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2209 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2210 awk '/^status/ { print $2 }')
2211 [ "$cur_status" == "completed" ] ||
2212 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
2215 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2216 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2217 awk '/^repaired_orphan/ { print $2 }')
2218 [ $repaired -eq 1 ] ||
2219 error "(5) Expect 1 orphan has been fixed, but got: $repaired"
2221 echo "The file size should be correct after layout LFSCK scanning"
2222 cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2223 [ "$cur_size" == "$saved_size" ] ||
2224 error "(6) Expect file2 size $saved_size, but got $cur_size"
2226 echo "The LFSCK should find back the original data."
2227 cat $DIR/$tdir/a1/f2
2228 $LFS path2fid $DIR/$tdir/a1/f2
2229 $LFS getstripe $DIR/$tdir/a1/f2
2231 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e: same corruption as test 18d, but f2 is appended to while the
# layout LFSCK on mds1 reports 'scanning-phase2', so the conflicting
# OST-object has been modified.
# Expected outcome, as checked below: repaired_orphan == 1, a "*-C-*" stub
# exists under .lustre/lost+found/MDT0000 and its size equals f2's original
# size, while f2 itself keeps the newly written data.
2235 echo "The target MDT-object layout EA slot is occpuied by some new"
2236 echo "created OST-object when repair dangling reference case. Such"
2237 echo "conflict OST-object has been modified by others. To keep the"
2238 echo "new data, the LFSCK will create a new file to refernece this"
2239 echo "old orphan OST-object."
2242 check_mount_and_prep
2244 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2245 echo "guard" > $DIR/$tdir/a1/f1
2246 echo "foo" > $DIR/$tdir/a1/f2
# Field 6 of "ls -il" is the file size; it is compared again after LFSCK.
2247 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2248 $LFS path2fid $DIR/$tdir/a1/f1
2249 $LFS getstripe $DIR/$tdir/a1/f1
2250 $LFS path2fid $DIR/$tdir/a1/f2
2251 $LFS getstripe $DIR/$tdir/a1/f2
2252 cancel_lru_locks osc
2254 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2255 echo "to reference the same OST-object (which is f1's OST-obejct)."
2256 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2257 echo "dangling reference case, but f2's old OST-object is there."
2260 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2261 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
# Use the POSIX owner:group separator; the legacy "1.1" dot form is a
# deprecated extension that newer chown implementations warn about.
2262 chown 1:1 $DIR/$tdir/a1/f2
2263 rm -f $DIR/$tdir/a1/f1
2266 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2268 echo "stopall to cleanup object cache"
2271 setupall > /dev/null
2273 echo "The file size should be incorrect since dangling referenced"
2274 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2275 [ "$cur_size" != "$saved_size" ] ||
2276 error "(1) Expect incorrect file2 size"
# NOTE(review): judging by its name this fail_loc delays LFSCK so the test
# can write to f2 during phase 2 - confirm the meaning of fail_val=10.
2278 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2279 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2281 start_full_debug_logging
2283 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2284 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2286 wait_update_facet mds1 "$LCTL get_param -n \
2287 mdd.$(facet_svc mds1).lfsck_layout |
2288 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2289 error "(3) MDS1 is not the expected 'scanning-phase2'"
2291 # to guarantee all updates are synced.
2295 echo "Write new data to f2 to modify the new created OST-object."
2296 echo "dummy" >> $DIR/$tdir/a1/f2
# Release the injected delay so LFSCK can run to completion.
2298 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2300 for k in $(seq $MDSCOUNT); do
2301 # The LFSCK status query interval is 30 seconds. For the case
2302 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2303 # time to guarantee the status sync up.
2304 wait_update_facet mds${k} "$LCTL get_param -n \
2305 mdd.$(facet_svc mds${k}).lfsck_layout |
2306 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2307 error "(4) MDS${k} is not the expected 'completed'"
# Each OST is checked once (no waiting) for the 'completed' status.
2310 for k in $(seq $OSTCOUNT); do
2311 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2312 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2313 awk '/^status/ { print $2 }')
2314 [ "$cur_status" == "completed" ] ||
2315 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2318 stop_full_debug_logging
2320 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2321 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2322 awk '/^repaired_orphan/ { print $2 }')
2323 [ $repaired -eq 1 ] ||
2324 error "(6) Expect 1 orphan has been fixed, but got: $repaired"
2326 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2327 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2328 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The -name pattern is quoted so the local shell cannot glob-expand it
# against the current directory before find receives it.
2330 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*")
2331 [ ! -z "$cname" ] ||
2332 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2334 echo "The stub file should keep the original f2 data"
2335 cur_size=$(ls -il $cname | awk '{ print $6 }')
2336 [ "$cur_size" == "$saved_size" ] ||
2337 error "(9) Expect file2 size $saved_size, but got $cur_size"
2340 $LFS path2fid $cname
2341 $LFS getstripe $cname
2343 echo "The f2 should contains new data."
2344 cat $DIR/$tdir/a1/f2
2345 $LFS path2fid $DIR/$tdir/a1/f2
2346 $LFS getstripe $DIR/$tdir/a1/f2
2348 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f: layout LFSCK must skip orphan handling for an OST that failed
# during first-stage scanning, without affecting the other OSTs.
# Round 1 runs with fail_loc 0x161c set on mds1 and expects status 'partial'
# on every MDS and on ost1, 'completed' on ost2, and repaired_orphan == 1
# per MDS. Round 2 runs with the injection cleared and expects 'completed'
# everywhere and repaired_orphan == 2 per MDS.
# Requires at least 2 OSTs; otherwise the test is skipped.
2351 [ $OSTCOUNT -lt 2 ] &&
2352 skip "The test needs at least 2 OSTs" && return
2355 echo "The target MDT-object is lost. The LFSCK should re-create the"
2356 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2357 echo "to verify some OST-object(s) during the first stage-scanning,"
2358 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2359 echo "should not be affected."
2362 check_mount_and_prep
# a1 holds single-stripe files, a2 holds a two-stripe file; both on MDT 0.
2363 $LFS mkdir -i 0 $DIR/$tdir/a1
2364 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2365 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2366 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2367 $LFS mkdir -i 0 $DIR/$tdir/a2
2368 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2369 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2370 $LFS getstripe $DIR/$tdir/a1/f1
2371 $LFS getstripe $DIR/$tdir/a2/f2
# With two or more MDTs, build the same layout (a3, a4) on MDT index 1.
2373 if [ $MDSCOUNT -ge 2 ]; then
2374 $LFS mkdir -i 1 $DIR/$tdir/a3
2375 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a3
2376 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2377 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2378 $LFS mkdir -i 1 $DIR/$tdir/a4
2379 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2380 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2381 $LFS getstripe $DIR/$tdir/a3/f3
2382 $LFS getstripe $DIR/$tdir/a4/f4
2385 cancel_lru_locks osc
2387 echo "Inject failure, to simulate the case of missing the MDT-object"
2388 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2389 do_facet mds1 $LCTL set_param fail_loc=0x1616
2390 rm -f $DIR/$tdir/a1/f1
2391 rm -f $DIR/$tdir/a2/f2
2393 if [ $MDSCOUNT -ge 2 ]; then
2394 do_facet mds2 $LCTL set_param fail_loc=0x1616
2395 rm -f $DIR/$tdir/a3/f3
2396 rm -f $DIR/$tdir/a4/f4
# Clear the lost-MDT-object injection before arming the next one.
2402 do_facet mds1 $LCTL set_param fail_loc=0
2403 if [ $MDSCOUNT -ge 2 ]; then
2404 do_facet mds2 $LCTL set_param fail_loc=0
2407 cancel_lru_locks mdc
2408 cancel_lru_locks osc
2410 echo "Inject failure, to simulate the OST0 fail to handle"
2411 echo "MDT0 LFSCK request during the first-stage scanning."
2412 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2413 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2415 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2416 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2418 for k in $(seq $MDSCOUNT); do
2419 # The LFSCK status query interval is 30 seconds. For the case
2420 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2421 # time to guarantee the status sync up.
2422 wait_update_facet mds${k} "$LCTL get_param -n \
2423 mdd.$(facet_svc mds${k}).lfsck_layout |
2424 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2425 error "(2) MDS${k} is not the expected 'partial'"
# ost1 is the OST expected to end up 'partial'; ost2 must still complete.
2428 wait_update_facet ost1 "$LCTL get_param -n \
2429 obdfilter.$(facet_svc ost1).lfsck_layout |
2430 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2431 error "(3) OST1 is not the expected 'partial'"
2434 wait_update_facet ost2 "$LCTL get_param -n \
2435 obdfilter.$(facet_svc ost2).lfsck_layout |
2436 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2437 error "(4) OST2 is not the expected 'completed'"
2440 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# Round 1: each MDS is expected to have repaired exactly one orphan.
2442 local repaired=$(do_facet mds1 $LCTL get_param -n \
2443 mdd.$(facet_svc mds1).lfsck_layout |
2444 awk '/^repaired_orphan/ { print $2 }')
2445 [ $repaired -eq 1 ] ||
2446 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2448 if [ $MDSCOUNT -ge 2 ]; then
2449 repaired=$(do_facet mds2 $LCTL get_param -n \
2450 mdd.$(facet_svc mds2).lfsck_layout |
2451 awk '/^repaired_orphan/ { print $2 }')
2452 [ $repaired -eq 1 ] ||
2453 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
2456 echo "Trigger layout LFSCK on all devices again to cleanup"
2457 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2459 for k in $(seq $MDSCOUNT); do
2460 # The LFSCK status query interval is 30 seconds. For the case
2461 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2462 # time to guarantee the status sync up.
2463 wait_update_facet mds${k} "$LCTL get_param -n \
2464 mdd.$(facet_svc mds${k}).lfsck_layout |
2465 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2466 error "(8) MDS${k} is not the expected 'completed'"
# NOTE(review): cur_status is assigned here without 'local', unlike the
# equivalent loops in the neighbouring tests.
2469 for k in $(seq $OSTCOUNT); do
2470 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2471 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2472 awk '/^status/ { print $2 }')
2473 [ "$cur_status" == "completed" ] ||
2474 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# Round 2: each MDS is expected to report two repaired orphans.
2478 local repaired=$(do_facet mds1 $LCTL get_param -n \
2479 mdd.$(facet_svc mds1).lfsck_layout |
2480 awk '/^repaired_orphan/ { print $2 }')
2481 [ $repaired -eq 2 ] ||
2482 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2484 if [ $MDSCOUNT -ge 2 ]; then
2485 repaired=$(do_facet mds2 $LCTL get_param -n \
2486 mdd.$(facet_svc mds2).lfsck_layout |
2487 awk '/^repaired_orphan/ { print $2 }')
2488 [ $repaired -eq 2 ] ||
2489 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2492 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Test 18g: the MDT-object is lost but its OI mapping is left behind
# (fail_loc 0x162e on mds1 while unlinking).
# After layout LFSCK with "-r -o" the test expects repaired_orphan on mds1
# to equal $OSTCOUNT and the file to be available as
# .lustre/lost+found/MDT0000/<fid1>-R-0, which is then moved back to a1/f1.
2496 echo "The target MDT-object is lost, but related OI mapping is there"
2497 echo "The LFSCK should recreate the lost MDT-object without affected"
2498 echo "by the stale OI mapping."
2501 check_mount_and_prep
2502 $LFS mkdir -i 0 $DIR/$tdir/a1
# "-c -1" requests the widest stripe count; dd writes $OSTCOUNT MiB in total.
2503 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2504 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
# Remember the FID: the lost+found entry name checked later is built from it.
2505 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2507 $LFS getstripe $DIR/$tdir/a1/f1
2508 cancel_lru_locks osc
2510 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2511 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2512 do_facet mds1 $LCTL set_param fail_loc=0x162e
2513 rm -f $DIR/$tdir/a1/f1
2515 do_facet mds1 $LCTL set_param fail_loc=0
2516 cancel_lru_locks mdc
2517 cancel_lru_locks osc
2519 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2520 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2522 for k in $(seq $MDSCOUNT); do
2523 # The LFSCK status query interval is 30 seconds. For the case
2524 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2525 # time to guarantee the status sync up.
2526 wait_update_facet mds${k} "$LCTL get_param -n \
2527 mdd.$(facet_svc mds${k}).lfsck_layout |
2528 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2529 error "(2) MDS${k} is not the expected 'completed'"
# Each OST is checked once (no waiting) for the 'completed' status.
2532 for k in $(seq $OSTCOUNT); do
2533 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2534 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2535 awk '/^status/ { print $2 }')
2536 [ "$cur_status" == "completed" ] ||
2537 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2540 local repaired=$(do_facet mds1 $LCTL get_param -n \
2541 mdd.$(facet_svc mds1).lfsck_layout |
2542 awk '/^repaired_orphan/ { print $2 }')
2543 [ $repaired -eq $OSTCOUNT ] ||
2544 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
2546 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2547 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2548 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2550 $LFS path2fid $DIR/$tdir/a1/f1
2551 $LFS getstripe $DIR/$tdir/a1/f1
2553 run_test 18g "Find out orphan OST-object and repair it (7)"
# Top-level statement: adjust the local debug setting before the 19x tests.
# NOTE(review): presumably "-cache" removes the 'cache' flag from the debug
# mask - confirm against the lctl documentation.
2555 $LCTL set_param debug=-cache > /dev/null
# Test 19a: with lfsck_verify_pfid enabled on OST0000 and fail_loc 0x1619
# set on the client, reading a0 must fail; a successful read is the error.
2558 check_mount_and_prep
2559 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2561 echo "foo" > $DIR/$tdir/a0
2562 echo "guard" > $DIR/$tdir/a1
2563 cancel_lru_locks osc
2565 echo "Inject failure, then client will offer wrong parent FID when read"
2566 do_facet ost1 $LCTL set_param -n \
2567 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2568 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2569 $LCTL set_param fail_loc=0x1619
2571 echo "Read RPC with wrong parent FID should be denied"
# "cat && error": the test fails only if the read unexpectedly succeeds.
2572 cat $DIR/$tdir/a0 && error "(3) Read should be denied!"
2573 $LCTL set_param fail_loc=0
2575 run_test 19a "OST-object inconsistency self detect"
# Test 19b: f0 is written while fail_loc 0x1611 is set on ost1. The
# 'repaired' counter read from lfsck_verify_pfid must be 0 at first, and
# must become 1 after lfsck_verify_pfid is set to 1 and f0 is read.
2578 check_mount_and_prep
2579 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2581 echo "Inject failure stub to make the OST-object to back point to"
2582 echo "non-exist MDT-object"
2584 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2585 do_facet ost1 $LCTL set_param fail_loc=0x1611
2586 echo "foo" > $DIR/$tdir/f0
2587 cancel_lru_locks osc
2588 do_facet ost1 $LCTL set_param fail_loc=0
2590 echo "Nothing should be fixed since self detect and repair is disabled"
# Baseline: the 'repaired' counter before verification is switched on.
2591 local repaired=$(do_facet ost1 $LCTL get_param -n \
2592 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2593 awk '/^repaired/ { print $2 }')
2594 [ $repaired -eq 0 ] ||
2595 error "(1) Expected 0 repaired, but got $repaired"
2597 echo "Read RPC with right parent FID should be accepted,"
2598 echo "and cause parent FID on OST to be fixed"
2600 do_facet ost1 $LCTL set_param -n \
2601 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2602 cat $DIR/$tdir/f0 || error "(2) Read should not be denied!"
# The read above is expected to have bumped the counter to exactly 1.
2604 repaired=$(do_facet ost1 $LCTL get_param -n \
2605 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2606 awk '/^repaired/ { print $2 }')
2607 [ $repaired -eq 1 ] ||
2608 error "(3) Expected 1 repaired, but got $repaired"
2610 run_test 19b "OST-object inconsistency self repair"
# Test 20: files lose their MDT-object and, for f1/f2/f3, one OST-object
# each. After layout LFSCK the files are expected under
# .lustre/lost+found/MDT0000/<fid>-R-0, some of them with the LOV "hole"
# pattern flag set. The test then exercises getstripe, stat, read, write,
# chown, touch and unlink on each recovered file.
# Requires at least 2 OSTs; f3 is only used when there are more than 2.
# NOTE(review): $bcount is assigned on lines not visible here - confirm
# its value per OST count against the comments before the dd calls.
2613 [ $OSTCOUNT -lt 2 ] &&
2614 skip "The test needs at least 2 OSTs" && return
2617 echo "The target MDT-object and some of its OST-object are lost."
2618 echo "The LFSCK should find out the left OST-objects and re-create"
2619 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2620 echo "with the partial OST-objects (LOV EA hole)."
2622 echo "New client can access the file with LOV EA hole via normal"
2623 echo "system tools or commands without crash the system."
2625 echo "For old client, even though it cannot access the file with"
2626 echo "LOV EA hole, it should not cause the system crash."
2629 check_mount_and_prep
2630 $LFS mkdir -i 0 $DIR/$tdir/a1
# Stripe count is 3 when more than 2 OSTs are available, otherwise 2.
2631 if [ $OSTCOUNT -gt 2 ]; then
2632 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
2635 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
2639 # 256 blocks on the stripe0.
2640 # 1 block on the stripe1 for 2 OSTs case.
2641 # 256 blocks on the stripe1 for other cases.
2642 # 1 block on the stripe2 if OSTs > 2
2643 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2644 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2645 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# Remember the FIDs: the lost+found entry names checked later use them.
2647 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2648 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2649 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2652 $LFS getstripe $DIR/$tdir/a1/f0
2654 $LFS getstripe $DIR/$tdir/a1/f1
2656 $LFS getstripe $DIR/$tdir/a1/f2
2658 if [ $OSTCOUNT -gt 2 ]; then
2659 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
2660 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2662 $LFS getstripe $DIR/$tdir/a1/f3
2665 cancel_lru_locks osc
2667 echo "Inject failure..."
2668 echo "To simulate f0 lost MDT-object"
2669 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2670 do_facet mds1 $LCTL set_param fail_loc=0x1616
2671 rm -f $DIR/$tdir/a1/f0
2673 echo "To simulate f1 lost MDT-object and OST-object0"
2674 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
2675 do_facet mds1 $LCTL set_param fail_loc=0x161a
2676 rm -f $DIR/$tdir/a1/f1
2678 echo "To simulate f2 lost MDT-object and OST-object1"
# fail_loc stays at 0x161a; only fail_val changes between f1, f2 and f3.
2679 do_facet mds1 $LCTL set_param fail_val=1
2680 rm -f $DIR/$tdir/a1/f2
2682 if [ $OSTCOUNT -gt 2 ]; then
2683 echo "To simulate f3 lost MDT-object and OST-object2"
2684 do_facet mds1 $LCTL set_param fail_val=2
2685 rm -f $DIR/$tdir/a1/f3
2688 umount_client $MOUNT
2691 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2693 echo "Inject failure to slow down the LFSCK on OST0"
2694 #define OBD_FAIL_LFSCK_DELAY5 0x161b
2695 do_facet ost1 $LCTL set_param fail_loc=0x161b
2697 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2698 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2701 do_facet ost1 $LCTL set_param fail_loc=0
2703 for k in $(seq $MDSCOUNT); do
2704 # The LFSCK status query interval is 30 seconds. For the case
2705 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2706 # time to guarantee the status sync up.
2707 wait_update_facet mds${k} "$LCTL get_param -n \
2708 mdd.$(facet_svc mds${k}).lfsck_layout |
2709 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2710 error "(2) MDS${k} is not the expected 'completed'"
# Each OST is checked once (no waiting) for the 'completed' status.
2713 for k in $(seq $OSTCOUNT); do
2714 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2715 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2716 awk '/^status/ { print $2 }')
2717 [ "$cur_status" == "completed" ] ||
2718 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan on mds1: 9 with more than 2 OSTs, otherwise 4.
2721 local repaired=$(do_facet mds1 $LCTL get_param -n \
2722 mdd.$(facet_svc mds1).lfsck_layout |
2723 awk '/^repaired_orphan/ { print $2 }')
2724 if [ $OSTCOUNT -gt 2 ]; then
2725 [ $repaired -eq 9 ] ||
2726 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
2728 [ $repaired -eq 4 ] ||
2729 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
2732 mount_client $MOUNT || error "(5.0) Fail to start client!"
# Bit mask tested against the value printed by "lfs getstripe -L".
2734 LOV_PATTERN_F_HOLE=0x40000000
2737 # ${fid0}-R-0 is the old f0
2739 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
2740 echo "Check $name, which is the old f0"
2742 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
2744 local pattern=0x$($LFS getstripe -L $name)
2745 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2746 error "(5.2) NOT expect pattern flag hole, but got $pattern"
2748 local stripes=$($LFS getstripe -c $name)
2749 if [ $OSTCOUNT -gt 2 ]; then
2750 [ $stripes -eq 3 ] ||
2751 error "(5.3.1) expect the stripe count is 3, but got $stripes"
2753 [ $stripes -eq 2 ] ||
2754 error "(5.3.2) expect the stripe count is 2, but got $stripes"
2757 local size=$(stat $name | awk '/Size:/ { print $2 }')
2758 [ $size -eq $((4096 * $bcount)) ] ||
2759 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
2761 cat $name > /dev/null || error "(5.5) cannot read $name"
2763 echo "dummy" >> $name || error "(5.6) cannot write $name"
2765 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
2767 touch $name || error "(5.8) cannot touch $name"
2769 rm -f $name || error "(5.9) cannot unlink $name"
2772 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
2774 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2775 if [ $OSTCOUNT -gt 2 ]; then
2776 echo "Check $name, it contains the old f1's stripe1 and stripe2"
2778 echo "Check $name, it contains the old f1's stripe1"
2781 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
2783 pattern=0x$($LFS getstripe -L $name)
2784 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2785 error "(6.2) expect pattern flag hole, but got $pattern"
2787 stripes=$($LFS getstripe -c $name)
2788 if [ $OSTCOUNT -gt 2 ]; then
2789 [ $stripes -eq 3 ] ||
2790 error "(6.3.1) expect the stripe count is 3, but got $stripes"
2792 [ $stripes -eq 2 ] ||
2793 error "(6.3.2) expect the stripe count is 2, but got $stripes"
2796 size=$(stat $name | awk '/Size:/ { print $2 }')
2797 [ $size -eq $((4096 * $bcount)) ] ||
2798 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
2800 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# conv=sync,noerror makes dd continue past read errors; the number of
# "Input/output error" lines in its output is the failure count.
2802 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2803 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2806 [ $failures -eq 256 ] ||
2807 error "(6.6) expect 256 IO failures, but get $failures"
2809 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2810 [ $size -eq $((4096 * $bcount)) ] ||
2811 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Offset 0 is expected to fall into the hole; seek=300 blocks does not.
2813 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
2814 error "(6.8) write to the LOV EA hole should fail"
2816 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
2817 error "(6.9) write to normal stripe should NOT fail"
2819 echo "foo" >> $name && error "(6.10) append write $name should fail"
2821 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
2823 touch $name || error "(6.12) cannot touch $name"
2825 rm -f $name || error "(6.13) cannot unlink $name"
2828 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
2830 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
2831 if [ $OSTCOUNT -gt 2 ]; then
2832 echo "Check $name, it contains the old f2's stripe0 and stripe2"
2834 echo "Check $name, it contains the old f2's stripe0"
2837 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
2839 pattern=0x$($LFS getstripe -L $name)
2840 stripes=$($LFS getstripe -c $name)
2841 size=$(stat $name | awk '/Size:/ { print $2 }')
# More than 2 OSTs: hole flag expected (7.x.1 checks). The 7.x.2 checks
# further down expect no hole and a single stripe.
2842 if [ $OSTCOUNT -gt 2 ]; then
2843 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2844 error "(7.2.1) expect pattern flag hole, but got $pattern"
2846 [ $stripes -eq 3 ] ||
2847 error "(7.3.1) expect the stripe count is 3, but got $stripes"
2849 [ $size -eq $((4096 * $bcount)) ] ||
2850 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
2852 cat $name > /dev/null &&
2853 error "(7.5.1) normal read $name should fail"
2855 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2856 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2858 [ $failures -eq 256 ] ||
2859 error "(7.6) expect 256 IO failures, but get $failures"
2861 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2862 [ $size -eq $((4096 * $bcount)) ] ||
2863 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
2865 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
2866 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
2868 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
2869 error "(7.8.1) write to normal stripe should NOT fail"
2871 echo "foo" >> $name &&
2872 error "(7.8.3) append write $name should fail"
2874 chown $RUNAS_ID:$RUNAS_GID $name ||
2875 error "(7.9.1) cannot chown on $name"
2877 touch $name || error "(7.10.1) cannot touch $name"
2879 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2880 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
2882 [ $stripes -eq 1 ] ||
2883 error "(7.3.2) expect the stripe count is 1, but got $stripes"
2886 [ $size -eq $((4096 * (256 + 0))) ] ||
2887 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
2889 cat $name > /dev/null || error "(7.5.2) cannot read $name"
2891 echo "dummy" >> $name || error "(7.8.2) cannot write $name"
2893 chown $RUNAS_ID:$RUNAS_GID $name ||
2894 error "(7.9.2) cannot chown on $name"
2896 touch $name || error "(7.10.2) cannot touch $name"
2899 rm -f $name || error "(7.11) cannot unlink $name"
# f3 only exists when there are more than 2 OSTs; otherwise stop here.
2901 [ $OSTCOUNT -le 2 ] && return
2904 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
2906 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2907 echo "Check $name, which contains the old f3's stripe0 and stripe1"
2909 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
2911 pattern=0x$($LFS getstripe -L $name)
2912 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2913 error "(8.2) NOT expect pattern flag hole, but got $pattern"
2915 stripes=$($LFS getstripe -c $name)
2916 # LFSCK does not know the old f3 had 3 stripes.
2917 # It only tries to find as much as possible.
2918 # The stripe count depends on the last stripe's offset.
2919 [ $stripes -eq 2 ] ||
2920 error "(8.3) expect the stripe count is 2, but got $stripes"
2922 size=$(stat $name | awk '/Size:/ { print $2 }')
2924 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
2925 error "(8.4) expect the size $((4096 * 512)), but got $size"
2927 cat $name > /dev/null || error "(8.5) cannot read $name"
2929 echo "dummy" >> $name || error "(8.6) cannot write $name"
2931 chown $RUNAS_ID:$RUNAS_GID $name ||
2932 error "(8.7) cannot chown on $name"
2934 touch $name || error "(8.8) cannot touch $name"
2936 rm -f $name || error "(8.9) cannot unlink $name"
2938 run_test 20 "Handle the orphan with dummy LOV EA slot properly"
# Test 21: start LFSCK on MDT0000 without a "-t" type option and verify
# that both the namespace and the layout component report
# 'scanning-phase1'; then stop LFSCK, again without naming a component.
# Skipped when the MDS version is older than 2.5.59.
2941 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
2942 skip "ignore the test if MDS is older than 2.5.59" && return
2944 check_mount_and_prep
2945 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
2947 echo "Start all LFSCK components by default (-s 1)"
# NOTE(review): "-s 1" presumably throttles the scan so the status can be
# sampled while still in phase 1 - confirm the option's meaning.
2948 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
2949 error "Fail to start LFSCK"
2951 echo "namespace LFSCK should be in 'scanning-phase1' status"
2952 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
2953 [ "$STATUS" == "scanning-phase1" ] ||
2954 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
2956 echo "layout LFSCK should be in 'scanning-phase1' status"
2957 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
2958 [ "$STATUS" == "scanning-phase1" ] ||
2959 error "Expect layout 'scanning-phase1', but got '$STATUS'"
2961 echo "Stop all LFSCK components by default"
2962 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
2963 error "Fail to stop LFSCK"
2965 run_test 21 "run all LFSCK components by default"
# Test 22a: namespace LFSCK repairs a directory whose ".." entry points to
# a parent that no longer exists.
# Flow: create guard and foo on MDT1, create foo/dummy on MDT0 while
# fail_loc 0x161e is set, remove guard, run namespace LFSCK with "-A -r",
# then expect unmatched_pairs_repaired == 1 and a working 'ls' on foo/dummy.
# Requires at least 2 MDSes; otherwise the test is skipped.
2968 [ $MDSCOUNT -lt 2 ] &&
2969 skip "We need at least 2 MDSes for this test" && return
2972 echo "The parent_A references the child directory via some name entry,"
2973 echo "but the child directory back references another parent_B via its"
# The inner quotes are escaped so the banner really prints ".." in quotes;
# unescaped, the shell consumed them and printed a bare .. instead.
2974 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
2975 echo "LFSCK will repair the child directory's \"..\" name entry."
2978 check_mount_and_prep
2980 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2981 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2983 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2984 echo "The dummy's dotdot name entry references the guard."
2985 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2986 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2987 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2988 error "(3) Fail to mkdir on MDT0"
2989 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing guard leaves dummy's ".." entry pointing at a missing parent.
2991 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
2993 echo "Trigger namespace LFSCK to repair unmatched pairs"
2994 $START_NAMESPACE -A -r ||
2995 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 is presumably the step number reported on
# failure - confirm against wait_all_targets_blocked's definition.
2997 wait_all_targets_blocked namespace completed 6
2999 local repaired=$($SHOW_NAMESPACE |
3000 awk '/^unmatched_pairs_repaired/ { print $2 }')
3001 [ $repaired -eq 1 ] ||
3002 error "(7) Fail to repair unmatched pairs: $repaired"
3004 echo "'ls' should success after namespace LFSCK repairing"
3005 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3006 error "(8) ls should success."
3008 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Test 22b (needs at least 2 MDSes): like 22a, a directory is created on
# MDT0 under fail_loc 0x161e (OBD_FAIL_LFSCK_BAD_PARENT), but "guard" is
# kept. fid2path on the directory's FID must not resolve to its real path
# before LFSCK and must resolve to it after namespace LFSCK reports
# unmatched_pairs_repaired == 1.
# NOTE(review): `return` and `local` below imply a function body, but no
# function header or closing brace is visible in this listing.
3011 [ $MDSCOUNT -lt 2 ] &&
3012 skip "We need at least 2 MDSes for this test" && return
# The quotes around .. are escaped so that they are actually printed; with
# the unescaped form "".." the shell consumes them and only .. is shown.
3015 echo "The parent_A references the child directory via the name entry_B,"
3016 echo "but the child directory back references another parent_C via its"
3017 echo "\"..\" name entry. The parent_C exists, but there is no the name"
3018 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
3019 echo "the child directory's \"..\" name entry and its linkEA."
3022 check_mount_and_prep
3024 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3025 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3027 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3028 echo "and bad linkEA. The dummy's dotdot name entry references the"
3029 echo "guard. The dummy's linkEA references n non-exist name entry."
# The fail_loc is active only around the mkdir of "dummy".
3030 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3031 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3032 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3033 error "(3) Fail to mkdir on MDT0"
3034 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Remember the FID now so the same object can be looked up after the repair.
3036 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3037 echo "fid2path should NOT work on the dummy's FID $dummyfid"
3038 local dummyname=$($LFS fid2path $DIR $dummyfid)
3039 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3040 error "(4) fid2path works unexpectedly."
3042 echo "Trigger namespace LFSCK to repair unmatched pairs"
3043 $START_NAMESPACE -A -r ||
3044 error "(5) Fail to start LFSCK for namespace"
# Helper defined elsewhere; presumably waits until namespace LFSCK is
# "completed" on all targets, using 6 as the failure tag -- TODO confirm.
3046 wait_all_targets_blocked namespace completed 6
3048 local repaired=$($SHOW_NAMESPACE |
3049 awk '/^unmatched_pairs_repaired/ { print $2 }')
3050 [ $repaired -eq 1 ] ||
3051 error "(7) Fail to repair unmatched pairs: $repaired"
# dummyname is already declared local above, so it is only reassigned here.
3053 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
3054 dummyname=$($LFS fid2path $DIR $dummyfid)
3055 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3056 error "(8) fid2path does not work"
3058 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a (needs at least 2 MDSes): d0/d1 is removed while fail_loc 0x1620
# (OBD_FAIL_LFSCK_DANGLING2) is set on mds2, leaving a dangling name entry.
# A first namespace LFSCK run must count dangling_repaired == 1 while `ls`
# still fails; a second run with -C must count 1 again and make `ls` work.
# NOTE(review): `return` and `local` below imply a function body, but no
# function header or closing brace is visible in this listing.
3061 [ $MDSCOUNT -lt 2 ] &&
3062 skip "We need at least 2 MDSes for this test" && return
3065 echo "The name entry is there, but the MDT-object for such name "
3066 echo "entry does not exist. The namespace LFSCK should find out "
3067 echo "and repair the inconsistency as required."
3070 check_mount_and_prep
# Parent on MDT index 0, child on MDT index 1.
3072 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3073 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3075 echo "Inject failure stub on MDT1 to simulate dangling name entry"
# The fail_loc is active only around the rmdir and is cleared right after.
3076 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
3077 do_facet mds2 $LCTL set_param fail_loc=0x1620
3078 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3079 do_facet mds2 $LCTL set_param fail_loc=0
3081 echo "'ls' should fail because of dangling name entry"
3082 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3084 echo "Trigger namespace LFSCK to find out dangling name entry"
3085 $START_NAMESPACE -A -r ||
3086 error "(5) Fail to start LFSCK for namespace"
# Helper defined elsewhere; presumably waits until namespace LFSCK is
# "completed" on all targets, using 6 as the failure tag -- TODO confirm.
3088 wait_all_targets_blocked namespace completed 6
3090 local repaired=$($SHOW_NAMESPACE |
3091 awk '/^dangling_repaired/ { print $2 }')
3092 [ $repaired -eq 1 ] ||
3093 error "(7) Fail to repair dangling name entry: $repaired"
3095 echo "'ls' should fail because not re-create MDT-object by default"
3096 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
# Second pass adds -C; given the banner above, -C presumably asks LFSCK to
# re-create the missing MDT-object -- TODO confirm against lctl lfsck_start.
3098 echo "Trigger namespace LFSCK again to repair dangling name entry"
3099 $START_NAMESPACE -A -r -C ||
3100 error "(9) Fail to start LFSCK for namespace"
3102 wait_all_targets_blocked namespace completed 10
3104 repaired=$($SHOW_NAMESPACE |
3105 awk '/^dangling_repaired/ { print $2 }')
3106 [ $repaired -eq 1 ] ||
3107 error "(11) Fail to repair dangling name entry: $repaired"
3109 echo "'ls' should success after namespace LFSCK repairing"
3110 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3112 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b: the hard link d0/foo is created under fail_loc 0x1621
# (OBD_FAIL_LFSCK_DANGLING3) with fail_val set to f1's OID, and f1 is then
# removed, so that `ls` on foo fails. Namespace LFSCK (-r -C) must finish
# "completed" with dangling_repaired == 1 and multiple_linked_repaired == 1,
# and reading foo must return f0's content ("dummy").
# NOTE(review): `local` below implies a function body, but no function
# header or closing brace is visible in this listing.
3116 echo "The objectA has multiple hard links, one of them corresponding"
3117 echo "to the name entry_B. But there is something wrong for the name"
3118 echo "entry_B and cause entry_B to references non-exist object_C."
3119 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3120 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3121 echo "comes to the second-stage scanning, it will find that the"
3122 echo "former re-creating object_C is not proper, and will try to"
3123 echo "replace the object_C with the real object_A."
3126 check_mount_and_prep
# Run a preliminary LFSCK only when lost+found/MDT0000 does not exist yet.
# NOTE(review): no closing brace for this `|| {` group is visible in this
# listing.
3128 [[ -d $MOUNT/.lustre/lost+found/MDT0000 ]] || {
3129 # Trigger LFSCK firstly, that will generate the
3130 # .lustre/lost+found/MDTxxxx in advance to avoid
3131 # reusing the local object for the dangling name
3133 $START_NAMESPACE -r ||
3134 error "(0) Fail to start LFSCK for namespace"
3136 wait_all_targets_blocked namespace completed 0.1
# Create d0 with two files; each FID is printed for the test log.
3139 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3140 $LFS path2fid $DIR/$tdir/d0
3142 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3143 $LFS path2fid $DIR/$tdir/d0/f0
3145 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3146 $LFS path2fid $DIR/$tdir/d0/f1
# Take the second ':'-separated field of f1's FID and normalise it to
# decimal with printf %d.
3148 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3149 OID=$(printf %d $OID)
# NOTE(review): no `fi` for this `if` is visible in this listing.
3151 if [ $OID -eq 1 ]; then
3152 # To guarantee that the f0 and f1 are in the same FID seq
3153 rm -f $DIR/$tdir/d0/f0 ||
3154 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3155 echo "dummy" > $DIR/$tdir/d0/f0 ||
3156 error "(3.2) Fail to touch on MDT0"
3157 $LFS path2fid $DIR/$tdir/d0/f0
3160 echo "Inject failure stub on MDT0 to simulate dangling name entry"
# fail_val carries f1's OID while the hard link is created; both settings
# are reset right after the ln.
3161 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3162 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3163 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3164 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3166 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3168 echo "'ls' should fail because of dangling name entry"
3169 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3170 error "(6) ls should fail."
3172 echo "Trigger namespace LFSCK to find out dangling name entry"
3173 $START_NAMESPACE -r -C ||
3174 error "(7) Fail to start LFSCK for namespace"
# Poll the "status" field of mdd.<MDT_DEV>.lfsck_namespace on the MDS until
# it reads "completed"; 32 is presumably the wait limit in seconds -- TODO
# confirm against wait_update_facet. The \\\$2 escaping keeps $2 unexpanded
# by this shell so that awk receives it.
# NOTE(review): no closing brace for the `|| {` group is visible here.
3176 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3177 mdd.${MDT_DEV}.lfsck_namespace |
3178 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3180 error "(8) unexpected status"
3183 local repaired=$($SHOW_NAMESPACE |
3184 awk '/^dangling_repaired/ { print $2 }')
3185 [ $repaired -eq 1 ] ||
3186 error "(9) Fail to repair dangling name entry: $repaired"
3188 repaired=$($SHOW_NAMESPACE |
3189 awk '/^multiple_linked_repaired/ { print $2 }')
3190 [ $repaired -eq 1 ] ||
3191 error "(10) Fail to drop the former created object: $repaired"
# foo must now read back the content that was written to f0.
3193 local data=$(cat $DIR/$tdir/d0/foo)
3194 [ "$data" == "dummy" ] ||
3195 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3197 run_test 23b "LFSCK can repair dangling name entry (2)"
# Test 23c: same setup as the "(2)" variant of this scenario (hard link
# d0/foo created under fail_loc 0x1621 with fail_val = f1's OID, then f1
# removed), but LFSCK is slowed with fail_loc 0x1602 (OBD_FAIL_LFSCK_DELAY3)
# and the test appends data to foo once it appears with size 0. After LFSCK
# completes, dangling_repaired must be 1 and foo's content must NOT be
# "dummy".
# NOTE(review): `local` below implies a function body, but no function
# header or closing brace is visible in this listing.
3201 echo "The objectA has multiple hard links, one of them corresponding"
3202 echo "to the name entry_B. But there is something wrong for the name"
3203 echo "entry_B and cause entry_B to references non-exist object_C."
3204 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3205 echo "as dangling, and re-create the lost object_C. And then others"
3206 echo "modified the re-created object_C. When the LFSCK comes to the"
3207 echo "second-stage scanning, it will find that the former re-creating"
3208 echo "object_C maybe wrong and try to replace the object_C with the"
3209 echo "real object_A. But because object_C has been modified, so the"
3210 echo "LFSCK cannot replace it."
3213 start_full_debug_logging
3215 check_mount_and_prep
# Run a preliminary LFSCK only when lost+found/MDT0000 does not exist yet.
# NOTE(review): no closing brace for this `|| {` group is visible in this
# listing.
3217 [[ -d $MOUNT/.lustre/lost+found/MDT0000 ]] || {
3218 # Trigger LFSCK firstly, that will generate the
3219 # .lustre/lost+found/MDTxxxx in advance to avoid
3220 # reusing the local object for the dangling name
3222 $START_NAMESPACE -r ||
3223 error "(0) Fail to start LFSCK for namespace"
3225 wait_all_targets_blocked namespace completed 0.1
# Create d0 with two files; each FID is printed for the test log.
3228 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3229 $LFS path2fid $DIR/$tdir/d0
3231 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3232 $LFS path2fid $DIR/$tdir/d0/f0
3234 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3235 $LFS path2fid $DIR/$tdir/d0/f1
# Take the second ':'-separated field of f1's FID and normalise it to
# decimal with printf %d.
3237 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3238 OID=$(printf %d $OID)
# NOTE(review): no `fi` for this `if` is visible in this listing.
3240 if [ $OID -eq 1 ]; then
3241 # To guarantee that the f0 and f1 are in the same FID seq
3242 rm -f $DIR/$tdir/d0/f0 ||
3243 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3244 echo "dummy" > $DIR/$tdir/d0/f0 ||
3245 error "(3.2) Fail to touch on MDT0"
3246 $LFS path2fid $DIR/$tdir/d0/f0
3249 echo "Inject failure stub on MDT0 to simulate dangling name entry"
# fail_val carries f1's OID while the hard link is created; both settings
# are reset right after the ln.
3250 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3251 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3252 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3253 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3255 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3257 echo "'ls' should fail because of dangling name entry"
3258 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3259 error "(6) ls should fail."
# Delay stub is armed before LFSCK starts and cleared after the write below.
3261 #define OBD_FAIL_LFSCK_DELAY3 0x1602
3262 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3264 echo "Trigger namespace LFSCK to find out dangling name entry"
3265 $START_NAMESPACE -r -C ||
3266 error "(7) Fail to start LFSCK for namespace"
# Poll the client until stat reports size 0 for foo. On failure, stat the
# polled file itself for diagnostics: the previous "guard" path is never
# created by this test, so stat-ing it could only report "no such file".
# $LTIME is defined outside this section.
# NOTE(review): no closing brace for the `|| {` group is visible here.
3268 wait_update_facet client "stat $DIR/$tdir/d0/foo |
3269 awk '/Size/ { print \\\$2 }'" "0" $LTIME || {
3270 stat $DIR/$tdir/d0/foo
3272 error "(8) unexpected size"
# Modify the re-created object while LFSCK is still delayed.
3275 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
3276 cancel_lru_locks osc
3278 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Poll the "status" field of mdd.<MDT_DEV>.lfsck_namespace until it reads
# "completed"; 32 is presumably the wait limit in seconds -- TODO confirm.
# NOTE(review): no closing brace for the `|| {` group is visible here.
3279 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3280 mdd.${MDT_DEV}.lfsck_namespace |
3281 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3283 error "(10) unexpected status"
3286 stop_full_debug_logging
3288 local repaired=$($SHOW_NAMESPACE |
3289 awk '/^dangling_repaired/ { print $2 }')
3290 [ $repaired -eq 1 ] ||
3291 error "(11) Fail to repair dangling name entry: $repaired"
# Because foo was modified, it must not have been replaced by f0's object.
3293 local data=$(cat $DIR/$tdir/d0/foo)
3294 [ "$data" != "dummy" ] ||
3295 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
3297 run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24 (needs at least 2 MDSes): d0/dummy/foo is created and removed
# under fail_loc 0x1622 (OBD_FAIL_LFSCK_MUL_REF) so that, per the banner
# below, its object is left behind with a linkEA entry that duplicates the
# one of d0/guard/foo. Namespace LFSCK must report
# multiple_referenced_repaired == 1 and an entry named
# "<cfid>-<pfid>-D-0" must exist under .lustre/lost+found/MDT0000/.
# NOTE(review): `return` and `local` below imply a function body, but no
# function header or closing brace is visible in this listing.
3300 [ $MDSCOUNT -lt 2 ] &&
3301 skip "We need at least 2 MDSes for this test" && return
3304 echo "Two MDT-objects back reference the same name entry via their"
3305 echo "each own linkEA entry, but the name entry only references one"
3306 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
3307 echo "for the MDT-object that is not recognized. If such MDT-object"
3308 echo "has no other linkEA entry after the removing, then the LFSCK"
3309 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
3312 check_mount_and_prep
# NOTE(review): the next two error messages both carry the tag "(1)".
3314 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3316 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
3317 $LFS path2fid $DIR/$tdir/d0/guard
3319 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
3320 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid is the parent FID used later to build the expected orphan name.
# NOTE(review): as listed, both assignments follow `then` and no `else` or
# `fi` is visible, so which backend type selects which FID cannot be
# confirmed from this listing; no `local` declaration for pfid is visible
# either.
3323 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
3324 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
3326 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
3329 touch $DIR/$tdir/d0/guard/foo ||
3330 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
3332 echo "Inject failure stub on MDT0 to simulate the case that"
3333 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
3334 echo "that references $DIR/$tdir/d0/guard/foo."
3335 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
3336 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
3337 echo "there with the same linkEA entry as another MDT-object"
3338 echo "$DIR/$tdir/d0/guard/foo has"
# The fail_loc stays set across both the mkdir and the rmdir of dummy/foo;
# the child's FID is captured in between, while the name still exists.
3340 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
3341 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
3342 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
3343 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
3344 $LFS path2fid $DIR/$tdir/d0/dummy/foo
3345 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
3346 rmdir $DIR/$tdir/d0/dummy/foo ||
3347 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
3348 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3350 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
3351 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
3352 error "(6) stat successfully unexpectedly"
3354 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
3355 $START_NAMESPACE -A -r ||
3356 error "(7) Fail to start LFSCK for namespace"
# Helper defined elsewhere; presumably waits until namespace LFSCK is
# "completed" on all targets, using 8 as the failure tag -- TODO confirm.
3358 wait_all_targets_blocked namespace completed 8
3360 local repaired=$($SHOW_NAMESPACE |
3361 awk '/^multiple_referenced_repaired/ { print $2 }')
3362 [ $repaired -eq 1 ] ||
3363 error "(9) Fail to repair multiple referenced name entry: $repaired"
3365 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3366 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3367 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Expected orphan name is built from the child FID and the parent FID.
3369 local cname="$cfid-$pfid-D-0"
3370 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
3371 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
3373 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25 (ldiskfs backend only): d0/foo is created under fail_loc 0x1623
# (OBD_FAIL_LFSCK_BAD_TYPE) so that, per the banner below, its name entry
# stores a wrong file type. Namespace LFSCK must finish "completed" with
# bad_file_type_repaired == 1 and d0 must be listable.
# NOTE(review): `return` and `local` below imply a function body, but no
# function header or closing brace is visible in this listing.
3376 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3377 skip "Only support to inject failure on ldiskfs" && return
3380 echo "The file type in the name entry does not match the file type"
3381 echo "claimed by the referenced object. Then the LFSCK will update"
3382 echo "the file type in the name entry."
3385 check_mount_and_prep
3387 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3389 echo "Inject failure stub on MDT0 to simulate the case that"
3390 echo "the file type stored in the name entry is wrong."
# The fail_loc is active only around the touch and is cleared right after.
3392 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
3393 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
3394 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
3395 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3397 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
3398 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Poll the "status" field of mdd.<MDT_DEV>.lfsck_namespace until it reads
# "completed"; 32 is presumably the wait limit in seconds -- TODO confirm.
# NOTE(review): no closing brace for the `|| {` group is visible here.
3400 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3401 mdd.${MDT_DEV}.lfsck_namespace |
3402 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3404 error "(4) unexpected status"
3407 local repaired=$($SHOW_NAMESPACE |
3408 awk '/^bad_file_type_repaired/ { print $2 }')
3409 [ $repaired -eq 1 ] ||
3410 error "(5) Fail to repair bad file type in name entry: $repaired"
3412 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
3414 run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a: d0/foo (which also has the hard link d0/dummy) is unlinked under
# fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) so that, per the banner
# below, only the name entry disappears. Namespace LFSCK must report
# lost_dirent_repaired == 1, d0/foo must be listable again and must still
# have the FID recorded before the removal.
# NOTE(review): `local` below implies a function body, but no function
# header or closing brace is visible in this listing.
3418 echo "The local name entry back referenced by the MDT-object is lost."
3419 echo "The namespace LFSCK will add the missing local name entry back"
3420 echo "to the normal namespace."
3423 check_mount_and_prep
3425 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3426 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# Record the FID now so it can be compared after the repair.
3427 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3429 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3430 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3432 echo "Inject failure stub on MDT0 to simulate the case that"
3433 echo "foo's name entry will be removed, but the foo's object"
3434 echo "and its linkEA are kept in the system."
# The fail_loc is active only around the rm and is cleared right after.
3436 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3437 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3438 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3439 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3441 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3442 error "(5) 'ls' should fail"
# NOTE(review): the banner below says "remote" although this test's title
# and opening banner describe a local name entry.
3444 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3445 $START_NAMESPACE -r -A ||
3446 error "(6) Fail to start LFSCK for namespace"
# Helper defined elsewhere; presumably waits until namespace LFSCK is
# "completed" on all targets, using 7 as the failure tag -- TODO confirm.
3448 wait_all_targets_blocked namespace completed 7
3450 local repaired=$($SHOW_NAMESPACE |
3451 awk '/^lost_dirent_repaired/ { print $2 }')
3452 [ $repaired -eq 1 ] ||
3453 error "(8) Fail to repair lost dirent: $repaired"
3455 ls -ail $DIR/$tdir/d0/foo ||
3456 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must resolve to the same FID as before.
3458 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3459 [ "$foofid" == "$foofid2" ] ||
3460 error "(10) foo's FID changed: $foofid, $foofid2"
3462 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b (needs at least 2 MDSes): directory d0/foo (d0 requested on MDT
# index 1, foo on index 0) is removed under fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) so that, per the banner below, only the name
# entry disappears. Namespace LFSCK must report lost_dirent_repaired == 1,
# d0/foo must be listable again and keep its original FID.
# NOTE(review): `return` and `local` below imply a function body, but no
# function header or closing brace is visible in this listing.
3465 [ $MDSCOUNT -lt 2 ] &&
3466 skip "We need at least 2 MDSes for this test" && return
3469 echo "The remote name entry back referenced by the MDT-object is lost."
3470 echo "The namespace LFSCK will add the missing remote name entry back"
3471 echo "to the normal namespace."
3474 check_mount_and_prep
3476 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3477 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Record the FID now so it can be compared after the repair.
3478 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3480 echo "Inject failure stub on MDT0 to simulate the case that"
3481 echo "foo's name entry will be removed, but the foo's object"
3482 echo "and its linkEA are kept in the system."
# The fail_loc is active only around the rmdir and is cleared right after.
3484 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3485 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3486 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3487 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3489 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3490 error "(4) 'ls' should fail"
3492 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3493 $START_NAMESPACE -r -A ||
3494 error "(5) Fail to start LFSCK for namespace"
# Helper defined elsewhere; presumably waits until namespace LFSCK is
# "completed" on all targets, using 6 as the failure tag -- TODO confirm.
3496 wait_all_targets_blocked namespace completed 6
3498 local repaired=$($SHOW_NAMESPACE |
3499 awk '/^lost_dirent_repaired/ { print $2 }')
3500 [ $repaired -eq 1 ] ||
3501 error "(7) Fail to repair lost dirent: $repaired"
3503 ls -ail $DIR/$tdir/d0/foo ||
3504 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must resolve to the same FID as before.
3506 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3507 [ "$foofid" == "$foofid2" ] ||
3508 error "(9) foo's FID changed: $foofid, $foofid2"
3510 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a: d0/foo and its hard link d0/dummy are unlinked under fail_loc
# 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY), then d0 itself is removed. Namespace
# LFSCK must report lost_dirent_repaired == 1 and at least one entry whose
# name matches *-P-* must exist under .lustre/lost+found/MDT0000/.
# NOTE(review): `local` below implies a function body, but no function
# header or closing brace is visible in this listing.
3514 echo "The local parent referenced by the MDT-object linkEA is lost."
3515 echo "The namespace LFSCK will re-create the lost parent as orphan."
3518 check_mount_and_prep
3520 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3521 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3522 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3523 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3525 echo "Inject failure stub on MDT0 to simulate the case that"
3526 echo "foo's name entry will be removed, but the foo's object"
3527 echo "and its linkEA are kept in the system. And then remove"
3528 echo "another hard link and the parent directory."
# The fail_loc stays set across both unlinks and is cleared afterwards.
3530 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3531 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3532 rm -f $DIR/$tdir/d0/foo ||
3533 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3534 rm -f $DIR/$tdir/d0/dummy ||
3535 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
3536 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): the tags "(5)" and "(6)" are each used twice in this test.
3538 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
3539 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
3541 echo "Trigger namespace LFSCK to repair the lost parent"
3542 $START_NAMESPACE -r -A ||
3543 error "(6) Fail to start LFSCK for namespace"
# Helper defined elsewhere; presumably waits until namespace LFSCK is
# "completed" on all targets, using 7 as the failure tag -- TODO confirm.
3545 wait_all_targets_blocked namespace completed 7
3547 local repaired=$($SHOW_NAMESPACE |
3548 awk '/^lost_dirent_repaired/ { print $2 }')
3549 [ $repaired -eq 1 ] ||
3550 error "(8) Fail to repair lost dirent: $repaired"
3552 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3553 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3554 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3556 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell would expand it first if the current directory held a matching name.
# cname is declared local like the other variables of this test.
3558 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
3559 [ ! -z "$cname" ] ||
3560 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
3562 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b (needs at least 2 MDSes): directory d0/foo (d0 requested on MDT
# index 1, foo on index 0) is removed under fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY), then d0 itself is removed. Namespace LFSCK
# must report lost_dirent_repaired == 1 and at least one entry whose name
# matches *-P-* must exist under .lustre/lost+found/MDT0001/.
# NOTE(review): `return` and `local` below imply a function body, but no
# function header or closing brace is visible in this listing.
3565 [ $MDSCOUNT -lt 2 ] &&
3566 skip "We need at least 2 MDSes for this test" && return
3569 echo "The remote parent referenced by the MDT-object linkEA is lost."
3570 echo "The namespace LFSCK will re-create the lost parent as orphan."
3573 check_mount_and_prep
3575 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3576 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Print d0's FID for the test log.
3578 $LFS path2fid $DIR/$tdir/d0
3580 echo "Inject failure stub on MDT0 to simulate the case that"
3581 echo "foo's name entry will be removed, but the foo's object"
3582 echo "and its linkEA are kept in the system. And then remove"
3583 echo "the parent directory."
# The fail_loc is active only around the rmdir of foo.
3585 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3586 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3587 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3588 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3590 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
3591 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
3593 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3594 $START_NAMESPACE -r -A ||
3595 error "(6) Fail to start LFSCK for namespace"
# Helper defined elsewhere; presumably waits until namespace LFSCK is
# "completed" on all targets, using 7 as the failure tag -- TODO confirm.
3597 wait_all_targets_blocked namespace completed 7
3599 local repaired=$($SHOW_NAMESPACE |
3600 awk '/^lost_dirent_repaired/ { print $2 }')
3601 [ $repaired -eq 1 ] ||
3602 error "(8) Fail to repair lost dirent: $repaired"
3604 ls -ail $MOUNT/.lustre/lost+found/
3606 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
3607 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
3608 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
3610 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell would expand it first if the current directory held a matching name.
# cname is declared local like the other variables of this test.
3612 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
3613 [ ! -z "$cname" ] ||
3614 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
3616 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28 (needs at least 2 MDTs): d1/a1 and d2/a2 are removed under
# fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) on both MDSes. A first
# namespace LFSCK run is made with fail_loc 0x161c
# (OBD_FAIL_LFSCK_BAD_NETWORK) set on mds2 and must end "partial" on mds1
# and "completed" on mds2, with lost_dirent_repaired 0 on mds1 and 1 on
# mds2. A second, undisturbed run must then report 1 on mds1 and 0 on mds2.
# NOTE(review): `return` and `local` below imply a function body, but no
# function header or closing brace is visible in this listing.
3619 [ $MDSCOUNT -lt 2 ] &&
3620 skip "The test needs at least 2 MDTs" && return
3623 echo "The target name entry is lost. The LFSCK should insert the"
3624 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
3625 echo "the MDT (on which the orphan MDT-object resides) has ever"
3626 echo "failed to respond some name entry verification during the"
3627 echo "first stage-scanning, then the LFSCK should skip to handle"
3628 echo "orphan MDT-object on this MDT. But other MDTs should not"
# d1 is requested on MDT index 0 with children on index 1; d2 is the
# mirror image (index 1 with children on index 0).
# NOTE(review): none of these six mkdir calls is checked for failure.
3632 check_mount_and_prep
3633 $LFS mkdir -i 0 $DIR/$tdir/d1
3634 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
3635 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
3637 $LFS mkdir -i 1 $DIR/$tdir/d2
3638 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
3639 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
3641 echo "Inject failure stub on MDT0 to simulate the case that"
3642 echo "d1/a1's name entry will be removed, but the d1/a1's object"
3643 echo "and its linkEA are kept in the system. And the case that"
3644 echo "d2/a2's name entry will be removed, but the d2/a2's object"
3645 echo "and its linkEA are kept in the system."
# The fail_loc is set on both mds1 and mds2 around the two rmdir calls.
3647 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3648 do_facet mds1 $LCTL set_param fail_loc=0x1624
3649 do_facet mds2 $LCTL set_param fail_loc=0x1624
3650 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
3651 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
3652 do_facet mds1 $LCTL set_param fail_loc=0
3653 do_facet mds2 $LCTL set_param fail_loc=0
3655 cancel_lru_locks mdc
3656 cancel_lru_locks osc
3658 echo "Inject failure, to simulate the MDT0 fail to handle"
3659 echo "MDT1 LFSCK request during the first-stage scanning."
# This stub is set on mds2 only and stays active for the whole first run.
3660 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
3661 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
3663 echo "Trigger namespace LFSCK on all devices to find out orphan object"
3664 $START_NAMESPACE -r -A ||
3665 error "(3) Fail to start LFSCK for namespace"
# Poll each MDS's own lfsck_namespace "status" field; 32 is presumably the
# wait limit in seconds -- TODO confirm against wait_update_facet.
# NOTE(review): no closing brace for either `|| {` group is visible here.
3667 wait_update_facet mds1 "$LCTL get_param -n \
3668 mdd.$(facet_svc mds1).lfsck_namespace |
3669 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
3670 error "(4) mds1 is not the expected 'partial'"
3673 wait_update_facet mds2 "$LCTL get_param -n \
3674 mdd.$(facet_svc mds2).lfsck_namespace |
3675 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3676 error "(5) mds2 is not the expected 'completed'"
3679 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Read the per-MDS counters directly rather than through $SHOW_NAMESPACE.
3681 local repaired=$(do_facet mds1 $LCTL get_param -n \
3682 mdd.$(facet_svc mds1).lfsck_namespace |
3683 awk '/^lost_dirent_repaired/ { print $2 }')
3684 [ $repaired -eq 0 ] ||
3685 error "(6) Expect 0 fixed on mds1, but got: $repaired"
3687 repaired=$(do_facet mds2 $LCTL get_param -n \
3688 mdd.$(facet_svc mds2).lfsck_namespace |
3689 awk '/^lost_dirent_repaired/ { print $2 }')
3690 [ $repaired -eq 1 ] ||
3691 error "(7) Expect 1 fixed on mds2, but got: $repaired"
# Second run without any failure stub; the expected counters are swapped.
3693 echo "Trigger namespace LFSCK on all devices again to cleanup"
3694 $START_NAMESPACE -r -A ||
3695 error "(8) Fail to start LFSCK for namespace"
3697 wait_all_targets_blocked namespace completed 9
# NOTE(review): `repaired` is declared local a second time here.
3699 local repaired=$(do_facet mds1 $LCTL get_param -n \
3700 mdd.$(facet_svc mds1).lfsck_namespace |
3701 awk '/^lost_dirent_repaired/ { print $2 }')
3702 [ $repaired -eq 1 ] ||
3703 error "(10) Expect 1 fixed on mds1, but got: $repaired"
3705 repaired=$(do_facet mds2 $LCTL get_param -n \
3706 mdd.$(facet_svc mds2).lfsck_namespace |
3707 awk '/^lost_dirent_repaired/ { print $2 }')
3708 [ $repaired -eq 0 ] ||
3709 error "(11) Expect 0 fixed on mds2, but got: $repaired"
3711 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a: a hard link to d0/foo is created under fail_loc 0x1625
# (OBD_FAIL_LFSCK_MORE_NLINK); the test then expects stat to report 3 links.
# After namespace LFSCK, nlinks_repaired must be 1 and stat must report 2.
# NOTE(review): `local` below implies a function body, but no function
# header or closing brace is visible in this listing.
3715 echo "The object's nlink attribute is larger than the object's known"
3716 echo "name entries count. The LFSCK will repair the object's nlink"
3717 echo "attribute to match the known name entries count"
3720 check_mount_and_prep
3722 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3723 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3725 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3726 echo "nlink attribute is larger than its name entries count."
# The fail_loc is active only around the ln and is cleared right after.
3728 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
3729 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
3730 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3731 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3732 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop mdc locks before stat, presumably so the link count is re-fetched
# rather than served from the client cache -- TODO confirm.
3734 cancel_lru_locks mdc
3735 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3736 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
3738 echo "Trigger namespace LFSCK to repair the nlink count"
3739 $START_NAMESPACE -r -A ||
3740 error "(5) Fail to start LFSCK for namespace"
# Helper defined elsewhere; presumably waits until namespace LFSCK is
# "completed" on all targets, using 6 as the failure tag -- TODO confirm.
3742 wait_all_targets_blocked namespace completed 6
3744 local repaired=$($SHOW_NAMESPACE |
3745 awk '/^nlinks_repaired/ { print $2 }')
3746 [ $repaired -eq 1 ] ||
3747 error "(7) Fail to repair nlink count: $repaired"
# Two names exist (foo and h1), so the repaired link count must be 2.
3749 cancel_lru_locks mdc
3750 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3751 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3753 run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b: a hard link to d0/foo is created under fail_loc 0x1626
# (OBD_FAIL_LFSCK_LESS_NLINK); the test then expects stat to report 1 link.
# After namespace LFSCK, nlinks_repaired must be 1 and stat must report 2.
# NOTE(review): `local` below implies a function body, but no function
# header or closing brace is visible in this listing.
3757 echo "The object's nlink attribute is smaller than the object's known"
3758 echo "name entries count. The LFSCK will repair the object's nlink"
3759 echo "attribute to match the known name entries count"
3762 check_mount_and_prep
3764 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3765 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3767 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3768 echo "nlink attribute is smaller than its name entries count."
# The fail_loc is active only around the ln and is cleared right after.
3770 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
3771 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
3772 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3773 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3774 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop mdc locks before stat, presumably so the link count is re-fetched
# rather than served from the client cache -- TODO confirm.
3776 cancel_lru_locks mdc
3777 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3778 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
3780 echo "Trigger namespace LFSCK to repair the nlink count"
3781 $START_NAMESPACE -r -A ||
3782 error "(5) Fail to start LFSCK for namespace"
# Helper defined elsewhere; presumably waits until namespace LFSCK is
# "completed" on all targets, using 6 as the failure tag -- TODO confirm.
3784 wait_all_targets_blocked namespace completed 6
3786 local repaired=$($SHOW_NAMESPACE |
3787 awk '/^nlinks_repaired/ { print $2 }')
3788 [ $repaired -eq 1 ] ||
3789 error "(7) Fail to repair nlink count: $repaired"
# Two names exist (foo and h1), so the repaired link count must be 2.
3791 cancel_lru_locks mdc
3792 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3793 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3795 run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c (currently not registered: its run_test line at the end is
# commented out): foo gets two hard links (h1, h2); the test expects stat to
# report 3 links while fid2path lists only 2 paths, runs namespace LFSCK and
# requires nlinks_repaired == 0 with both counts unchanged.
# NOTE(review): `local` below implies a function body, but no function
# header or closing brace is visible in this listing.
3799 echo "There are too many hard links to the object, and exceeds the"
3800 echo "object's linkEA limitation, as to NOT all the known name entries"
3801 echo "will be recorded in the linkEA. Under such case, LFSCK should"
3802 echo "skip the nlink verification for this object."
3805 check_mount_and_prep
3807 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3808 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3809 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3810 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
# NOTE(review): the banner announces a failure stub, but no fail_loc is set
# in the visible lines before the second ln.
3812 echo "Inject failure stub on MDT0 to simulate the case that"
3813 echo "foo's hard links exceed the object's linkEA limitation."
3815 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h2 ||
3816 error "(4) Fail to hard link to $DIR/$tdir/d0/foo"
3818 cancel_lru_locks mdc
# count1 is the link count reported by stat.
3820 local count1=$(stat --format=%h $DIR/$tdir/d0/foo)
3821 [ $count1 -eq 3 ] || error "(5) Stat failure: $count1"
# count2 is the number of lines (paths) fid2path prints for foo's FID.
3823 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3824 $LFS fid2path $DIR $foofid
3825 local count2=$($LFS fid2path $DIR $foofid | wc -l)
3826 [ $count2 -eq 2 ] || error "(6) Fail to inject error: $count2"
3828 echo "Trigger namespace LFSCK to repair the nlink count"
3829 $START_NAMESPACE -r -A ||
3830 error "(7) Fail to start LFSCK for namespace"
# Helper defined elsewhere; presumably waits until namespace LFSCK is
# "completed" on all targets, using 8 as the failure tag -- TODO confirm.
3832 wait_all_targets_blocked namespace completed 8
# LFSCK is expected to leave this object alone: no nlink repair is counted
# and both counts stay as they were.
3834 local repaired=$($SHOW_NAMESPACE |
3835 awk '/^nlinks_repaired/ { print $2 }')
3836 [ $repaired -eq 0 ] ||
3837 error "(9) Repair nlink count unexpcetedly: $repaired"
3839 cancel_lru_locks mdc
3841 count1=$(stat --format=%h $DIR/$tdir/d0/foo)
3842 [ $count1 -eq 3 ] || error "(10) Stat failure: $count1"
3844 count2=$($LFS fid2path $DIR $foofid | wc -l)
3845 [ $count2 -eq 2 ] ||
3846 error "(11) Repaired something unexpectedly: $count2"
3848 # disable test_29c temporarily, it will be re-enabled in subsequent patch.
3849 #run_test 29c "Not verify nlink attr if hard links exceed linkEA limitation"
# test_30: the namespace LFSCK recovers orphans that the backend e2fsck put
# into the backend /lost+found directory (ldiskfs backend only).
#
# Flow: create foo/f0 and foo/d0/{f1,d1}; d0 is created under fail_loc 0x161d
# (OBD_FAIL_LFSCK_NO_LINKEA), and the entries are removed under fail_loc
# 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY).  After stopping MDT0 and running
# e2fsck, the MDT is restarted and the namespace LFSCK is run; the test then
# expects local_lost_found_moved >= 4, f0 back in place, and d0 (with d1 and
# f1 below it) under .lustre/lost+found/MDT0000/.
test_30() {
	# Return unconditionally after skip, whatever status skip reports.
	if [ "$(facet_fstype $SINGLEMDS)" != ldiskfs ]; then
		skip "Only support backend /lost+found for ldiskfs"
		return 0
	fi

	echo "The namespace LFSCK will move the orphans from backend"
	echo "/lost+found directory to normal client visible namespace"
	echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"

	check_mount_and_prep

	$LFS mkdir -i 0 "$DIR/$tdir/foo" || error "(1) Fail to mkdir foo"
	touch "$DIR/$tdir/foo/f0" || error "(2) Fail to touch f0"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "directory d0 has no linkEA entry, then the LFSCK will"
	echo "move it into .lustre/lost+found/MDTxxxx/ later."

	#define OBD_FAIL_LFSCK_NO_LINKEA	0x161d
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
	mkdir "$DIR/$tdir/foo/d0" || error "(3) Fail to mkdir d0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	touch "$DIR/$tdir/foo/d0/f1" || error "(4) Fail to touch f1"
	mkdir "$DIR/$tdir/foo/d0/d1" || error "(5) Fail to mkdir d1"

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "object's name entry will be removed, but not destroy the"
	echo "object. Then backend e2fsck will handle it as orphan and"
	echo "add them into the backend /lost+found directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir "$DIR/$tdir/foo/d0/d1" || error "(6) Fail to rmdir d1"
	rm -f "$DIR/$tdir/foo/d0/f1" || error "(7) Fail to unlink f1"
	rmdir "$DIR/$tdir/foo/d0" || error "(8) Fail to rmdir d0"
	rm -f "$DIR/$tdir/foo/f0" || error "(9) Fail to unlink f0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	umount_client "$MOUNT" || error "(10) Fail to stop client!"

	stop $SINGLEMDS || error "(11) Fail to stop MDT0"

	run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
		error "(12) Fail to run e2fsck"

	start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
		error "(13) Fail to start MDT0"

	echo "Trigger namespace LFSCK to recover backend orphans"
	# START_NAMESPACE holds a multi-word command line; keep it unquoted.
	$START_NAMESPACE -r -A ||
		error "(14) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 15

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^local_lost_found_moved/ { print $2 }')
	[ "$repaired" -ge 4 ] ||
		error "(16) Fail to recover backend orphans: $repaired"

	mount_client "$MOUNT" || error "(17) Fail to start client!"

	stat "$DIR/$tdir/foo/f0" || error "(18) f0 is not recovered"

	ls -ail "$MOUNT/.lustre/lost+found/"

	echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
	[ -d "$MOUNT/.lustre/lost+found/MDT0000" ] ||
		error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail "$MOUNT/.lustre/lost+found/MDT0000/"

	# The pattern must be quoted so that find, not the shell, expands it;
	# otherwise a matching file in the current directory changes the search.
	local cname
	cname=$(find "$MOUNT/.lustre/lost+found/MDT0000/" -name "*-*-D-*")
	[ -n "$cname" ] || error "(20) d0 is not recovered"

	stat "${cname}/d1" || error "(21) d0 is not recovered"
	stat "${cname}/f1" || error "(22) f1 is not recovered"
}
run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test_31a: the namespace LFSCK repairs name entries that were inserted into
# the wrong shard of a striped directory (entries forced into the first shard).
#
# Needs at least 2 MDTs.  Entries are created on the client under fail_loc
# 0x1628 (OBD_FAIL_LFSCK_BAD_NAME_HASH) with fail_val=0; after the LFSCK the
# test expects name_hash_repaired >= 1 and every entry to be stat-able and
# removable after a client remount.
test_31a() {
	# Return unconditionally after skip, whatever status skip reports.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-first"
	echo "shard, but inserted into the first shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=0
	createmany -d "$DIR/$tdir/striped_dir/d" "$MDSCOUNT" ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	# START_NAMESPACE holds a multi-word command line; keep it unquoted.
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 4

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^name_hash_repaired/ { print $2 }')
	[ "$repaired" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	umount_client "$MOUNT" || error "(6) umount failed"
	mount_client "$MOUNT" || error "(7) mount failed"

	# Keep the loop counter local so the caller's "i" is not clobbered.
	local i
	for ((i = 0; i < MDSCOUNT; i++)); do
		stat "$DIR/$tdir/striped_dir/d$i" ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir "$DIR/$tdir/striped_dir/d$i" ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test_31b: the namespace LFSCK repairs name entries that were inserted into
# the wrong shard of a striped directory (entries forced into the second
# shard).
#
# Needs at least 2 MDTs.  Same flow as the first-shard variant, but with
# fail_val=1 for fail_loc 0x1628 (OBD_FAIL_LFSCK_BAD_NAME_HASH), and the
# name_hash_repaired counter is read from mds2's lfsck_namespace parameter.
test_31b() {
	# Return unconditionally after skip, whatever status skip reports.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-second"
	echo "shard, but inserted into the second shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=1
	createmany -d "$DIR/$tdir/striped_dir/d" "$MDSCOUNT" ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	# START_NAMESPACE holds a multi-word command line; keep it unquoted.
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 4

	# The repair counter is read on mds2, not on $SINGLEMDS.
	local repaired=$(do_facet mds2 $LCTL get_param -n \
			 mdd.$(facet_svc mds2).lfsck_namespace |
			 awk '/^name_hash_repaired/ { print $2 }')
	[ "$repaired" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	umount_client "$MOUNT" || error "(6) umount failed"
	mount_client "$MOUNT" || error "(7) mount failed"

	# Keep the loop counter local so the caller's "i" is not clobbered.
	local i
	for ((i = 0; i < MDSCOUNT; i++)); do
		stat "$DIR/$tdir/striped_dir/d$i" ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir "$DIR/$tdir/striped_dir/d$i" ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test_31c: the namespace LFSCK re-generates a lost master LMV EA when nothing
# was created under the striped directory after the loss.
#
# Needs at least 2 MDTs.  The striped directory is created under fail_loc
# 0x1629 (OBD_FAIL_LFSCK_LOST_MASTER_LMV) set on $SINGLEMDS; after the LFSCK
# the test expects striped_dirs_repaired == 1, an empty listing after a client
# remount, and the directory to be removable.
test_31c() {
	# Return unconditionally after skip, whatever status skip reports.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If nobody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should re-generate the master LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to re-generate master LMV EA"
	# START_NAMESPACE holds a multi-word command line; keep it unquoted.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^striped_dirs_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate master LMV EA: $repaired"

	umount_client "$MOUNT" || error "(5) umount failed"
	mount_client "$MOUNT" || error "(6) mount failed"

	# Any listing output here is reported as an unrepaired master LMV EA.
	local empty=$(ls "$DIR/$tdir/striped_dir/")
	[ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(8) Fail to remove the striped directory after LFSCK"
}
run_test 31c "Re-generate the lost master LMV EA for striped directory"
# test_31d: when files were created under a striped directory after its
# master LMV EA was lost, the namespace LFSCK must not re-generate the EA and
# must leave the directory read-only.
#
# Needs at least 2 MDTs.  The striped directory is created under fail_loc
# 0x1629 (OBD_FAIL_LFSCK_LOST_MASTER_LMV), the client is remounted and a file
# is created in it.  After the LFSCK the test expects striped_dirs_repaired
# == 0, the existing file to be stat-able, and a new touch to fail; it then
# clears the immutable flag with chattr -i and removes the directory.
test_31d() {
	# Return unconditionally after skip, whatever status skip reports.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If somebody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should NOT re-generate the master LMV EA, instead, it should"
	echo "change the broken striped directory as read-only to prevent"
	echo "further damage"

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0

	umount_client "$MOUNT" || error "(2) umount failed"
	mount_client "$MOUNT" || error "(3) mount failed"

	touch "$DIR/$tdir/striped_dir/dummy" ||
		error "(4) Fail to touch under broken striped directory"

	echo "Trigger namespace LFSCK to find out the inconsistency"
	# START_NAMESPACE holds a multi-word command line; keep it unquoted.
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^striped_dirs_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(7) Re-generate master LMV EA unexpected: $repaired"

	stat "$DIR/$tdir/striped_dir/dummy" ||
		error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"

	# Creating a new file is expected to FAIL; success is the error case.
	touch "$DIR/$tdir/striped_dir/foo" &&
		error "(9) The broken striped directory should be read-only"

	chattr -i "$DIR/$tdir/striped_dir" ||
		error "(10) Fail to chattr on the broken striped directory"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(11) Fail to remove the striped directory after LFSCK"
}
run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test_31e: the namespace LFSCK re-generates a lost slave LMV EA for the shard
# that lives on the same MDT as the master object.
#
# Needs at least 2 MDTs.  The striped directory is created under fail_loc
# 0x162a (OBD_FAIL_LFSCK_LOST_SLAVE_LMV) with fail_val=0; after the LFSCK the
# test expects striped_shards_repaired == 1 and the directory to be removable.
test_31e() {
	# Return unconditionally after skip, whatever status skip reports.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on the same MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	# START_NAMESPACE holds a multi-word command line; keep it unquoted.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test_31f: the namespace LFSCK re-generates a lost slave LMV EA for a shard
# that lives on a different MDT than the master object.
#
# Needs at least 2 MDTs.  The striped directory is created under fail_loc
# 0x162a (OBD_FAIL_LFSCK_LOST_SLAVE_LMV) with fail_val=1; the
# striped_shards_repaired counter is read from mds2's lfsck_namespace
# parameter and must be 1, and the directory must be removable afterwards.
test_31f() {
	# Return unconditionally after skip, whatever status skip reports.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on different MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	# START_NAMESPACE holds a multi-word command line; keep it unquoted.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	# The repair counter is read on mds2, not on $SINGLEMDS.
	local repaired=$(do_facet mds2 $LCTL get_param -n \
			 mdd.$(facet_svc mds2).lfsck_namespace |
			 awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test_31g: the namespace LFSCK repairs a slave LMV EA whose stripe index is
# corrupted.
#
# Needs at least 2 MDTs.  The striped directory is created under fail_loc
# 0x162b (OBD_FAIL_LFSCK_BAD_SLAVE_LMV) with fail_val=0; after the LFSCK the
# test expects striped_shards_repaired == 1 and, after a client remount, that
# a file can be created and removed in the directory and the directory itself
# can be removed.
test_31g() {
	# Return unconditionally after skip, whatever status skip reports.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the stripe index in the slave LMV EA is"
	echo "corrupted. The LFSCK should repair the slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave LMV EA on the first shard of the striped directory"
	echo "claims the same index as the second shard claims"

	#define OBD_FAIL_LFSCK_BAD_SLAVE_LMV	0x162b
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the slave LMV EA"
	# START_NAMESPACE holds a multi-word command line; keep it unquoted.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair slave LMV EA: $repaired"

	umount_client "$MOUNT" || error "(5) umount failed"
	mount_client "$MOUNT" || error "(6) mount failed"

	touch "$DIR/$tdir/striped_dir/foo" ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f "$DIR/$tdir/striped_dir/foo" ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31g "Repair the corrupted slave LMV EA"
# test_31h: the namespace LFSCK repairs a corrupted shard name entry in a
# striped directory.
#
# Needs at least 2 MDTs.  The striped directory is created under fail_loc
# 0x162c (OBD_FAIL_LFSCK_BAD_SLAVE_NAME) with fail_val=0; after the LFSCK the
# test expects dirent_repaired == 1 and, after a client remount, that a file
# can be created and removed in the directory and the directory itself can be
# removed.
test_31h() {
	# Return unconditionally after skip, whatever status skip reports.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	echo "For some reason, the shard's name entry in the striped"
	echo "directory may be corrupted. The LFSCK should repair the"
	echo "bad shard's name entry."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "first shard's name entry in the striped directory claims"
	echo "the same index as the second shard's name entry claims."

	#define OBD_FAIL_LFSCK_BAD_SLAVE_NAME	0x162c
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$DIR/$tdir/striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the shard's name entry"
	# START_NAMESPACE holds a multi-word command line; keep it unquoted.
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair shard's name entry: $repaired"

	umount_client "$MOUNT" || error "(5) umount failed"
	mount_client "$MOUNT" || error "(6) mount failed"

	touch "$DIR/$tdir/striped_dir/foo" ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f "$DIR/$tdir/striped_dir/foo" ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir "$DIR/$tdir/striped_dir" ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31h "Repair the corrupted shard's name entry"
# test_32: the layout LFSCK can still be stopped after an OST has gone away.
#
# Unmounts the client, starts the layout LFSCK on $SINGLEMDS under fail_loc
# 0x162d (OBD_FAIL_LFSCK_ASSISTANT_DIRECT) with fail_val=3, checks that the
# reported status is 'scanning-phase1', stops ost1, clears the fail_loc and
# finally stops the LFSCK.
#
# NOTE(review): error numbering starts at (2) -- an earlier setup step may be
# missing from this copy; confirm before relying on this test.
test_32() {
	umount_client "$MOUNT"

	#define OBD_FAIL_LFSCK_ASSISTANT_DIRECT	0x162d
	do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
	# START_LAYOUT holds a multi-word command line; keep it unquoted.
	$START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"

	local status=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
	[ "$status" = "scanning-phase1" ] ||
		error "(3) Expect 'scanning-phase1', but got '$status'"

	stop ost1 > /dev/null || error "(4) Fail to stop OST1!"

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0

	$STOP_LFSCK || error "(5) Fail to stop LFSCK!"
}
run_test 32 "stop LFSCK when some OST failed"
# Restore the MDS/OST size and OST count that were saved in SAVED_MDSSIZE,
# SAVED_OSTSIZE and SAVED_OSTCOUNT near the top of this script.
MDSSIZE=${SAVED_MDSSIZE}
OSTSIZE=${SAVED_OSTSIZE}
OSTCOUNT=${SAVED_OSTCOUNT}
4334 # cleanup the system at last