3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
# Locate the Lustre tree (defaults to the parent of this script's directory)
# and load the shared test framework from it.
11 LUSTRE=${LUSTRE:-$(dirname $0)/..}
12 . $LUSTRE/tests/test-framework.sh
16 # bug number for skipped test:
17 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT "
19 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
21 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
24 require_dsh_mds || exit 0
28 if ! check_versions; then
29 skip "It is NOT necessary to test lfsck under interoperation mode"
33 (( $MDS1_VERSION >= $(version_code 2.3.60) )) ||
34 skip "Need MDS version at least 2.3.60"
# Remember the incoming sizes/count before they are overridden below.
38 SAVED_MDSSIZE=${MDSSIZE}
39 SAVED_OSTSIZE=${OSTSIZE}
40 SAVED_OSTCOUNT=${OSTCOUNT}
41 # Use small MDS and OST sizes to speed up formatting.
42 # Do not make MDSSIZE/OSTSIZE too small, since that affects the default journal size.
44 [ "$mds1_FSTYPE" == zfs ] && MDSSIZE=300000
46 [ "$ost1_FSTYPE" == zfs ] && OSTSIZE=300000
48 # There is no need for many OSTs; cap the count to reduce the format/start/stop overhead.
50 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
52 # build up a clean test environment.
53 REFORMAT="yes" check_and_setup_lustre
# Target names and LFSCK command prefixes shared by the tests. The command
# prefixes are stored as plain strings and expanded unquoted at the call sites
# (e.g. `$START_NAMESPACE -r`), so they rely on shell word splitting.
55 MDT_DEV=$(devicelabel $SINGLEMDS $(facet_device $SINGLEMDS))
56 OST_DEV="${FSNAME}-OST0000"
57 START_NAMESPACE="do_facet $SINGLEMDS \
58 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
59 START_LAYOUT="do_facet $SINGLEMDS \
60 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
61 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
62 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
63 SHOW_NAMESPACE="do_facet $SINGLEMDS \
64 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
65 SHOW_LAYOUT="do_facet $SINGLEMDS \
66 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
67 SHOW_LAYOUT_ON_OST="do_facet ost1 \
68 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# MDS mount option sets: user_xattr only, user_xattr plus noscrub, and
# user_xattr plus skip_lfsck.
69 MOUNT_OPTS_SCRUB="$MDS_MOUNT_OPTS -o user_xattr"
70 MOUNT_OPTS_NOSCRUB="$MDS_MOUNT_OPTS -o user_xattr,noscrub"
71 MOUNT_OPTS_SKIP_LFSCK="$MDS_MOUNT_OPTS -o user_xattr,skip_lfsck"
80 echo "preparing... $nfiles * $ndirs files will be created $(date)."
81 if [ ! -z $igif ]; then
82 #define OBD_FAIL_FID_IGIF 0x1504
83 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
86 cp $LUSTRE/tests/*.sh $DIR/$tdir/
87 if [ $ndirs -gt 0 ]; then
88 createmany -d $DIR/$tdir/d $ndirs
89 createmany -m $DIR/$tdir/f $ndirs
90 if [ $nfiles -gt 0 ]; then
91 for ((i = 0; i < $ndirs; i++)); do
92 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
93 /dev/null || error "createmany $nfiles"
96 createmany -d $DIR/$tdir/e $ndirs
99 if [ ! -z $igif ]; then
100 touch $DIR/$tdir/dummy
101 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
104 echo "prepared $(date)."
111 local dev=$(facet_device $facet)
113 start $facet $dev $opts > /dev/null ||
114 error "($err) Fail to start $facet!"
# Run e2fsck with "-n" (read-only, answer "no" to every question) against the
# device backing an MDS facet and fail the test if an inconsistency is found.
# Returns 0 immediately unless mds1 is ldiskfs-backed. The facet is stopped
# before the check and started again afterwards.
# NOTE(review): $mds is presumably the facet name passed as $1; its
# assignment is not among the visible lines -- confirm.
117 run_e2fsck_on_mds_facet() {
118 [ $mds1_FSTYPE == ldiskfs ] || return 0
122 stop $mds > /dev/null || error "(0) Fail to the stop $mds"
123 local host=$(facet_active_host $mds)
124 local dev=$(facet_device $mds)
126 run_e2fsck $host $dev "-n" |
128 run_e2fsck $host $dev "-n"
129 error "(2) Detected inconsistency on $mds"
# Bring the facet back using the noscrub mount options; "3" is the error tag
# handed to start_facet.
131 start_facet $mds "$MOUNT_OPTS_NOSCRUB" 3
# Wait until `lctl lfsck_query` on MDT0000 reports $MDSCOUNT MDTs in state
# $status for LFSCK component $com. If the wait fails, dump the query output
# plus the LFSCK and OI-scrub parameters of all MDT nodes (and, for the
# "layout" component, all OST nodes), then fail the test via error().
#
# Arguments:
#   $4 - timeout handed to wait_update_facet (default 600)
#   NOTE(review): $com, $status and $err are presumably taken from $1..$3;
#   their assignments are not among the visible lines -- confirm.
134 wait_all_targets_blocked() {
138 # Use a long timeout so the wait behaves like a blocking wait and the final status can be observed.
139 local timeout=${4:-600}
140 local lfsck_query="$LCTL lfsck_query -t $com -M $FSNAME-MDT0000"
142 wait_update_facet --quiet mds1 \
143 "$lfsck_query | awk '/^${com}_mdts_$status/ { print \\\$2 }'" \
144 "$MDSCOUNT" $timeout || {
145 local mdts=$(comma_list $(mdts_nodes))
# This awk runs in the local shell, so its program must be double-quoted for
# ${com}/${status} to be expanded; with single quotes the pattern was literal,
# never matched, and $count in the error message below was always empty.
146 local count=$(do_facet mds1 "$lfsck_query" |
147 awk "/^${com}_mdts_${status}/ { print \$2 }")
149 do_facet mds1 "$lfsck_query"
150 echo "==== MDT LOGS ===="
151 do_nodes $mdts "$LCTL get_param mdd.*.lfsck_$com"
152 do_nodes $mdts "$LCTL get_param osd*.*.oi_scrub"
153 if [[ "$com" == "layout" ]]; then
154 local osts=$(comma_list $(osts_nodes))
155 echo "==== OST LOGS ===="
157 do_nodes $osts "$LCTL get_param obdfilter.*.lfsck_$com"
158 do_nodes $osts "$LCTL get_param osd*.*.oi_scrub"
162 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
171 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
172 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
173 "$MDSCOUNT" $LTIME || {
174 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
175 error "($err) some MDTs are not in ${status}"
182 #define OBD_FAIL_LFSCK_DELAY1 0x1600
183 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
184 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
186 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
188 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
189 [ "$STATUS" == "scanning-phase1" ] ||
190 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
192 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
194 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
195 [ "$STATUS" == "stopped" ] ||
196 error "(6) Expect 'stopped', but got '$STATUS'"
198 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
200 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
201 [ "$STATUS" == "scanning-phase1" ] ||
202 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
204 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
205 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
206 mdd.${MDT_DEV}.lfsck_namespace |
207 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
209 error "(9) unexpected status"
212 local repaired=$($SHOW_NAMESPACE |
213 awk '/^updated_phase1/ { print $2 }')
214 [ $repaired -eq 0 ] ||
215 error "(10) Expect nothing to be repaired, but got: $repaired"
217 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
218 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
219 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
220 mdd.${MDT_DEV}.lfsck_namespace |
221 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
223 error "(12) unexpected status"
226 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
227 [ $((scanned1 + 1)) -eq $scanned2 ] ||
228 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
230 echo "stopall, should NOT crash LU-3649"
231 stopall || error "(14) Fail to stopall"
233 run_test 0 "Control LFSCK manually"
238 #define OBD_FAIL_FID_INDIR 0x1501
239 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
240 touch $DIR/$tdir/dummy
242 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
244 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
245 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
246 mdd.${MDT_DEV}.lfsck_namespace |
247 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
249 error "(4) unexpected status"
252 local repaired=$($SHOW_NAMESPACE |
253 awk '/^dirent_repaired/ { print $2 }')
254 # for interop with old server
255 [ -z "$repaired" ] &&
256 repaired=$($SHOW_NAMESPACE |
257 awk '/^updated_phase1/ { print $2 }')
259 [ $repaired -eq 1 ] ||
260 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
262 run_e2fsck_on_mds_facet $SINGLEMDS
264 mount_client $MOUNT || error "(6) Fail to start client!"
266 #define OBD_FAIL_FID_LOOKUP 0x1505
267 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
268 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
270 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
272 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
276 [ "$mds1_FSTYPE" != ldiskfs ] &&
277 skip "OI Scrub not implemented for ZFS"
281 #define OBD_FAIL_FID_INLMA 0x1502
282 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
283 touch $DIR/$tdir/dummy
285 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
287 #define OBD_FAIL_FID_NOLMA 0x1506
288 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
289 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
290 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
291 mdd.${MDT_DEV}.lfsck_namespace |
292 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
294 error "(4) unexpected status"
297 local repaired=$($SHOW_NAMESPACE |
298 awk '/^dirent_repaired/ { print $2 }')
299 # for interop with old server
300 [ -z "$repaired" ] &&
301 repaired=$($SHOW_NAMESPACE |
302 awk '/^updated_phase1/ { print $2 }')
304 [ $repaired -eq 1 ] ||
305 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
308 run_e2fsck_on_mds_facet $SINGLEMDS
310 mount_client $MOUNT || error "(6) Fail to start client!"
312 #define OBD_FAIL_FID_LOOKUP 0x1505
313 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
314 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
316 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
318 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
323 #define OBD_FAIL_FID_IGIF 0x1504
324 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
325 touch $DIR/$tdir/dummy
327 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
329 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
330 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
331 mdd.${MDT_DEV}.lfsck_namespace |
332 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
334 error "(4) unexpected status"
337 local repaired=$($SHOW_NAMESPACE |
338 awk '/^dirent_repaired/ { print $2 }')
339 # for interop with old server
340 [ -z "$repaired" ] &&
341 repaired=$($SHOW_NAMESPACE |
342 awk '/^updated_phase1/ { print $2 }')
344 [ $repaired -eq 1 ] ||
345 error "(5) Fail to repair lost FID-in-dirent: $repaired"
347 run_e2fsck_on_mds_facet $SINGLEMDS
349 mount_client $MOUNT || error "(6) Fail to start client!"
351 #define OBD_FAIL_FID_LOOKUP 0x1505
352 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
353 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
355 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
357 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
362 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
363 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
364 touch $DIR/$tdir/dummy
366 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
368 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
369 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
370 mdd.${MDT_DEV}.lfsck_namespace |
371 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
373 error "(4) unexpected status"
376 local repaired=$($SHOW_NAMESPACE |
377 awk '/^linkea_repaired/ { print $2 }')
378 # for interop with old server
379 [ -z "$repaired" ] &&
380 repaired=$($SHOW_NAMESPACE |
381 awk '/^updated_phase2/ { print $2 }')
383 [ $repaired -eq 1 ] ||
384 error "(5) Fail to repair crashed linkEA: $repaired"
386 run_e2fsck_on_mds_facet $SINGLEMDS
388 mount_client $MOUNT || error "(6) Fail to start client!"
390 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
391 error "(7) Fail to stat $DIR/$tdir/dummy"
393 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
394 local dummyname=$($LFS fid2path $DIR $dummyfid)
395 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
396 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
398 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
404 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
405 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
406 touch $DIR/$tdir/dummy
408 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
410 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
411 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
412 mdd.${MDT_DEV}.lfsck_namespace |
413 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
415 error "(4) unexpected status"
418 local repaired=$($SHOW_NAMESPACE |
419 awk '/^updated_phase2/ { print $2 }')
420 [ $repaired -eq 1 ] ||
421 error "(5) Fail to repair crashed linkEA: $repaired"
423 run_e2fsck_on_mds_facet $SINGLEMDS
425 mount_client $MOUNT || error "(6) Fail to start client!"
427 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
428 error "(7) Fail to stat $DIR/$tdir/dummy"
430 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
431 local dummyname=$($LFS fid2path $DIR $dummyfid)
432 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
433 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
435 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
439 (( $MDS1_VERSION > $(version_code 2.4.90) )) ||
440 skip "MDS older than 2.4.90"
444 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
445 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
446 touch $DIR/$tdir/dummy
448 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
450 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
451 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
452 mdd.${MDT_DEV}.lfsck_namespace |
453 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
455 error "(4) unexpected status"
458 local repaired=$($SHOW_NAMESPACE |
459 awk '/^updated_phase2/ { print $2 }')
460 [ $repaired -eq 1 ] ||
461 error "(5) Fail to repair crashed linkEA: $repaired"
463 run_e2fsck_on_mds_facet $SINGLEMDS
465 mount_client $MOUNT || error "(6) Fail to start client!"
467 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
468 error "(7) Fail to stat $DIR/$tdir/dummy"
470 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
471 local dummyname=$($LFS fid2path $DIR $dummyfid)
472 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
473 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
475 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
479 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
480 skip "MDS older than 2.6.50, LU-4788"
484 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
485 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
486 touch $DIR/$tdir/dummy
488 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
490 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
491 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
492 mdd.${MDT_DEV}.lfsck_namespace |
493 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
495 error "(4) unexpected status"
498 local repaired=$($SHOW_NAMESPACE |
499 awk '/^linkea_repaired/ { print $2 }')
500 [ $repaired -eq 1 ] ||
501 error "(5) Fail to repair crashed linkEA: $repaired"
503 run_e2fsck_on_mds_facet $SINGLEMDS
505 mount_client $MOUNT || error "(6) Fail to start client!"
507 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
508 error "(7) Fail to stat $DIR/$tdir/dummy"
510 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
511 local dummyname=$($LFS fid2path $DIR $dummyfid)
512 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
513 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
515 run_test 2d "LFSCK can recover the missing linkEA entry"
519 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
520 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
521 skip "MDS older than 2.6.50, LU-5511"
525 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
527 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
528 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
529 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
530 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
532 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
534 wait_all_targets_blocked namespace completed 4
536 local repaired=$($SHOW_NAMESPACE |
537 awk '/^linkea_repaired/ { print $2 }')
538 [ $repaired -eq 1 ] ||
539 error "(5) Fail to repair crashed linkEA: $repaired"
541 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
542 local name=$($LFS fid2path $DIR $fid)
543 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
544 error "(6) Fail to repair linkEA: $fid $name"
546 run_test 2e "namespace LFSCK can verify remote object linkEA"
550 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
551 skip "MDS older than 2.6.50, LU-4788"
555 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
556 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
557 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
559 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
560 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
561 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
563 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
564 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
565 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
567 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
568 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
569 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
571 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
573 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
574 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
575 mdd.${MDT_DEV}.lfsck_namespace |
576 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
578 error "(10) unexpected status"
581 local checked=$($SHOW_NAMESPACE |
582 awk '/^checked_phase2/ { print $2 }')
583 [ $checked -ge 4 ] ||
584 error "(11) Fail to check multiple-linked object: $checked"
586 local repaired=$($SHOW_NAMESPACE |
587 awk '/^multiple_linked_repaired/ { print $2 }')
588 [ $repaired -ge 2 ] ||
589 error "(12) Fail to repair multiple-linked object: $repaired"
591 run_test 3 "LFSCK can verify multiple-linked objects"
595 [ "$mds1_FSTYPE" != ldiskfs ] &&
596 skip "OI Scrub not implemented for ZFS"
599 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
600 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop $SINGLEMDS!"
602 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
603 echo "start $SINGLEMDS with disabling OI scrub"
604 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 2
606 #define OBD_FAIL_LFSCK_DELAY2 0x1601
607 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
608 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
609 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
610 mdd.${MDT_DEV}.lfsck_namespace |
611 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
613 error "(5) unexpected status"
616 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
617 [ "$STATUS" == "scanning-phase1" ] ||
618 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
620 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
621 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
622 mdd.${MDT_DEV}.lfsck_namespace |
623 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
625 error "(7) unexpected status"
628 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
629 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
631 local repaired=$($SHOW_NAMESPACE |
632 awk '/^dirent_repaired/ { print $2 }')
633 # for interop with old server
634 [ -z "$repaired" ] &&
635 repaired=$($SHOW_NAMESPACE |
636 awk '/^updated_phase1/ { print $2 }')
638 [ $repaired -ge 9 ] ||
639 error "(9) Fail to re-generate FID-in-dirent: $repaired"
641 run_e2fsck_on_mds_facet $SINGLEMDS
643 mount_client $MOUNT || error "(10) Fail to start client!"
645 #define OBD_FAIL_FID_LOOKUP 0x1505
646 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
647 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
648 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
650 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
654 [ "$mds1_FSTYPE" != ldiskfs ] &&
655 skip "OI Scrub not implemented for ZFS"
658 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
659 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop $SINGLEMDS!"
661 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
662 echo "start $SINGLEMDS with disabling OI scrub"
663 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 2
665 #define OBD_FAIL_LFSCK_DELAY2 0x1601
666 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
667 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
668 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
669 mdd.${MDT_DEV}.lfsck_namespace |
670 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
672 error "(5) unexpected status"
675 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
676 [ "$STATUS" == "scanning-phase1" ] ||
677 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
679 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
680 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
681 mdd.${MDT_DEV}.lfsck_namespace |
682 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
684 error "(7) unexpected status"
687 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
688 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
690 local repaired=$($SHOW_NAMESPACE |
691 awk '/^dirent_repaired/ { print $2 }')
692 # for interop with old server
693 [ -z "$repaired" ] &&
694 repaired=$($SHOW_NAMESPACE |
695 awk '/^updated_phase1/ { print $2 }')
697 [ $repaired -ge 2 ] ||
698 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
700 run_e2fsck_on_mds_facet $SINGLEMDS
702 mount_client $MOUNT || error "(10) Fail to start client!"
704 #define OBD_FAIL_FID_LOOKUP 0x1505
705 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
706 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
708 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
710 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
711 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
712 local dummyname=$($LFS fid2path $DIR $dummyfid)
713 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
714 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
716 run_test 5 "LFSCK can handle IGIF object upgrading"
721 #define OBD_FAIL_LFSCK_DELAY1 0x1600
722 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
723 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
725 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
726 [ "$STATUS" == "scanning-phase1" ] ||
727 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
729 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
731 # Fail the LFSCK to guarantee there is at least one checkpoint
732 #define OBD_FAIL_LFSCK_FATAL1 0x1608
733 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
734 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
735 mdd.${MDT_DEV}.lfsck_namespace |
736 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
738 error "(4) unexpected status"
741 local POS0=$($SHOW_NAMESPACE |
742 awk '/^last_checkpoint_position/ { print $2 }' |
745 #define OBD_FAIL_LFSCK_DELAY1 0x1600
746 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
747 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
749 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
750 [ "$STATUS" == "scanning-phase1" ] ||
751 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
753 local POS1=$($SHOW_NAMESPACE |
754 awk '/^latest_start_position/ { print $2 }' |
756 [[ $POS0 -lt $POS1 ]] ||
757 error "(7) Expect larger than: $POS0, but got $POS1"
759 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
760 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
761 mdd.${MDT_DEV}.lfsck_namespace |
762 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
764 error "(8) unexpected status"
767 run_test 6a "LFSCK resumes from last checkpoint (1)"
772 #define OBD_FAIL_LFSCK_DELAY2 0x1601
773 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
774 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
776 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
777 [ "$STATUS" == "scanning-phase1" ] ||
778 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
780 # Sleep 5 sec to guarantee that we are in the directory scanning
782 # Fail the LFSCK to guarantee there is at least one checkpoint
783 #define OBD_FAIL_LFSCK_FATAL2 0x1609
784 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
785 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
786 mdd.${MDT_DEV}.lfsck_namespace |
787 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
789 error "(4) unexpected status"
792 local O_POS0=$($SHOW_NAMESPACE |
793 awk '/^last_checkpoint_position/ { print $2 }' |
796 local D_POS0=$($SHOW_NAMESPACE |
797 awk '/^last_checkpoint_position/ { print $4 }')
799 #define OBD_FAIL_LFSCK_DELAY2 0x1601
800 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
801 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
803 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
804 [ "$STATUS" == "scanning-phase1" ] ||
805 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
807 local O_POS1=$($SHOW_NAMESPACE |
808 awk '/^latest_start_position/ { print $2 }' |
810 local D_POS1=$($SHOW_NAMESPACE |
811 awk '/^latest_start_position/ { print $4 }')
813 echo "Additional debug for 6b"
815 if [ "$D_POS0" == "N/A" -o "$D_POS0" == "0x0" \
816 -o "$D_POS1" == "0x0" -o "$D_POS1" == "N/A" ]; then
817 [[ $O_POS0 -lt $O_POS1 ]] ||
818 error "(7.1) $O_POS1 is not larger than $O_POS0"
820 [[ $D_POS0 -lt $D_POS1 ]] ||
821 error "(7.2) $D_POS1 is not larger than $D_POS0"
824 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
825 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
826 mdd.${MDT_DEV}.lfsck_namespace |
827 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
829 error "(8) unexpected status"
832 run_test 6b "LFSCK resumes from last checkpoint (2)"
839 #define OBD_FAIL_LFSCK_DELAY2 0x1601
840 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
841 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
843 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
844 [ "$STATUS" == "scanning-phase1" ] ||
845 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
847 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
849 echo "stop $SINGLEMDS"
850 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop $SINGLEMDS!"
852 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
853 echo "start $SINGLEMDS"
854 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 5
856 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
857 mdd.${MDT_DEV}.lfsck_namespace |
858 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
860 error "(6) unexpected status"
863 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
869 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
870 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
871 for ((i = 0; i < 20; i++)); do
872 touch $DIR/$tdir/dummy${i}
875 #define OBD_FAIL_LFSCK_DELAY3 0x1602
876 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
877 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
878 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
879 mdd.${MDT_DEV}.lfsck_namespace |
880 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
882 error "(4) unexpected status"
886 echo "stop $SINGLEMDS"
887 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop $SINGLEMDS!"
889 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
890 echo "start $SINGLEMDS"
891 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 6
893 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
894 mdd.${MDT_DEV}.lfsck_namespace |
895 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
897 error "(7) unexpected status"
900 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
911 formatall > /dev/null
917 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
918 [ "$STATUS" == "init" ] ||
919 namespace_error "(2) Expect 'init', but got '$STATUS'"
921 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
922 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
923 mkdir $DIR/$tdir/crashed
925 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
926 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
927 for ((i = 0; i < 5; i++)); do
928 touch $DIR/$tdir/dummy${i}
931 umount_client $MOUNT || error "(3) Fail to stop client!"
933 #define OBD_FAIL_LFSCK_DELAY2 0x1601
934 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
936 namespace_error "(4) Fail to start LFSCK for namespace!"
938 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
939 [ "$STATUS" == "scanning-phase1" ] ||
940 namespace_error "(5) Expect 'scanning-phase1', but got '$STATUS'"
942 $STOP_LFSCK || namespace_error "(6) Fail to stop LFSCK!"
944 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
945 [ "$STATUS" == "stopped" ] ||
946 namespace_error "(7) Expect 'stopped', but got '$STATUS'"
949 namespace_error "(8) Fail to start LFSCK for namespace!"
951 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
952 [ "$STATUS" == "scanning-phase1" ] ||
953 namespace_error "(9) Expect 'scanning-phase1', but got '$STATUS'"
955 #define OBD_FAIL_LFSCK_FATAL2 0x1609
956 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
957 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
958 mdd.${MDT_DEV}.lfsck_namespace |
959 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
961 namespace_error "(10) unexpected status"
964 #define OBD_FAIL_LFSCK_DELAY1 0x1600
965 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
967 namespace_error "(11) Fail to start LFSCK for namespace!"
969 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
970 [ "$STATUS" == "scanning-phase1" ] ||
971 namespace_error "(12) Expect 'scanning-phase1', but got '$STATUS'"
973 #define OBD_FAIL_LFSCK_CRASH 0x160a
974 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
977 echo "stop $SINGLEMDS"
978 stop $SINGLEMDS > /dev/null || namespace_error "(13) Fail to stop MDS!"
980 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
981 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
983 echo "start $SINGLEMDS"
984 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 14
986 local timeout=$(max_recovery_time)
989 while [ $timer -lt $timeout ]; do
990 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
991 mdt.${MDT_DEV}.recovery_status |
992 awk '/^status/ { print \\\$2 }'")
993 [ "$STATUS" != "RECOVERING" ] && break;
998 [ $timer != $timeout ] || (
999 do_facet $SINGLEMDS "$LCTL get_param -n \
1000 mdt.${MDT_DEV}.recovery_status"
1001 error "(14.1) recovery timeout"
1004 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1005 [ "$STATUS" == "crashed" ] ||
1006 namespace_error "(15) Expect 'crashed', but got '$STATUS'"
1008 #define OBD_FAIL_LFSCK_DELAY2 0x1601
1009 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
1011 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1012 mdd.${MDT_DEV}.lfsck_namespace |
1013 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
1015 namespace_error "(17) unexpected status"
1018 echo "stop $SINGLEMDS"
1019 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop $SINGLEMDS!"
1021 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1022 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1024 echo "start $SINGLEMDS"
1025 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 19
1028 while [ $timer -lt $timeout ]; do
1029 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1030 mdt.${MDT_DEV}.recovery_status |
1031 awk '/^status/ { print \\\$2 }'")
1032 [ "$STATUS" != "RECOVERING" ] && break;
1034 timer=$((timer + 1))
1037 [ $timer != $timeout ] || (
1038 do_facet $SINGLEMDS "$LCTL get_param -n \
1039 mdt.${MDT_DEV}.recovery_status"
1040 error "(19.1) recovery timeout"
1043 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1044 [ "$STATUS" == "paused" ] ||
1045 namespace_error "(20) Expect 'paused', but got '$STATUS'"
1047 echo "stop $SINGLEMDS"
1048 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1050 echo "start $SINGLEMDS without resume LFSCK"
1051 start_facet $SINGLEMDS "$MOUNT_OPTS_SKIP_LFSCK" 20.2
1054 while [ $timer -lt $timeout ]; do
1055 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1056 mdt.${MDT_DEV}.recovery_status |
1057 awk '/^status/ { print \\\$2 }'")
1058 [ "$STATUS" != "RECOVERING" ] && break;
1060 timer=$((timer + 1))
1063 [ $timer != $timeout ] || (
1064 do_facet $SINGLEMDS "$LCTL get_param -n \
1065 mdt.${MDT_DEV}.recovery_status"
1066 error "(20.3) recovery timeout"
1069 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1070 [ "$STATUS" == "paused" ] ||
1071 namespace_error "(20.4) Expect 'paused', but got '$STATUS'"
1073 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1074 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1077 namespace_error "(21) Fail to start LFSCK for namespace!"
1078 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1079 mdd.${MDT_DEV}.lfsck_namespace |
1080 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1082 namespace_error "(22) unexpected status"
1085 # wait to process one inode at least (OBD_FAIL_LFSCK_DELAY3)
1086 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1087 mdd.${MDT_DEV}.lfsck_namespace |
1088 awk '/^flags/ { print \\\$2 }'" "scanned-once,inconsistent" 32 || {
1090 namespace_error "(23) Expect 'scanned-once,inconsistent'"
1093 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1094 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1095 mdd.${MDT_DEV}.lfsck_namespace |
1096 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1098 namespace_error "(24) unexpected status"
1101 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1103 namespace_error "(25) Expect empty flags, but got '$FLAGS'"
1105 run_test 8 "LFSCK state machine"
# test_9a "LFSCK speed control (1)": start a layout LFSCK throttled with
# -s $BASE_SPEED1, later raise lfsck_speed_limit to $BASE_SPEED2, and check
# that average_speed_phase1 stays within computed bounds after each step.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are assigned on lines not
# visible in this view.
# Skip when /proc/cpuinfo has no "processor ... : 1" entry (UP system).
1108 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1109 skip "Testing on UP system, the speed may be inaccurate."
1113 check_mount_and_prep
1114 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1115 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1116 createmany -o $DIR/$tdir/lfsck/f 5000
1118 local BASE_SPEED1=100
1120 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# With the limit in place the scan must still be in phase 1 when sampled.
1123 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1124 [ "$STATUS" == "scanning-phase1" ] ||
1125 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1127 local SPEED=$($SHOW_LAYOUT |
1128 awk '/^average_speed_phase1/ { print $2 }')
1130 # There may be time error, normally it should be less than 2 seconds.
1131 # We allow another 20% schedule error.
1133 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound: the limit scaled by (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 and
# then by MAX_MARGIN, all in integer arithmetic.
1134 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) /
1135 RUN_TIME1 * 13 / 10))
1136 [ $SPEED -lt $MAX_SPEED ] || {
1138 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1139 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1142 # adjust speed limit
1143 local BASE_SPEED2=300
1145 do_facet $SINGLEMDS \
1146 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1149 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1150 # MIN_MARGIN = 0.7 = 7 / 10
# Lower bound: time-weighted mean of both limits, each run time shortened by
# TIME_DIFF, scaled by MIN_MARGIN.
1151 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) +
1152 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) /
1153 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# On non-ldiskfs backends a too-slow scan is reported via error_ignore
# (LU-5624).
1154 [ $SPEED -gt $MIN_SPEED ] || {
1155 if [ $mds1_FSTYPE != ldiskfs ]; then
1156 error_ignore LU-5624 \
1157 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1160 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1164 # MAX_MARGIN = 1.3 = 13 / 10
1165 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) +
1166 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) /
1167 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1168 [ $SPEED -lt $MAX_SPEED ] || {
1170 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1171 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1172 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Lift the speed limit on all target nodes so the scan can finish. The command
# is quoted so that the "*.*" pattern is not subject to pathname expansion in
# the local shell before being handed to do_nodes.
1175 do_nodes $(tgts_nodes) "$LCTL set_param -n *.*.lfsck_speed_limit=0"
1177 wait_update_facet $SINGLEMDS \
1178 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1179 awk '/^status/ { print \\\$2 }'" "completed" ||
1180 error "(7) Failed to get expected 'completed'"
1182 run_test 9a "LFSCK speed control (1)"
# test_9b "LFSCK speed control (2)": same idea as the layout speed test, but
# for the namespace LFSCK and its average_speed_phase2 counter.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are assigned on lines not
# visible in this view.
# Skip when /proc/cpuinfo has no "processor ... : 1" entry (UP system).
1185 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1186 skip "Testing on UP system, the speed may be inaccurate."
1192 echo "Preparing another 50 * 50 files (with error) at $(date)."
# Create the directories and files while fail_loc 0x1604 is set on the MDS.
1193 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1194 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1195 createmany -d $DIR/$tdir/d 50
1196 createmany -m $DIR/$tdir/f 50
1197 for ((i = 0; i < 50; i++)); do
1198 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# First run: with fail_loc 0x160c set, the namespace scan is expected to end
# up in 'stopped' status within 10 seconds.
1201 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1202 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1203 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1204 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1205 mdd.${MDT_DEV}.lfsck_namespace |
1206 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1208 error "(5) unexpected status"
1211 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1212 echo "Prepared at $(date)."
1214 local BASE_SPEED1=50
# Second run (no -r) throttled with -s; it must be in phase 2 when sampled.
1216 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1219 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1220 [ "$STATUS" == "scanning-phase2" ] ||
1221 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1223 local SPEED=$($SHOW_NAMESPACE |
1224 awk '/^average_speed_phase2/ { print $2 }')
1225 # There may be time error, normally it should be less than 2 seconds.
1226 # We allow another 20% schedule error.
1228 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound: the limit scaled by (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 and
# then by MAX_MARGIN, all in integer arithmetic.
1229 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) /
1230 RUN_TIME1 * 13 / 10))
1231 [ $SPEED -lt $MAX_SPEED ] || {
1233 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1234 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1237 # adjust speed limit
1238 local BASE_SPEED2=150
1240 do_facet $SINGLEMDS \
1241 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1244 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1245 # MIN_MARGIN = 0.7 = 7 / 10
# Lower bound: time-weighted mean of both limits, each run time shortened by
# TIME_DIFF, scaled by MIN_MARGIN.
1246 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) +
1247 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) /
1248 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# On non-ldiskfs backends a too-slow scan is reported via error_ignore
# (LU-5624).
1249 [ $SPEED -gt $MIN_SPEED ] || {
1250 if [ $mds1_FSTYPE != ldiskfs ]; then
1251 error_ignore LU-5624 \
1252 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1255 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1259 # MAX_MARGIN = 1.3 = 13 / 10
1260 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) +
1261 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) /
1262 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1263 [ $SPEED -lt $MAX_SPEED ] || {
1265 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1266 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1267 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Lift the speed limit on all target nodes and wait for the scan to complete.
1270 do_nodes $(tgts_nodes) "$LCTL set_param -n *.*.lfsck_speed_limit=0"
1271 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1272 mdd.${MDT_DEV}.lfsck_namespace |
1273 awk '/^status/ { print \\\$2 }'" "completed" || {
1275 error "(11) unexpected status"
1278 run_test 9b "LFSCK speed control (2)"
# test_10 "System is available during LFSCK scanning": populate the test
# directory under two failure stubs, start a throttled namespace LFSCK, and
# verify that ordinary client operations succeed while it is still in
# 'scanning-phase1'. Only runs when the MDS backend is ldiskfs.
1282 [[ $mds1_FSTYPE == ldiskfs ]] || skip "lookup(..)/linkea on ZFS issue"
1286 echo "Preparing more files with error at $(date)."
# Even-numbered d${i}/f${i} are created with fail_loc 0x1603 set.
1287 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1288 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1290 for ((i = 0; i < 1000; i = $((i+2)))); do
1291 mkdir -p $DIR/$tdir/d${i}
1292 touch $DIR/$tdir/f${i}
1293 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd-numbered d${i}/f${i} are created with fail_loc 0x1604 set.
1296 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1297 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1299 for ((i = 1; i < 1000; i = $((i+2)))); do
1300 mkdir -p $DIR/$tdir/d${i}
1301 touch $DIR/$tdir/f${i}
1302 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1305 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1306 echo "Prepared at $(date)."
1308 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1310 umount_client $MOUNT
1311 mount_client $MOUNT || error "(3) Fail to start client!"
1313 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1316 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1317 [ "$STATUS" == "scanning-phase1" ] ||
1318 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Exercise listing, create, mkdir, unlink, recursive remove, rename, hard link
# and symlink while the scan is running; each must succeed.
1320 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1322 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1324 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1326 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1328 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1330 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1332 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1334 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1335 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after all of the operations above.
1337 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1338 [ "$STATUS" == "scanning-phase1" ] ||
1339 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Lift the speed limit on all target nodes and wait for completion.
1341 do_nodes $(tgts_nodes) "$LCTL set_param -n *.*.lfsck_speed_limit=0"
1342 wait_update_facet $SINGLEMDS \
1343 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace |
1344 awk '/^status/ { print \\\$2 }'" "completed" || {
1346 error "(16) unexpected status"
1349 run_test 10 "System is available during LFSCK scanning"
# Remove the LAST_ID file (and d0/0) under O/${idx} on the backing filesystem
# of ost${ost}, by mounting that filesystem locally on the OST node.
# Arguments: NOTE(review): ${ost} and ${idx} are assigned on lines not visible
#            here; the caller in this file passes "1 0", so presumably
#            $1 = OST number and $2 = idx - confirm.
# Returns:   1 if the local mount fails, 2 if the unmount fails.
1352 ost_remove_lastid() {
1355 local rcmd="do_facet ost${ost}"
1357 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1359 # step 1: local mount
1360 mount_fstype ost${ost} || return 1
1361 # step 2: remove the specified LAST_ID
1362 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1364 unmount_fstype ost${ost} || return 2
# test_11a "LFSCK can rebuild lost last_id": remove LAST_ID on ost1, restart
# the OST, run the layout LFSCK on it, and check that the 'crashed_lastid'
# flag shows up during the scan and is gone once the scan has completed.
1368 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1369 skip "MDS older than 2.5.55, LU-1267"
1371 check_mount_and_prep
1372 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1373 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1378 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1380 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1381 error "(2) Fail to start ost1"
# fail_loc 0x160e with fail_val=3 is set while the flags are sampled below.
1383 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1384 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1386 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1387 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1389 wait_update_facet ost1 "$LCTL get_param -n \
1390 obdfilter.${OST_DEV}.lfsck_layout |
1391 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1393 error "(5) unexpected status"
# Clear the failure stub and wait for the scan to complete.
1396 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1398 wait_update_facet ost1 "$LCTL get_param -n \
1399 obdfilter.${OST_DEV}.lfsck_layout |
1400 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1402 error "(6) unexpected status"
1405 echo "the LAST_ID(s) should have been rebuilt"
1406 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1407 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1409 run_test 11a "LFSCK can rebuild lost last_id"
# test_11b "LFSCK can rebuild crashed last_id": create objects while the OST
# skips updating its on-disk LAST_ID, confirm the on-disk value lags behind
# the id recorded on the MDS, then run the layout LFSCK on ost1 and check that
# last_id is at least the recorded id after a restart.
1412 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1413 skip "MDS older than 2.5.55, LU-1267"
1415 check_mount_and_prep
1416 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1418 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1419 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1420 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 files more than the count reported by precreated_ost_obj_count.
1422 local count=$(precreated_ost_obj_count 0 0)
1424 createmany -o $DIR/$tdir/f $((count + 32))
# Record the sequence and last id the MDS-side osp device reports for OST0000.
1426 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1427 local seq=$(do_facet mds1 $LCTL get_param -n \
1428 osp.${proc_path}.prealloc_last_seq)
1429 local id_used=$(do_facet mds1 $LCTL get_param -n \
1430 osp.${proc_path}.prealloc_last_id)
1432 umount_client $MOUNT
1433 stop ost1 || error "(1) Fail to stop ost1"
# Restart ost1 with fail_loc 0x215 set.
1435 #define OBD_FAIL_OST_ENOSPC 0x215
1436 do_facet ost1 $LCTL set_param fail_loc=0x215
1438 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1439 error "(2) Fail to start ost1"
# Poll (up to 60 iterations) until last_id has an entry matching $seq; the
# value after the ':' is taken as the on-disk id.
1441 for ((i = 0; i < 60; i++)); do
1442 id_ost1=$(do_facet ost1 \
1443 "$LCTL get_param -n obdfilter.$ost1_svc.last_id" |
1444 awk -F: "/$seq/ { print \$2 }")
1445 [ -n "$id_ost1" ] && break
1449 echo "the on-disk LAST_ID should be smaller than the expected one"
1450 [ $id_used -gt $id_ost1 ] ||
1451 error "(4) expect id_used '$id_used' > id_ost1 '$id_ost1'"
1453 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1454 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1456 wait_update_facet ost1 \
1457 "$LCTL get_param -n obdfilter.$ost1_svc.lfsck_layout |
1458 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1460 error "(6) unexpected status"
# Restart ost1 before re-reading last_id.
1463 stop ost1 || error "(7) Fail to stop ost1"
1465 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1466 error "(8) Fail to start ost1"
1468 echo "the on-disk LAST_ID should have been rebuilt"
1469 # last_id may be larger than $id_used if objects were created/skipped
1470 wait_update_facet_cond ost1 \
1471 "$LCTL get_param -n obdfilter.$ost1_svc.last_id |
1472 awk -F: '/$seq/ { print \\\$2 }'" "-ge" "$id_used" 60 || {
1473 do_facet ost1 $LCTL get_param obdfilter.$ost1_svc.last_id
1474 error "(9) expect last_id >= id_used $seq:$id_used"
1477 do_facet ost1 $LCTL set_param fail_loc=0
1478 stopall || error "(10) Fail to stopall"
1480 run_test 11b "LFSCK can rebuild crashed last_id"
# test_12a "single command to trigger LFSCK on all devices": with -A given on
# mds1, start, stop and restart first the namespace and then the layout LFSCK,
# checking the status reached on all targets after each command.
1483 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1484 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1485 skip "MDS older than 2.5.55, LU-3950"
# From MDS version 2.15.90 on, use the two-word "lfsck start"/"lfsck stop"
# lctl commands instead of "lfsck_start"/"lfsck_stop".
1486 if (( $MDS1_VERSION >= $(version_code 2.15.90) )); then
1487 lfsck_start="lfsck start"
1488 lfsck_stop="lfsck stop"
1490 lfsck_start="lfsck_start"
1491 lfsck_stop="lfsck_stop"
1494 check_mount_and_prep
# One directory per MDT (index k - 1), each with 100 files.
1495 for k in $(seq $MDSCOUNT); do
1496 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1497 createmany -o $DIR/$tdir/${k}/f 100 ||
1498 error "(0) Fail to create 100 files."
1501 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1502 do_facet mds1 $LCTL $lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1503 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1505 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1506 wait_all_targets namespace scanning-phase1 3
1508 echo "Stop namespace LFSCK on all targets by single lctl command."
1509 do_facet mds1 $LCTL $lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1510 error "(4) Fail to stop LFSCK on all devices!"
1512 echo "All the LFSCK targets should be in 'stopped' status."
1513 wait_all_targets_blocked namespace stopped 5
1515 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1516 do_facet mds1 $LCTL $lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1517 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1519 echo "All the LFSCK targets should be in 'completed' status."
1520 wait_all_targets_blocked namespace completed 7
# The layout half of the test runs with full debug logging enabled.
1522 start_full_debug_logging
1524 echo "Start layout LFSCK on all targets by single command (-s 1)."
1525 do_facet mds1 $LCTL $lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1526 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1528 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1529 wait_all_targets layout scanning-phase1 9
1531 echo "Stop layout LFSCK on all targets by single lctl command."
1532 do_facet mds1 $LCTL $lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1533 error "(10) Fail to stop LFSCK on all devices!"
1535 echo "All the LFSCK targets should be in 'stopped' status."
1536 wait_all_targets_blocked layout stopped 11
# Additionally check the layout LFSCK status on every OST directly.
1538 for k in $(seq $OSTCOUNT); do
1539 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1540 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1541 awk '/^status/ { print $2 }')
1542 [ "$STATUS" == "stopped" ] ||
1543 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1546 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1547 do_facet mds1 $LCTL $lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1548 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1550 echo "All the LFSCK targets should be in 'completed' status."
1551 wait_all_targets_blocked layout completed 14
1553 stop_full_debug_logging
1555 run_test 12a "single command to trigger LFSCK on all devices"
# test_12b "auto detect Lustre device": lfsck_start without -M must work when
# -A is given, and must fail on a node with several MDT devices when neither
# -M nor -A is given.
1558 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1559 skip "MDS older than 2.5.55, LU-3950"
1561 check_mount_and_prep
1563 echo "Start LFSCK without '-M' specified."
1564 do_facet mds1 $LCTL lfsck_start -A -r ||
1565 error "(0) Fail to start LFSCK without '-M'"
1567 wait_all_targets_blocked namespace completed 1
1568 wait_all_targets_blocked layout completed 2
# Count the device-list entries on mds1 whose third column contains "mdt".
1570 local count=$(do_facet mds1 $LCTL dl |
1571 awk '{ print $3 }' | grep mdt | wc -l)
1572 if [ $count -gt 1 ]; then
1574 echo "Start layout LFSCK on the node with multipe targets,"
1575 echo "but not specify '-M'/'-A' option. Should get failure."
# Success of lfsck_start is the error case here; a failure is turned into a
# zero status by the trailing "|| true".
1577 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1578 error "(3) Start layout LFSCK should fail" || true
1581 run_test 12b "auto detect Lustre device"
# test_13 "LFSCK can repair crashed lmm_oi": create files while a failure stub
# is set on the MDS, run the layout LFSCK, and expect repaired_others == 2.
1584 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1585 skip "MDS older than 2.5.55, LU-3593"
1588 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1589 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1590 echo "MDT-object FID."
1593 check_mount_and_prep
1595 echo "Inject failure stub to simulate bad lmm_oi"
# Two files are created under the stub: one via createmany and the PFL file
# f1. NOTE(review): presumably these are the two repairs counted below.
1596 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1597 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1598 createmany -o $DIR/$tdir/f 1
1599 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1600 error "(0) Fail to create PFL $DIR/$tdir/f1"
1601 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1603 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1604 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1606 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1607 mdd.${MDT_DEV}.lfsck_layout |
1608 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1610 error "(2) unexpected status"
1613 local repaired=$($SHOW_LAYOUT |
1614 awk '/^repaired_others/ { print $2 }')
1615 [ $repaired -eq 2 ] ||
1616 error "(3) Fail to repair crashed lmm_oi: $repaired"
1618 run_test 13 "LFSCK can repair crashed lmm_oi"
# test_14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)":
# create files while ost1 has a failure stub set, run the layout LFSCK once
# without -c (dangling references are counted, guard files still fail stat)
# and once with -c (guard files become stat-able with size 0).
1621 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1622 skip "MDS older than 2.5.55, LU-3590"
1625 echo "The OST-object referenced by the MDT-object should be there;"
1626 echo "otherwise, the LFSCK should re-create the missing OST-object."
1627 echo "without '--delay-create-ostobj' option."
1630 check_mount_and_prep
1631 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1633 echo "Inject failure stub to simulate dangling referenced MDT-object"
1634 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1635 do_facet ost1 $LCTL set_param fail_loc=0x1610
1636 local count=$(precreated_ost_obj_count 0 0)
# Create 16 files more than the count reported above, then a guard file.
1638 createmany -o $DIR/$tdir/f $((count + 16)) ||
1639 error "(0.1) Fail to create $DIR/$tdir/fx"
1640 touch $DIR/$tdir/guard0
# Plus 16 composite-layout files and a second guard file.
1642 for ((i = 0; i < 16; i++)); do
1643 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1644 $DIR/$tdir/f_comp${i} ||
1645 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1647 touch $DIR/$tdir/guard1
1649 do_facet ost1 $LCTL set_param fail_loc=0
1651 start_full_debug_logging
1653 # exhaust other pre-created dangling cases
1654 count=$(precreated_ost_obj_count 0 0)
1655 createmany -o $DIR/$tdir/a $count ||
1656 error "(0.5) Fail to create $count files."
1658 echo "'ls' should fail because of dangling referenced MDT-object"
1659 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1661 echo "Trigger layout LFSCK to find out dangling reference"
1662 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1664 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1665 mdd.${MDT_DEV}.lfsck_layout |
1666 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1668 error "(3) unexpected status"
# At least 32 dangling references must be counted.
1671 local repaired=$($SHOW_LAYOUT |
1672 awk '/^repaired_dangling/ { print $2 }')
1673 [ $repaired -ge 32 ] ||
1674 error "(4) Fail to repair dangling reference: $repaired"
1676 echo "'stat' should fail because of not repair dangling by default"
1677 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1678 error "(5.1) stat should fail"
1679 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1680 error "(5.2) stat should fail"
1682 echo "Trigger layout LFSCK to repair dangling reference"
1683 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1685 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1686 mdd.${MDT_DEV}.lfsck_layout |
1687 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1689 error "(7) unexpected status"
1692 # Some asynchronous LFSCK updates may still be in progress; wait for
1693 # a while until the repair on the target has been done. LU-4970.
1695 echo "'stat' should success after layout LFSCK repairing"
1696 wait_update_facet client "stat $DIR/$tdir/guard0 |
1697 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1698 stat $DIR/$tdir/guard0
1700 error "(8.1) unexpected size"
1703 wait_update_facet client "stat $DIR/$tdir/guard1 |
1704 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1705 stat $DIR/$tdir/guard1
1707 error "(8.2) unexpected size"
1710 repaired=$($SHOW_LAYOUT |
1711 awk '/^repaired_dangling/ { print $2 }')
1712 [ $repaired -ge 32 ] ||
1713 error "(9) Fail to repair dangling reference: $repaired"
1715 stop_full_debug_logging
1717 echo "stopall to cleanup object cache"
1720 setupall > /dev/null
1722 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# test_14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)":
# same scenario as the first dangling-reference test, but the layout LFSCK is
# started with the additional -o and -d options and completion is awaited on
# all targets.
1725 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1726 skip "MDS older than 2.5.55, LU-3590"
1729 echo "The OST-object referenced by the MDT-object should be there;"
1730 echo "otherwise, the LFSCK should re-create the missing OST-object."
1731 echo "with '--delay-create-ostobj' option."
1734 check_mount_and_prep
1735 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1737 echo "Inject failure stub to simulate dangling referenced MDT-object"
1738 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1739 do_facet ost1 $LCTL set_param fail_loc=0x1610
1740 local count=$(precreated_ost_obj_count 0 0)
# Create 31 files more than the count reported above, then a guard file.
1742 createmany -o $DIR/$tdir/f $((count + 31))
1743 touch $DIR/$tdir/guard
1744 do_facet ost1 $LCTL set_param fail_loc=0
1746 start_full_debug_logging
1748 # exhaust other pre-created dangling cases
1749 count=$(precreated_ost_obj_count 0 0)
1750 createmany -o $DIR/$tdir/a $count ||
1751 error "(0) Fail to create $count files."
1753 echo "'ls' should fail because of dangling referenced MDT-object"
1754 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1756 echo "Trigger layout LFSCK to find out dangling reference"
1757 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1759 wait_all_targets_blocked layout completed 3
# At least 32 dangling references must be counted.
1761 local repaired=$($SHOW_LAYOUT |
1762 awk '/^repaired_dangling/ { print $2 }')
1763 [ $repaired -ge 32 ] ||
1764 error "(4) Fail to repair dangling reference: $repaired"
1766 echo "'stat' should fail because of not repair dangling by default"
1767 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
1769 echo "Trigger layout LFSCK to repair dangling reference"
1770 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1772 wait_all_targets_blocked layout completed 7
1774 # Some asynchronous LFSCK updates may still be in progress; wait for
1775 # a while until the repair on the target has been done. LU-4970.
1777 echo "'stat' should success after layout LFSCK repairing"
1778 wait_update_facet client "stat $DIR/$tdir/guard |
1779 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1780 stat $DIR/$tdir/guard
1782 error "(8) unexpected size"
1785 repaired=$($SHOW_LAYOUT |
1786 awk '/^repaired_dangling/ { print $2 }')
1787 [ $repaired -ge 32 ] ||
1788 error "(9) Fail to repair dangling reference: $repaired"
1790 stop_full_debug_logging
1792 echo "stopall to cleanup object cache"
1795 setupall > /dev/null
1797 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# test_15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)":
# write files while the OST nodes have a failure stub set, run the layout
# LFSCK, and expect repaired_unmatched_pair >= 3.
1800 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1801 skip "MDS older than 2.5.55, LU-3591"
1802 local osts=$(osts_nodes)
1805 echo "If the OST-object referenced by the MDT-object back points"
1806 echo "to some non-exist MDT-object, then the LFSCK should repair"
1807 echo "the OST-object to back point to the right MDT-object."
1810 check_mount_and_prep
1811 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1813 echo "Inject failure stub to make the OST-object to back point to"
1814 echo "non-exist MDT-object."
1815 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1817 do_nodes $osts "$LCTL set_param fail_loc=0x1611"
1818 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1819 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1821 error "(0) Fail to create PFL $DIR/$tdir/f1"
1822 # 'dd' first triggers a punch RPC on every OST-object.
1823 # So even if some OST-object is not written by 'dd',
1824 # as long as it is allocated (it may NOT be allocated in pfl_3b)
1825 # its layout information will be set as well.
1826 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1827 cancel_lru_locks osc
1828 do_nodes $osts "$LCTL set_param fail_loc=0"
1830 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1831 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1833 wait_update_facet $SINGLEMDS \
1834 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1835 awk '/^status/ { print \\\$2 }'" "completed" || {
1837 error "(2) unexpected status"
1840 local repaired=$($SHOW_LAYOUT |
1841 awk '/^repaired_unmatched_pair/ { print $2 }')
1842 [ $repaired -ge 3 ] ||
1843 error "(3) Fail to repair unmatched pair: $repaired"
1845 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# test_15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)":
# write files while the OST nodes have fail_loc 0x1612 set, run the layout
# LFSCK, and expect repaired_unmatched_pair == 4.
1848 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1849 skip "MDS older than 2.5.55, LU-3591"
1852 echo "If the OST-object referenced by the MDT-object back points"
1853 echo "to other MDT-object that doesn't recognize the OST-object,"
1854 echo "then the LFSCK should repair it to back point to the right"
1855 echo "MDT-object (the first one)."
1858 check_mount_and_prep
# The guard file is written before the failure stub is set.
1859 mkdir -p $DIR/$tdir/0
1860 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1861 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1862 cancel_lru_locks osc
1864 echo "Inject failure stub to make the OST-object to back point to"
1865 echo "other MDT-object"
# Use two stripes when at least two OSTs exist.
# NOTE(review): the initial value of 'stripes' is set on a line not visible
# here.
1868 [ $OSTCOUNT -ge 2 ] && stripes=2
1869 local osts=$(osts_nodes)
1871 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1872 do_nodes $osts "$LCTL set_param fail_loc=0x1612"
1873 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1874 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1876 error "(0) Fail to create PFL $DIR/$tdir/f1"
1877 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1878 cancel_lru_locks osc
1879 do_nodes $osts "$LCTL set_param fail_loc=0"
1881 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1882 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1884 wait_update_facet $SINGLEMDS \
1885 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1886 awk '/^status/ { print \\\$2 }'" "completed" || {
1888 error "(2) unexpected status"
1891 local repaired=$($SHOW_LAYOUT |
1892 awk '/^repaired_unmatched_pair/ { print $2 }')
1893 [ $repaired -eq 4 ] ||
1894 error "(3) Fail to repair unmatched pair: $repaired"
1896 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# test_15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)":
# race a layout LFSCK against a delayed directory migration and expect exactly
# one unmatched pair repaired and no multiple-reference repairs.
# Runs only for MDS versions strictly between 2.5.55 and 2.7.55 with at least
# two MDTs.
1899 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1900 (( $MDS1_VERSION < $(version_code 2.7.55) )) ||
1901 skip "MDS newer than 2.7.55, LU-6475"
1902 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1903 skip "MDS older than 2.5.55, LU-3591"
1906 echo "According to current metadata migration implementation,"
1907 echo "before the old MDT-object is removed, both the new MDT-object"
1908 echo "and old MDT-object will reference the same LOV layout. Then if"
1909 echo "the layout LFSCK finds the new MDT-object by race, it will"
1910 echo "regard related OST-object(s) as multiple referenced case, and"
1911 echo "will try to create new OST-object(s) for the new MDT-object."
1912 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1913 echo "MDT-object before confirm the multiple referenced case."
1916 check_mount_and_prep
1917 $LFS mkdir -i 1 $DIR/$tdir/a1
1918 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1919 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1920 cancel_lru_locks osc
1922 echo "Inject failure stub on MDT1 to delay the migration"
1924 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1925 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1926 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
# The migration runs in the background so the LFSCK can start alongside it.
1927 $LFS migrate -m 0 $DIR/$tdir/a1 &
1930 echo "Trigger layout LFSCK to race with the migration"
1931 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1933 wait_all_targets_blocked layout completed 2
1935 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1936 local repaired=$($SHOW_LAYOUT |
1937 awk '/^repaired_unmatched_pair/ { print $2 }')
1938 [ $repaired -eq 1 ] ||
1939 error "(3) Fail to repair unmatched pair: $repaired"
1941 repaired=$($SHOW_LAYOUT |
1942 awk '/^repaired_multiple_referenced/ { print $2 }')
1943 [ $repaired -eq 0 ] ||
1944 error "(4) Unexpectedly repaird multiple references: $repaired"
1946 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# test_15d "LFSCK don't crash upon dir migration failure": start a directory
# migration in the background, inject fail_loc 0x1709 on the MDS facets while
# it runs, then run a namespace LFSCK on all targets and retry the migration.
# Neither the LFSCK nor the retry is checked for success beyond LFSCK start
# and completion; the filesystem is reformatted at the end.
1949 (( $MDSCOUNT > 1 )) || skip "needs >= 2 MDTs"
1951 check_mount_and_prep
1953 $LFS mkdir -c -1 $DIR/$tdir || error "create $tdir failed"
1954 $LFS setdirstripe -D -i -1 -c 1 $DIR/$tdir ||
1955 error "setdirstripe failed"
1957 createmany -o $DIR/$tdir/f 100 || error "create sub files failed"
1958 createmany -d $DIR/$tdir/s 100 || error "create sub dirs failed"
1960 echo "Migrate $DIR/$tdir to MDT1"
1961 $LFS migrate -m 1 $DIR/$tdir &
1965 # fail sub transactions on random MDTs, which may cause some file
1967 #define OBD_FAIL_OUT_EIO 0x1709
# MDS facet names are 1-based (mds1 .. mds$MDSCOUNT), so iterate from 1
# through MDSCOUNT; starting at 0 would address a non-existent "mds0" facet
# and never reach the last MDS.
1968 for ((i = 1; i <= $MDSCOUNT; i++)); do
1969 do_facet mds$i $LCTL set_param fail_loc=0x1709
1971 do_facet mds$i $LCTL set_param fail_loc=0
1976 # LFSCK can't fully fix migrating directories, and may leave some
1977 # files inaccessible, but it shouldn't cause crash
1978 $START_NAMESPACE -A -r ||
1979 error "Fail to start LFSCK for namespace"
1981 wait_all_targets_blocked namespace completed 1
1983 # resume migration may fail because some file may be inaccessible, but
1984 # it shouldn't cause crash
1985 $LFS migrate -m 1 $DIR/$tdir
1987 # rm $tdir to avoid cleanup failure in the end
1989 $LFS rm_entry $DIR/$tdir/*
1991 REFORMAT="yes" cleanup_and_setup_lustre
1993 run_test 15d "LFSCK don't crash upon dir migration failure"
# test_16 "LFSCK can repair inconsistent MDT-object/OST-object owner": change
# the owner of f0 while a failure stub on the MDS is set, run the layout
# LFSCK, and expect repaired_inconsistent_owner == 1.
1996 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1997 skip "MDS older than 2.5.55, LU-3594"
2000 echo "If the OST-object's owner information does not match the owner"
2001 echo "information stored in the MDT-object, then the LFSCK trust the"
2002 echo "MDT-object and update the OST-object's owner information."
2005 check_mount_and_prep
2006 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2007 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
2008 cancel_lru_locks osc
2010 # created but no setattr or write to the file.
2012 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
2013 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
2015 echo "Inject failure stub to skip OST-object owner changing"
2016 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
2017 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
# Use the standard ':' owner/group separator (as the chown above does); the
# legacy '.' form is deprecated and newer chown versions warn about it.
2018 chown 1:1 $DIR/$tdir/f0
2019 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2021 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
2024 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
2026 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2027 mdd.${MDT_DEV}.lfsck_layout |
2028 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2030 error "(2) unexpected status"
# Only f0 had its owner changed under the stub, so exactly one repair is
# expected.
2033 local repaired=$($SHOW_LAYOUT |
2034 awk '/^repaired_inconsistent_owner/ { print $2 }')
2035 [ $repaired -eq 1 ] ||
2036 error "(3) Fail to repair inconsistent owner: $repaired"
2038 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# test_17 "LFSCK can repair multiple references": create f0 and f1 so that
# they share OST-objects with 'guard', run the layout LFSCK, expect
# repaired_multiple_referenced == 2, then verify that writing f0/f1 no longer
# changes the size of 'guard'.
2041 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2042 skip "MDS older than 2.5.55, LU-3594"
2045 echo "If more than one MDT-objects reference the same OST-object,"
2046 echo "and the OST-object only recognizes one MDT-object, then the"
2047 echo "LFSCK should create new OST-objects for such non-recognized"
2051 check_mount_and_prep
2052 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2054 echo "Inject failure stub to make two MDT-objects to refernce"
2055 echo "the OST-object"
2057 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
2058 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# Client lock caches (mdc and osc) are dropped after each creation step.
2059 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
2060 cancel_lru_locks mdc
2061 cancel_lru_locks osc
2063 createmany -o $DIR/$tdir/f 1
2064 cancel_lru_locks mdc
2065 cancel_lru_locks osc
2067 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
2069 error "(0) Fail to create PFL $DIR/$tdir/f1"
2070 cancel_lru_locks mdc
2071 cancel_lru_locks osc
2072 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
2074 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
2075 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
# Before repair, f0 and f1 report the 1 MiB size that was written to 'guard'.
2076 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
2077 [ $size -eq 1048576 ] ||
2078 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
2080 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
2081 [ $size -eq 1048576 ] ||
2082 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
2084 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
2087 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
2089 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2090 mdd.${MDT_DEV}.lfsck_layout |
2091 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2093 error "(3) unexpected status"
2096 local repaired=$($SHOW_LAYOUT |
2097 awk '/^repaired_multiple_referenced/ { print $2 }')
2098 [ $repaired -eq 2 ] ||
2099 error "(4) Fail to repair multiple references: $repaired"
# After repair, writing 2 MiB to f0 or f1 must leave 'guard' at 1 MiB.
2101 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
2102 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
2103 error "(5) Fail to write f0."
2104 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2105 [ $size -eq 1048576 ] ||
2106 error "(6) guard size should be 1048576, but got $size"
2108 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
2109 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
2110 error "(7) Fail to write f1."
2111 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2112 [ $size -eq 1048576 ] ||
2113 error "(8) guard size should be 1048576, but got $size"
2115 run_test 17 "LFSCK can repair multiple references"
# Top-level statement: add "cache" to the local debug setting via $LCTL;
# the command's stdout is discarded.
2117 $LCTL set_param debug=+cache > /dev/null
# test_18a body (registered below as test 18a): files lose their layout EA,
# and the layout LFSCK must regenerate it from the orphan OST-objects.
# NOTE(review): the function header and closing lines (}, fi, done) are not
# visible in this listing -- confirm against the full script.
# Skip unless MDS1_VERSION is newer than 2.5.55 (LU-3336).
2120 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2121 skip "MDS older than 2.5.55, LU-3336"
2124 echo "The target MDT-object is there, but related stripe information"
2125 echo "is lost or partly lost. The LFSCK should regenerate the missing"
2126 echo "layout EA entries."
2129 check_mount_and_prep
# a1/f1: directory created with "-i 0", one stripe at OST index 0, 2 MiB.
2130 $LFS mkdir -i 0 $DIR/$tdir/a1
2131 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2132 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# "ls -il" prepends the inode number, so the file size is field 6.
2134 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
# Print FID and layout for the test log.
2136 $LFS path2fid $DIR/$tdir/a1/f1
2137 $LFS getstripe $DIR/$tdir/a1/f1
# With at least two MDTs, also create a2/f2: directory created with "-i 1",
# two stripes of 1M starting at OST index 1, 2 MiB of data.
2139 if [ $MDSCOUNT -ge 2 ]; then
2140 $LFS mkdir -i 1 $DIR/$tdir/a2
2141 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2142 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2143 $LFS path2fid $DIR/$tdir/a2/f2
2144 $LFS getstripe $DIR/$tdir/a2/f2
# f3: PFL file with two components ([0, 1M) and [1M, EOF)), 2 MiB of data.
2147 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2148 error "(0) Fail to create PFL $DIR/$tdir/f3"
2150 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2152 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2154 $LFS path2fid $DIR/$tdir/f3
2155 $LFS getstripe $DIR/$tdir/f3
2157 cancel_lru_locks osc
2159 echo "Inject failure, to make the MDT-object lost its layout EA"
# With fail_loc 0x1615 set on the MDS(s), chown each file; per the echo above
# this is what makes the MDT-object lose its layout EA.
2160 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2161 do_facet mds1 $LCTL set_param fail_loc=0x1615
2162 chown 1.1 $DIR/$tdir/a1/f1
2164 if [ $MDSCOUNT -ge 2 ]; then
2165 do_facet mds2 $LCTL set_param fail_loc=0x1615
2166 chown 1.1 $DIR/$tdir/a2/f2
2169 chown 1.1 $DIR/$tdir/f3
# Clear the injected failure on every MDS that had it set.
2174 do_facet mds1 $LCTL set_param fail_loc=0
2175 if [ $MDSCOUNT -ge 2 ]; then
2176 do_facet mds2 $LCTL set_param fail_loc=0
2179 cancel_lru_locks mdc
2180 cancel_lru_locks osc
2182 echo "The file size should be incorrect since layout EA is lost"
# Each damaged file must now report a size different from the saved one.
2183 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2184 [ "$cur_size" != "$saved_size1" ] ||
2185 error "(1) Expect incorrect file1 size"
# f2 is compared against saved_size1: f1 and f2 were both written with
# bs=1M count=2.
2187 if [ $MDSCOUNT -ge 2 ]; then
2188 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2189 [ "$cur_size" != "$saved_size1" ] ||
2190 error "(2) Expect incorrect file2 size"
2193 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2194 [ "$cur_size" != "$saved_size2" ] ||
2195 error "(1.2) Expect incorrect file3 size"
2197 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2198 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
# Wait for every MDT to report layout LFSCK status "completed".
2200 for k in $(seq $MDSCOUNT); do
2201 # The LFSCK status query interval is 30 seconds. For the case
2202 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2203 # time to guarantee the status sync up.
2204 wait_update_facet mds${k} "$LCTL get_param -n \
2205 mdd.$(facet_svc mds${k}).lfsck_layout |
2206 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2207 error "(4) MDS${k} is not the expected 'completed'"
# Each OST is queried once (no waiting) and must already be "completed".
2210 for k in $(seq $OSTCOUNT); do
2211 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2212 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2213 awk '/^status/ { print $2 }')
2214 [ "$cur_status" == "completed" ] ||
2215 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report exactly 3 repaired orphans.
2218 local repaired=$(do_facet mds1 $LCTL get_param -n \
2219 mdd.$(facet_svc mds1).lfsck_layout |
2220 awk '/^repaired_orphan/ { print $2 }')
2221 [ $repaired -eq 3 ] ||
2222 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
# mds2 (when present) must report exactly 2 repaired orphans.
2224 if [ $MDSCOUNT -ge 2 ]; then
2225 repaired=$(do_facet mds2 $LCTL get_param -n \
2226 mdd.$(facet_svc mds2).lfsck_layout |
2227 awk '/^repaired_orphan/ { print $2 }')
2228 [ $repaired -eq 2 ] ||
2229 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print FIDs and layouts again after the repair.
2232 $LFS path2fid $DIR/$tdir/a1/f1
2233 $LFS getstripe $DIR/$tdir/a1/f1
2235 if [ $MDSCOUNT -ge 2 ]; then
2236 $LFS path2fid $DIR/$tdir/a2/f2
2237 $LFS getstripe $DIR/$tdir/a2/f2
2240 $LFS path2fid $DIR/$tdir/f3
2241 $LFS getstripe $DIR/$tdir/f3
2243 echo "The file size should be correct after layout LFSCK scanning"
# Sizes must be back to the values saved before the failure injection.
2244 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2245 [ "$cur_size" == "$saved_size1" ] ||
2246 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2248 if [ $MDSCOUNT -ge 2 ]; then
2249 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2250 [ "$cur_size" == "$saved_size1" ] ||
2251 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# NOTE(review): the error text below says "file1" but this check is on f3.
2254 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2255 [ "$cur_size" == "$saved_size2" ] ||
2256 error "(9) Expect file1 size $saved_size2, but got $cur_size"
# Register the test case with the framework.
2258 run_test 18a "Find out orphan OST-object and repair it (1)"
# test_18b body (registered below as test 18b): the MDT-objects are lost; the
# layout LFSCK must re-create them under .lustre/lost+found/MDTxxxx, and the
# files can then be moved back into the namespace.
# NOTE(review): the function header and closing lines (}, fi, done) are not
# visible in this listing -- confirm against the full script.
# Skipped when FILESET is set, or when MDS1_VERSION is not newer than 2.5.55.
2261 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2262 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2263 skip "MDS older than 2.5.55, LU-3336"
2266 echo "The target MDT-object is lost. The LFSCK should re-create the"
2267 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2268 echo "can move it back to normal namespace manually."
2271 check_mount_and_prep
# a1/f1: directory created with "-i 0", one stripe at OST index 0, 2 MiB.
# Its size and FID are saved for the checks and the "mv" further down.
2272 $LFS mkdir -i 0 $DIR/$tdir/a1
2273 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2274 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# "ls -il" prepends the inode number, so the file size is field 6.
2275 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2276 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2278 $LFS getstripe $DIR/$tdir/a1/f1
# With at least two MDTs, also create a2/f2 (directory created with "-i 1").
# NOTE(review): fid2 is assigned without "local" in the visible lines.
2280 if [ $MDSCOUNT -ge 2 ]; then
2281 $LFS mkdir -i 1 $DIR/$tdir/a2
2282 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2283 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2284 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2286 $LFS getstripe $DIR/$tdir/a2/f2
# f3: PFL file with two components, 2 MiB of data.
2289 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2290 error "(0) Fail to create PFL $DIR/$tdir/f3"
2292 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2294 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2295 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2297 $LFS getstripe $DIR/$tdir/f3
2299 cancel_lru_locks osc
2301 echo "Inject failure, to simulate the case of missing the MDT-object"
# With fail_loc 0x1616 set on the MDS(s), remove the files; per the echo above
# this simulates a missing MDT-object.
# NOTE(review): no removal of f3 is visible here although f3 is moved back
# from lost+found below -- the listing appears to omit lines.
2302 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2303 do_facet mds1 $LCTL set_param fail_loc=0x1616
2304 rm -f $DIR/$tdir/a1/f1
2306 if [ $MDSCOUNT -ge 2 ]; then
2307 do_facet mds2 $LCTL set_param fail_loc=0x1616
2308 rm -f $DIR/$tdir/a2/f2
# Clear the injected failure on every MDS that had it set.
2316 do_facet mds1 $LCTL set_param fail_loc=0
2317 if [ $MDSCOUNT -ge 2 ]; then
2318 do_facet mds2 $LCTL set_param fail_loc=0
2321 cancel_lru_locks mdc
2322 cancel_lru_locks osc
2324 # dryrun mode only checks orphans, it does not repair them
2325 echo "Trigger layout LFSCK --dryrun to find out orphan OST-object"
2326 $START_LAYOUT --dryrun -o -r ||
2327 error "Fail to start layout LFSCK in dryrun mode"
2328 wait_all_targets_blocked layout completed 2
# The "param" line of lfsck_layout must list exactly these options.
2330 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
2331 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
2332 error "Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# The dry run must count 3 inconsistent orphans on mds1.
2334 local orphans=$(do_facet mds1 $LCTL get_param -n \
2335 mdd.$(facet_svc mds1).lfsck_layout |
2336 awk '/^inconsistent_orphan/ { print $2 }')
2337 [ $orphans -eq 3 ] ||
2338 error "Expect 3 found on mds1, but got: $orphans"
2340 # orphan parents should not be created
# Every entry under lost+found must be empty ("ls -A" prints nothing).
2342 for subdir in $MOUNT/.lustre/lost+found/*; do
2343 [ ! "$(ls -A $subdir)" ] || error "$subdir not empty"
2346 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
# Real (non-dryrun) run.
2347 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait for every MDT to report layout LFSCK status "completed".
2349 for k in $(seq $MDSCOUNT); do
2350 # The LFSCK status query interval is 30 seconds. For the case
2351 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2352 # time to guarantee the status sync up.
2353 wait_update_facet mds${k} "$LCTL get_param -n \
2354 mdd.$(facet_svc mds${k}).lfsck_layout |
2355 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2356 error "(2) MDS${k} is not the expected 'completed'"
# Each OST is queried once (no waiting) and must already be "completed".
2359 for k in $(seq $OSTCOUNT); do
2360 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2361 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2362 awk '/^status/ { print $2 }')
2363 [ "$cur_status" == "completed" ] ||
2364 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report exactly 3 repaired orphans.
2367 local repaired=$(do_facet mds1 $LCTL get_param -n \
2368 mdd.$(facet_svc mds1).lfsck_layout |
2369 awk '/^repaired_orphan/ { print $2 }')
2370 [ $repaired -eq 3 ] ||
2371 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
# mds2 (when present) must report exactly 2 repaired orphans.
2373 if [ $MDSCOUNT -ge 2 ]; then
2374 repaired=$(do_facet mds2 $LCTL get_param -n \
2375 mdd.$(facet_svc mds2).lfsck_layout |
2376 awk '/^repaired_orphan/ { print $2 }')
2377 [ $repaired -eq 2 ] ||
2378 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
2381 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
# The re-created objects are expected under the name "<saved FID>-R-0".
2382 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2383 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2385 if [ $MDSCOUNT -ge 2 ]; then
2386 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2387 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# NOTE(review): the error tag "(5)" below duplicates the one used for fid1.
2390 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2391 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
# Print FIDs and layouts of the restored files.
2393 $LFS path2fid $DIR/$tdir/a1/f1
2394 $LFS getstripe $DIR/$tdir/a1/f1
2396 if [ $MDSCOUNT -ge 2 ]; then
2397 $LFS path2fid $DIR/$tdir/a2/f2
2398 $LFS getstripe $DIR/$tdir/a2/f2
2401 $LFS path2fid $DIR/$tdir/f3
2402 $LFS getstripe $DIR/$tdir/f3
2404 echo "The file size should be correct after layout LFSCK scanning"
# Sizes must match the values saved before the files were removed.
2405 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2406 [ "$cur_size" == "$saved_size1" ] ||
2407 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2409 if [ $MDSCOUNT -ge 2 ]; then
2410 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2411 [ "$cur_size" == "$saved_size1" ] ||
2412 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# NOTE(review): the error text below says "file1" but this check is on f3.
2415 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2416 [ "$cur_size" == "$saved_size2" ] ||
2417 error "(9) Expect file1 size $saved_size2, but got $cur_size"
# Register the test case with the framework.
2419 run_test 18b "Find out orphan OST-object and repair it (2)"
# test_18c body (registered below as test 18c): the MDT-object is lost and the
# OST-object carries no parent FID; the layout LFSCK must re-create the
# MDT-object with a new FID under .lustre/lost+found/MDTxxxx.
# NOTE(review): the function header and closing lines (}, fi, done) are not
# visible in this listing -- confirm against the full script.
# Skipped when FILESET is set, or when MDS1_VERSION is not newer than 2.5.55.
2422 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2423 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2424 skip "MDS older than 2.5.55, LU-3336"
2427 echo "The target MDT-object is lost, and the OST-object FID is missing."
2428 echo "The LFSCK should re-create the MDT-object with new FID under the "
2429 echo "directory .lustre/lost+found/MDTxxxx."
2432 check_mount_and_prep
2433 $LFS mkdir -i 0 $DIR/$tdir/a1
2434 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2435 local osts=$(osts_nodes)
2437 echo "Inject failure, to simulate the case of missing parent FID"
# fail_loc 0x1617 is set on all OST nodes while the files below are written.
2438 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2439 do_nodes $osts "$LCTL set_param fail_loc=0x1617"
2441 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2442 $LFS getstripe $DIR/$tdir/a1/f1
# With at least two MDTs, also create a2/f2. The directory is created with
# "-i 1", but its single stripe still starts at OST index 0.
2444 if [ $MDSCOUNT -ge 2 ]; then
2445 $LFS mkdir -i 1 $DIR/$tdir/a2
2446 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2447 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2448 $LFS getstripe $DIR/$tdir/a2/f2
# f3: PFL file with two components, 2 MiB of data.
2451 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2452 error "(0) Fail to create PFL $DIR/$tdir/f3"
2454 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2455 $LFS getstripe $DIR/$tdir/f3
2457 cancel_lru_locks osc
# Clear the OST-side failure before injecting the MDS-side one.
2458 do_nodes $osts "$LCTL set_param fail_loc=0"
2460 echo "Inject failure, to simulate the case of missing the MDT-object"
# With fail_loc 0x1616 set on the MDS(s), remove the files.
2461 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2462 do_facet mds1 $LCTL set_param fail_loc=0x1616
2463 rm -f $DIR/$tdir/a1/f1
2465 if [ $MDSCOUNT -ge 2 ]; then
2466 do_facet mds2 $LCTL set_param fail_loc=0x1616
2467 rm -f $DIR/$tdir/a2/f2
# Clear the injected failure on every MDS that had it set.
2475 do_facet mds1 $LCTL set_param fail_loc=0
2476 if [ $MDSCOUNT -ge 2 ]; then
2477 do_facet mds2 $LCTL set_param fail_loc=0
2480 cancel_lru_locks mdc
2481 cancel_lru_locks osc
2483 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2484 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait for every MDT to report layout LFSCK status "completed".
2486 for k in $(seq $MDSCOUNT); do
2487 # The LFSCK status query interval is 30 seconds. For the case
2488 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2489 # time to guarantee the status sync up.
2490 wait_update_facet mds${k} "$LCTL get_param -n \
2491 mdd.$(facet_svc mds${k}).lfsck_layout |
2492 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2493 error "(2) MDS${k} is not the expected 'completed'"
# Each OST is queried once (no waiting) and must already be "completed".
2496 for k in $(seq $OSTCOUNT); do
2497 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2498 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2499 awk '/^status/ { print $2 }')
2500 [ "$cur_status" == "completed" ] ||
2501 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): "expected" is compared below but no assignment to it is
# visible in this listing; presumably it is set inside this "if" depending on
# MDSCOUNT -- confirm against the full script.
2504 if [ $MDSCOUNT -ge 2 ]; then
2510 local repaired=$(do_facet mds1 $LCTL get_param -n \
2511 mdd.$(facet_svc mds1).lfsck_layout |
2512 awk '/^repaired_orphan/ { print $2 }')
2513 [ $repaired -eq $expected ] ||
2514 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# mds2 (when present) must report no repaired orphans.
2516 if [ $MDSCOUNT -ge 2 ]; then
2517 repaired=$(do_facet mds2 $LCTL get_param -n \
2518 mdd.$(facet_svc mds2).lfsck_layout |
2519 awk '/^repaired_orphan/ { print $2 }')
2520 [ $repaired -eq 0 ] ||
2521 error "(5) Expect 0 fixed on mds2, but got: $repaired"
# List lost+found for the test log.
2524 ls -ail $MOUNT/.lustre/lost+found/
2526 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
# NOTE(review): the find pattern *-N-* is unquoted, so the shell may expand it
# against the current directory before find sees it; cname is also assigned
# without "local". No guard for the error "(6)" call is visible in this
# listing -- presumably a test on $cname precedes it; confirm.
2527 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2528 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name *-N-*)
2530 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2533 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2534 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2535 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# At least one entry matching *-N-* must exist under MDT0000.
2537 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-N-*)
2538 [ ! -z "$cname" ] ||
2539 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
# Register the test case with the framework.
2541 run_test 18c "Find out orphan OST-object and repair it (3)"
# test_18d body (registered below as test 18d): the layout EA of f2/f4 is
# changed to point at another file's OST-object, which is then removed; the
# original OST-objects survive as orphans and the layout LFSCK must reattach
# them instead of creating new OST-objects.
# NOTE(review): the function header and closing lines (}, done) are not
# visible in this listing -- confirm against the full script.
# Skip unless MDS1_VERSION is newer than 2.5.55 (LU-3336).
2544 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2545 skip "MDS older than 2.5.55, LU-3336"
2548 echo "The target MDT-object layout EA is corrupted, but the right"
2549 echo "OST-object is still alive as orphan. The layout LFSCK will"
2550 echo "not create new OST-object to occupy such slot."
2553 check_mount_and_prep
# NOTE(review): no command creating $DIR/$tdir/a1 is visible before this
# setstripe -- the listing appears to omit lines.
2555 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
# f1/f3 are the "guard" files; f2 (plain) and f4 (PFL) hold the data "foo".
2556 echo "guard" > $DIR/$tdir/a1/f1
2557 echo "foo" > $DIR/$tdir/a1/f2
2559 echo "guard" > $DIR/$tdir/a1/f3
2560 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2561 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2562 echo "foo" > $DIR/$tdir/a1/f4
# "ls -il" prepends the inode number, so the file size is field 6.
2564 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2565 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Print FIDs and layouts of all four files for the test log.
2566 $LFS path2fid $DIR/$tdir/a1/f1
2567 $LFS getstripe $DIR/$tdir/a1/f1
2568 $LFS path2fid $DIR/$tdir/a1/f2
2569 $LFS getstripe $DIR/$tdir/a1/f2
2570 $LFS path2fid $DIR/$tdir/a1/f3
2571 $LFS getstripe $DIR/$tdir/a1/f3
2572 $LFS path2fid $DIR/$tdir/a1/f4
2573 $LFS getstripe $DIR/$tdir/a1/f4
2574 cancel_lru_locks osc
2576 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2577 echo "to reference the same OST-object (which is f1's OST-object)."
2578 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2579 echo "dangling reference case, but f2's old OST-object is there."
2581 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2582 echo "to reference the same OST-object (which is f3's OST-object)."
2583 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2584 echo "dangling reference case, but f4's old OST-object is there."
# With fail_loc 0x1618 set on the MDS, chown f2/f4 and remove f1/f3; the
# echoes above describe the intended effect.
2587 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2588 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2589 chown 1.1 $DIR/$tdir/a1/f2
2590 chown 1.1 $DIR/$tdir/a1/f4
2591 rm -f $DIR/$tdir/a1/f1
2592 rm -f $DIR/$tdir/a1/f3
2595 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): the echo announces "stopall" but only setupall is visible in
# this listing -- confirm against the full script.
2597 echo "stopall to cleanup object cache"
2600 setupall > /dev/null
2602 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2603 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
# Wait for every MDT to report layout LFSCK status "completed".
2605 for k in $(seq $MDSCOUNT); do
2606 # The LFSCK status query interval is 30 seconds. For the case
2607 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2608 # time to guarantee the status sync up.
2609 wait_update_facet mds${k} "$LCTL get_param -n \
2610 mdd.$(facet_svc mds${k}).lfsck_layout |
2611 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2612 error "(3) MDS${k} is not the expected 'completed'"
# Each OST is queried once (no waiting) and must already be "completed".
2615 for k in $(seq $OSTCOUNT); do
2616 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2617 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2618 awk '/^status/ { print $2 }')
2619 [ "$cur_status" == "completed" ] ||
2620 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Exactly two orphans (for f2 and f4) must have been repaired ...
2623 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2624 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2625 awk '/^repaired_orphan/ { print $2 }')
2626 [ $repaired -eq 2 ] ||
2627 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
# ... and no dangling reference may have been counted as repaired.
2629 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2630 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2631 awk '/^repaired_dangling/ { print $2 }')
2632 [ $repaired -eq 0 ] ||
2633 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
2635 echo "The file size should be correct after layout LFSCK scanning"
# f2/f4 must report the sizes saved before the corruption.
2636 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2637 [ "$cur_size" == "$saved_size1" ] ||
2638 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2640 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2641 [ "$cur_size" == "$saved_size2" ] ||
2642 error "(8) Expect file4 size $saved_size2, but got $cur_size"
2644 echo "The LFSCK should find back the original data."
# Print content, FID and layout of f2/f4; the content is not asserted here.
2645 cat $DIR/$tdir/a1/f2
2646 $LFS path2fid $DIR/$tdir/a1/f2
2647 $LFS getstripe $DIR/$tdir/a1/f2
2648 cat $DIR/$tdir/a1/f4
2649 $LFS path2fid $DIR/$tdir/a1/f4
2650 $LFS getstripe $DIR/$tdir/a1/f4
# Register the test case with the framework.
2652 run_test 18d "Find out orphan OST-object and repair it (4)"
# test_18e body (registered below as test 18e): like 18d, but f2/f4 are
# modified while the LFSCK is in its second scanning phase, so the old orphan
# OST-objects must end up as separate stub files (*-C-*) under lost+found
# while f2/f4 keep the new data.
# NOTE(review): the function header and closing lines (}, fi, done) are not
# visible in this listing -- confirm against the full script.
# Skipped when FILESET is set, or when MDS1_VERSION is not newer than 2.5.55.
2655 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2656 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2657 skip "MDS older than 2.5.55, LU-3336"
2660 echo "The target MDT-object layout EA slot is occpuied by some new"
2661 echo "created OST-object when repair dangling reference case. Such"
2662 echo "conflict OST-object has been modified by others. To keep the"
2663 echo "new data, the LFSCK will create a new file to refernece this"
2664 echo "old orphan OST-object."
2667 check_mount_and_prep
# NOTE(review): no command creating $DIR/$tdir/a1 is visible before this
# setstripe -- the listing appears to omit lines.
2669 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
# f1/f3 are the "guard" files; f2 (plain) and f4 (PFL) hold the data "foo".
2670 echo "guard" > $DIR/$tdir/a1/f1
2671 echo "foo" > $DIR/$tdir/a1/f2
2673 echo "guard" > $DIR/$tdir/a1/f3
2674 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2675 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2676 echo "foo" > $DIR/$tdir/a1/f4
# "ls -il" prepends the inode number, so the file size is field 6.
2678 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2679 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Print FIDs and layouts of all four files for the test log.
2681 $LFS path2fid $DIR/$tdir/a1/f1
2682 $LFS getstripe $DIR/$tdir/a1/f1
2683 $LFS path2fid $DIR/$tdir/a1/f2
2684 $LFS getstripe $DIR/$tdir/a1/f2
2685 $LFS path2fid $DIR/$tdir/a1/f3
2686 $LFS getstripe $DIR/$tdir/a1/f3
2687 $LFS path2fid $DIR/$tdir/a1/f4
2688 $LFS getstripe $DIR/$tdir/a1/f4
2689 cancel_lru_locks osc
2691 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2692 echo "to reference the same OST-object (which is f1's OST-object)."
2693 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2694 echo "dangling reference case, but f2's old OST-object is there."
2696 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2697 echo "to reference the same OST-object (which is f3's OST-object)."
2698 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2699 echo "dangling reference case, but f4's old OST-object is there."
# With fail_loc 0x1618 set on the MDS, chown f2/f4 and remove f1/f3; the
# echoes above describe the intended effect.
2702 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2703 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2704 chown 1.1 $DIR/$tdir/a1/f2
2705 chown 1.1 $DIR/$tdir/a1/f4
2706 rm -f $DIR/$tdir/a1/f1
2707 rm -f $DIR/$tdir/a1/f3
2710 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): the echo announces "stopall" but only setupall is visible in
# this listing -- confirm against the full script.
2712 echo "stopall to cleanup object cache"
2715 setupall > /dev/null
# Set fail_loc 0x1602 with fail_val=10 before starting the LFSCK, so the test
# can act while the scan is still in "scanning-phase2" (waited for below).
2719 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2720 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2722 start_full_debug_logging
2724 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2725 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2727 wait_update_facet mds1 "$LCTL get_param -n \
2728 mdd.$(facet_svc mds1).lfsck_layout |
2729 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2730 error "(3) MDS1 is not the expected 'scanning-phase2'"
# NOTE(review): the command(s) that perform the sync mentioned below are not
# visible in this listing.
2732 # to guarantee all updates are synced.
2736 echo "Write new data to f2/f4 to modify the new created OST-object."
2737 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2738 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
# Clear the injected delay so the LFSCK can run to completion.
2740 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Wait for every MDT to report layout LFSCK status "completed".
2742 for k in $(seq $MDSCOUNT); do
2743 # The LFSCK status query interval is 30 seconds. For the case
2744 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2745 # time to guarantee the status sync up.
2746 wait_update_facet mds${k} "$LCTL get_param -n \
2747 mdd.$(facet_svc mds${k}).lfsck_layout |
2748 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2749 error "(4) MDS${k} is not the expected 'completed'"
# Each OST is queried once (no waiting) and must already be "completed".
2752 for k in $(seq $OSTCOUNT); do
2753 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2754 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2755 awk '/^status/ { print $2 }')
2756 [ "$cur_status" == "completed" ] ||
2757 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2760 stop_full_debug_logging
# Exactly two orphans must have been repaired.
2762 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2763 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2764 awk '/^repaired_orphan/ { print $2 }')
2765 [ $repaired -eq 2 ] ||
2766 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2768 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2769 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2770 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Count the entries matching *-C-* by counting "ls -l" output lines; exactly
# two are expected (on mismatch they are listed before failing).
2772 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2773 if [ $count -ne 2 ]; then
2774 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2775 error "(8) Expect 2 stubs under lost+found, but got $count"
2778 echo "The stub file should keep the original f2 or f4 data"
# First stub found: its size must equal one of the two saved sizes (the test
# fails only when it differs from both).
# NOTE(review): the find pattern *-C-* is unquoted, so the shell may expand it
# against the current directory; cname is assigned without "local".
2779 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-C-* | head -n 1)
2780 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2781 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2782 error "(9) Got unexpected $cur_size"
2785 $LFS path2fid $cname
2786 $LFS getstripe $cname
# Same check for the last stub found.
2788 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-C-* | tail -n 1)
2789 cur_size=$(ls -il $cname | awk '{ print $6 }')
2790 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2791 error "(10) Got unexpected $cur_size"
2794 $LFS path2fid $cname
2795 $LFS getstripe $cname
2797 echo "The f2/f4 should contains new data."
# Print content, FID and layout of f2/f4; the content is not asserted here.
2798 cat $DIR/$tdir/a1/f2
2799 $LFS path2fid $DIR/$tdir/a1/f2
2800 $LFS getstripe $DIR/$tdir/a1/f2
2801 cat $DIR/$tdir/a1/f4
2802 $LFS path2fid $DIR/$tdir/a1/f4
2803 $LFS getstripe $DIR/$tdir/a1/f4
# Register the test case with the framework.
2805 run_test 18e "Find out orphan OST-object and repair it (5)"
# test_18f body (registered below as test 18f): when one OST fails to handle
# the LFSCK request during first-stage scanning, orphan handling must be
# skipped for that OST only; a second, undisturbed run repairs the rest.
# NOTE(review): the function header and closing lines (}, fi, done) are not
# visible in this listing -- confirm against the full script.
# Needs at least two OSTs.
2808 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2811 echo "The target MDT-object is lost. The LFSCK should re-create the"
2812 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2813 echo "to verify some OST-object(s) during the first stage-scanning,"
2814 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2815 echo "should not be affected."
2818 check_mount_and_prep
# Directories created with "-i 0": a1 (one stripe at OST index 0) holds guard
# and f1; a2 (two stripes of 1M starting at OST index 0) holds f2.
2819 $LFS mkdir -i 0 $DIR/$tdir/a1
2820 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2821 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2822 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2823 $LFS mkdir -i 0 $DIR/$tdir/a2
2824 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2825 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2826 $LFS getstripe $DIR/$tdir/a1/f1
2827 $LFS getstripe $DIR/$tdir/a2/f2
# With at least two MDTs, mirror the same setup in a3/a4 (created with "-i 1").
2829 if [ $MDSCOUNT -ge 2 ]; then
2830 $LFS mkdir -i 1 $DIR/$tdir/a3
2831 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2832 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2833 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2834 $LFS mkdir -i 1 $DIR/$tdir/a4
2835 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2836 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2837 $LFS getstripe $DIR/$tdir/a3/f3
2838 $LFS getstripe $DIR/$tdir/a4/f4
2841 cancel_lru_locks osc
2843 echo "Inject failure, to simulate the case of missing the MDT-object"
# With fail_loc 0x1616 set on the MDS(s), remove f1..f4 (guards are kept).
2844 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2845 do_facet mds1 $LCTL set_param fail_loc=0x1616
2846 rm -f $DIR/$tdir/a1/f1
2847 rm -f $DIR/$tdir/a2/f2
2849 if [ $MDSCOUNT -ge 2 ]; then
2850 do_facet mds2 $LCTL set_param fail_loc=0x1616
2851 rm -f $DIR/$tdir/a3/f3
2852 rm -f $DIR/$tdir/a4/f4
# Clear the injected failure on every MDS that had it set.
2858 do_facet mds1 $LCTL set_param fail_loc=0
2859 if [ $MDSCOUNT -ge 2 ]; then
2860 do_facet mds2 $LCTL set_param fail_loc=0
2863 cancel_lru_locks mdc
2864 cancel_lru_locks osc
2866 echo "Inject failure, to simulate the OST0 fail to handle"
2867 echo "MDT0 LFSCK request during the first-stage scanning."
# fail_loc 0x161c with fail_val=0 stays set on mds1 for the whole first run.
2868 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2869 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2871 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2872 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# First run: every MDT is expected to end in status "partial".
2874 for k in $(seq $MDSCOUNT); do
2875 # The LFSCK status query interval is 30 seconds. For the case
2876 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2877 # time to guarantee the status sync up.
2878 wait_update_facet mds${k} "$LCTL get_param -n \
2879 mdd.$(facet_svc mds${k}).lfsck_layout |
2880 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2881 error "(2) MDS${k} is not the expected 'partial'"
# ost1 is expected to end "partial" ...
2884 wait_update_facet ost1 "$LCTL get_param -n \
2885 obdfilter.$(facet_svc ost1).lfsck_layout |
2886 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2887 error "(3) OST1 is not the expected 'partial'"
# ... while ost2 is expected to end "completed".
2890 wait_update_facet ost2 "$LCTL get_param -n \
2891 obdfilter.$(facet_svc ost2).lfsck_layout |
2892 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2893 error "(4) OST2 is not the expected 'completed'"
# Clear the injected failure before the second run.
2896 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the partial run each MDS must report exactly 1 repaired orphan.
2898 local repaired=$(do_facet mds1 $LCTL get_param -n \
2899 mdd.$(facet_svc mds1).lfsck_layout |
2900 awk '/^repaired_orphan/ { print $2 }')
2901 [ $repaired -eq 1 ] ||
2902 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2904 if [ $MDSCOUNT -ge 2 ]; then
2905 repaired=$(do_facet mds2 $LCTL get_param -n \
2906 mdd.$(facet_svc mds2).lfsck_layout |
2907 awk '/^repaired_orphan/ { print $2 }')
2908 [ $repaired -eq 1 ] ||
2909 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
2912 echo "Trigger layout LFSCK on all devices again to cleanup"
# Second run, without any injected failure.
2913 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
# Wait for every MDT to report layout LFSCK status "completed".
2915 for k in $(seq $MDSCOUNT); do
2916 # The LFSCK status query interval is 30 seconds. For the case
2917 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2918 # time to guarantee the status sync up.
2919 wait_update_facet mds${k} "$LCTL get_param -n \
2920 mdd.$(facet_svc mds${k}).lfsck_layout |
2921 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2922 error "(8) MDS${k} is not the expected 'completed'"
# Each OST is queried once (no waiting) and must already be "completed".
# NOTE(review): cur_status is assigned without "local" in the visible lines.
2925 for k in $(seq $OSTCOUNT); do
2926 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2927 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2928 awk '/^status/ { print $2 }')
2929 [ "$cur_status" == "completed" ] ||
2930 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# The second run must report exactly 2 repaired orphans per MDS.
# NOTE(review): "repaired" is declared "local" a second time here.
2934 local repaired=$(do_facet mds1 $LCTL get_param -n \
2935 mdd.$(facet_svc mds1).lfsck_layout |
2936 awk '/^repaired_orphan/ { print $2 }')
2937 [ $repaired -eq 2 ] ||
2938 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2940 if [ $MDSCOUNT -ge 2 ]; then
2941 repaired=$(do_facet mds2 $LCTL get_param -n \
2942 mdd.$(facet_svc mds2).lfsck_layout |
2943 awk '/^repaired_orphan/ { print $2 }')
2944 [ $repaired -eq 2 ] ||
2945 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
# Register the test case with the framework.
2948 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# test_18g body (registered below as test 18g): the MDT-object is lost but its
# OI mapping remains; the layout LFSCK must still re-create the MDT-object
# from the orphan OST-objects.
# NOTE(review): the function header and closing lines (}, done) are not
# visible in this listing -- confirm against the full script.
# Skipped when FILESET is set.
2951 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2954 echo "The target MDT-object is lost, but related OI mapping is there"
2955 echo "The LFSCK should recreate the lost MDT-object without affected"
2956 echo "by the stale OI mapping."
2959 check_mount_and_prep
# a1 uses stripe count -1 with 1M stripes starting at OST index 0; writing
# OSTCOUNT MiB matches the "repaired == OSTCOUNT" expectation checked below.
2960 $LFS mkdir -i 0 $DIR/$tdir/a1
2961 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2962 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
# Save the FID: the re-created object is looked up by it in lost+found.
2963 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2965 $LFS getstripe $DIR/$tdir/a1/f1
2966 cancel_lru_locks osc
2968 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
# With fail_loc 0x162e set on mds1, remove the file.
2969 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2970 do_facet mds1 $LCTL set_param fail_loc=0x162e
2971 rm -f $DIR/$tdir/a1/f1
2973 do_facet mds1 $LCTL set_param fail_loc=0
2974 cancel_lru_locks mdc
2975 cancel_lru_locks osc
2977 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2978 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait for every MDT to report layout LFSCK status "completed".
2980 for k in $(seq $MDSCOUNT); do
2981 # The LFSCK status query interval is 30 seconds. For the case
2982 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2983 # time to guarantee the status sync up.
2984 wait_update_facet mds${k} "$LCTL get_param -n \
2985 mdd.$(facet_svc mds${k}).lfsck_layout |
2986 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2987 error "(2) MDS${k} is not the expected 'completed'"
# Each OST is queried once (no waiting) and must already be "completed".
2990 for k in $(seq $OSTCOUNT); do
2991 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2992 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2993 awk '/^status/ { print $2 }')
2994 [ "$cur_status" == "completed" ] ||
2995 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report exactly OSTCOUNT repaired orphans.
2998 local repaired=$(do_facet mds1 $LCTL get_param -n \
2999 mdd.$(facet_svc mds1).lfsck_layout |
3000 awk '/^repaired_orphan/ { print $2 }')
3001 [ $repaired -eq $OSTCOUNT ] ||
3002 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
3004 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
# The re-created object is expected under the name "<saved FID>-R-0".
3005 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
3006 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
# Print FID and layout of the restored file.
3008 $LFS path2fid $DIR/$tdir/a1/f1
3009 $LFS getstripe $DIR/$tdir/a1/f1
# Register the test case with the framework.
3011 run_test 18g "Find out orphan OST-object and repair it (7)"
# test_18h: corrupt the extent range of a PFL (composite layout) file and
# verify that layout LFSCK rebuilds it from the OST-objects returned as
# orphans, leaving the file data intact and writable again.
3015 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
3016 echo "the layout LFSCK will keep the bad PFL file(s) there without"
3017 echo "scanning its OST-object(s). Then in the second stage scanning,"
3018 echo "the OST will return related OST-object(s) to the MDT as orphan."
3019 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
3020 echo "the 'orphan(s)' stripe information."
3023 check_mount_and_prep
# Two components: [0, 2M) and [2M, EOF).
3025 $LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
3026 error "(0) Fail to create PFL $DIR/$tdir/f0"
# Put data into both components: once from offset 0, once from 2M (seek=2).
3028 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
3029 error "(1.1) Fail to write $DIR/$tdir/f0"
3031 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
3032 error "(1.2) Fail to write $DIR/$tdir/f0"
# Reference copy used by check (6) to prove the data survived the repair.
3034 cp $DIR/$tdir/f0 $DIR/$tdir/guard
3036 echo "Inject failure stub to simulate bad PFL extent range"
3037 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
3038 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
# The chown is issued while the fail_loc is armed. Use the POSIX
# 'uid:gid' separator, as the other chown calls in this file do; the
# 'uid.gid' form is a deprecated GNU extension.
3040 chown 1:1 $DIR/$tdir/f0
3042 cancel_lru_locks mdc
3043 cancel_lru_locks osc
3044 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3046 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
3047 error "(2) Write to bad PFL file should fail"
3049 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
3050 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
# Every MDT must report 'completed' within $LTIME.
3052 for k in $(seq $MDSCOUNT); do
3053 # The LFSCK status query interval is 30 seconds. For the case
3054 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3055 # time to guarantee the status sync up.
3056 wait_update_facet mds${k} "$LCTL get_param -n \
3057 mdd.$(facet_svc mds${k}).lfsck_layout |
3058 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
3059 error "(4.1) MDS${k} is not the expected 'completed'"
# Every OST is sampled once and must already be 'completed'. cur_status
# is declared local, as in the sibling tests, so it does not leak into
# the global scope of the test script.
3062 for k in $(seq $OSTCOUNT); do
3063 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3064 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3065 awk '/^status/ { print $2 }')
3066 [ "$cur_status" == "completed" ] ||
3067 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
# Exactly two orphans are expected to be repaired on $SINGLEMDS.
3071 local repaired=$($SHOW_LAYOUT |
3072 awk '/^repaired_orphan/ { print $2 }')
3073 [ $repaired -eq 2 ] ||
3074 error "(5) Fail to repair crashed PFL range: $repaired"
3076 echo "Data in $DIR/$tdir/f0 should not be broken"
3077 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
3078 error "(6) Data in $DIR/$tdir/f0 is broken"
3080 echo "Write should succeed after LFSCK repairing the bad PFL range"
3081 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
3082 error "(7) Write should succeed after LFSCK"
3084 run_test 18h "LFSCK can repair crashed PFL extent range"
# Top-level setting applied between tests: pass debug=-cache to the local
# lctl and discard its output.
3086 $LCTL set_param debug=-cache > /dev/null
# test_19a: with parent-FID verification enabled on OST0000, a read that
# carries a wrong parent FID must be rejected. Covers both a plain file
# (a0) and a PFL file (a1).
3089 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3090 skip "MDS older than 2.5.55, LU-3951"
3092 check_mount_and_prep
3093 $LFS setstripe -c 1 -i 0 $DIR/$tdir
3094 local osts=$(osts_nodes)
# Create the test files while lfsck_verify_pfid is switched off.
3096 do_nodes $osts $LCTL set_param -n \
3097 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid=0
3099 echo "foo1" > $DIR/$tdir/a0
3100 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
3101 error "(0) Fail to create PFL $DIR/$tdir/a1"
3102 echo "foo2" > $DIR/$tdir/a1
3103 echo "guard" > $DIR/$tdir/a2
3104 cancel_lru_locks osc
3106 echo "Inject failure, then client will offer wrong parent FID when read"
# Switch verification on, then arm the fail_loc locally (no do_facet /
# do_nodes wrapper), i.e. on the node running this script.
3107 do_nodes $osts $LCTL set_param -n \
3108 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid=1
3110 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
3111 $LCTL set_param fail_loc=0x1619
3113 echo "Read RPC with wrong parent FID should be denied"
# A successful cat is the failure case here.
3114 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
3115 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
3116 $LCTL set_param fail_loc=0
3118 run_test 19a "OST-object inconsistency self detect"
# test_19b: create OST-objects whose parent FID points at a non-existent
# MDT-object, then verify that nothing is repaired while
# lfsck_verify_pfid is off and that reading the files with verification
# on repairs the parent FID of both files (f0 and the PFL file f1).
3121 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3122 skip "MDS older than 2.5.55, LU-3951"
3124 check_mount_and_prep
3125 $LFS setstripe -c 1 -i 0 $DIR/$tdir
3127 echo "Inject failure stub to make the OST-object to back point to"
3128 echo "non-exist MDT-object"
3129 local osts=$(osts_nodes)
3131 do_nodes $osts $LCTL set_param -n \
3132 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid=0
3134 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
# Both files are created and written while the fail_loc is armed on the
# OST nodes.
3135 do_nodes $osts "$LCTL set_param fail_loc=0x1611"
3136 echo "foo1" > $DIR/$tdir/f0
3137 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
3138 error "(0) Fail to create PFL $DIR/$tdir/f1"
3139 echo "foo2" > $DIR/$tdir/f1
3140 cancel_lru_locks osc
3141 do_nodes $osts "$LCTL set_param fail_loc=0"
# Writing 0 to lfsck_verify_pfid on ost1 before reading it back.
# NOTE(review): presumably this also resets the 'repaired' counter that
# is parsed below - confirm against the obdfilter proc handler.
3143 do_facet ost1 $LCTL set_param -n \
3144 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid=0
3145 echo "Nothing should be fixed since self detect and repair is disabled"
3146 local repaired=$(do_facet ost1 $LCTL get_param -n \
3147 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
3148 awk '/^repaired/ { print $2 }')
3149 [ $repaired -eq 0 ] ||
3150 error "(1) Expected 0 repaired, but got $repaired"
3152 echo "Read RPC with right parent FID should be accepted,"
3153 echo "and cause parent FID on OST to be fixed"
3155 do_nodes $osts $LCTL set_param -n \
3156 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid=1
3158 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
3159 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
# Two files were read, so two repairs are expected; the message now
# states the same number the condition checks.
3161 repaired=$(do_facet ost1 $LCTL get_param -n \
3162 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
3163 awk '/^repaired/ { print $2 }')
3164 (( $repaired == 2 )) ||
3165 error "(3) Expected 2 repaired, but got $repaired"
3167 run_test 19b "OST-object inconsistency self repair"
# Expected output of '$LFS getstripe -L' as compared by the tests below:
# a layout with a LOV EA hole is expected to report "40000001", a layout
# without a hole reports "raid0".
3169 PATTERN_WITH_HOLE="40000001"
3170 PATTERN_WITHOUT_HOLE="raid0"
# test_20a, part 1: create striped files under a1, then unlink them with
# fail_loc/fail_val armed on mds1 so that each file loses its MDT-object
# and (for f1..f3) one chosen OST-object.
3173 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3174 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
3175 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3176 skip "MDS older than 2.5.55, LU-4887"
3179 echo "The target MDT-object and some of its OST-object are lost."
3180 echo "The LFSCK should find out the left OST-objects and re-create"
3181 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3182 echo "with the partial OST-objects (LOV EA hole)."
3184 echo "New client can access the file with LOV EA hole via normal"
3185 echo "system tools or commands without crash the system."
3187 echo "For old client, even though it cannot access the file with"
3188 echo "LOV EA hole, it should not cause the system crash."
3191 check_mount_and_prep
3192 $LFS mkdir -i 0 $DIR/$tdir/a1
# Three stripes when more than two OSTs are available, otherwise two;
# the stripe size is 1M in both cases.
3193 if [ $OSTCOUNT -gt 2 ]; then
3194 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
3197 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
3201 # 256 blocks on the stripe0.
3202 # 1 block on the stripe1 for 2 OSTs case.
3203 # 256 blocks on the stripe1 for other cases.
3204 # 1 block on the stripe2 if OSTs > 2
3205 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
3206 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
3207 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# The FIDs name the recovered files (<fid>-R-0) in the later checks.
3209 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
3210 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
3211 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
3214 $LFS getstripe $DIR/$tdir/a1/f0
3216 $LFS getstripe $DIR/$tdir/a1/f1
3218 $LFS getstripe $DIR/$tdir/a1/f2
# A fourth file is only created when there is a third stripe to lose.
3220 if [ $OSTCOUNT -gt 2 ]; then
3221 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
3222 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
3224 $LFS getstripe $DIR/$tdir/a1/f3
3227 cancel_lru_locks osc
3229 echo "Inject failure..."
3230 echo "To simulate f0 lost MDT-object"
3231 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3232 do_facet mds1 $LCTL set_param fail_loc=0x1616
3233 rm -f $DIR/$tdir/a1/f0
3235 echo "To simulate f1 lost MDT-object and OST-object0"
3236 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3237 do_facet mds1 $LCTL set_param fail_loc=0x161a
3238 rm -f $DIR/$tdir/a1/f1
# fail_loc 0x161a stays armed; per the echo messages, fail_val picks
# which OST-object is lost.
3240 echo "To simulate f2 lost MDT-object and OST-object1"
3241 do_facet mds1 $LCTL set_param fail_val=1
3242 rm -f $DIR/$tdir/a1/f2
3244 if [ $OSTCOUNT -gt 2 ]; then
3245 echo "To simulate f3 lost MDT-object and OST-object2"
3246 do_facet mds1 $LCTL set_param fail_val=2
3247 rm -f $DIR/$tdir/a1/f3
# The client stays unmounted while LFSCK runs; it is mounted again before
# the recovered files are inspected.
3250 umount_client $MOUNT
3253 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# test_20a, part 2: run layout LFSCK, wait for completion on all targets
# and check the number of repaired orphans reported by mds1.
3255 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3256 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Every MDT must report 'completed' within 32 seconds.
3258 for k in $(seq $MDSCOUNT); do
3259 # The LFSCK status query interval is 30 seconds. For the case
3260 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3261 # time to guarantee the status sync up.
3262 wait_update_facet mds${k} "$LCTL get_param -n \
3263 mdd.$(facet_svc mds${k}).lfsck_layout |
3264 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3265 error "(2) MDS${k} is not the expected 'completed'"
# Every OST is sampled once and must already be 'completed'.
3268 for k in $(seq $OSTCOUNT); do
3269 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3270 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3271 awk '/^status/ { print $2 }')
3272 [ "$cur_status" == "completed" ] ||
3273 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected orphan count: 9 with more than two OSTs, 4 otherwise.
3276 local repaired=$(do_facet mds1 $LCTL get_param -n \
3277 mdd.$(facet_svc mds1).lfsck_layout |
3278 awk '/^repaired_orphan/ { print $2 }')
3279 if [ $OSTCOUNT -gt 2 ]; then
3280 [ $repaired -eq 9 ] ||
3281 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3283 [ $repaired -eq 4 ] ||
3284 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
3287 mount_client $MOUNT || error "(5.0) Fail to start client!"
# NOTE(review): not referenced by the checks visible in this test, which
# compare against PATTERN_WITH_HOLE / PATTERN_WITHOUT_HOLE - confirm
# whether it is still needed.
3289 LOV_PATTERN_F_HOLE=0x40000000
# test_20a, part 3: f0 lost only its MDT-object, so the recovered file
# must have a complete layout (no hole flag) and behave like a normal file.
3292 # ${fid0}-R-0 is the old f0
3294 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3295 echo "Check $name, which is the old f0"
3297 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
3299 local pattern=$($LFS getstripe -L $name)
3300 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3301 error "(5.2) NOT expect pattern flag hole, but got $pattern"
# The stripe count must match what was set on a1 (3 or 2).
3303 local stripes=$($LFS getstripe -c $name)
3304 if [ $OSTCOUNT -gt 2 ]; then
3305 [ $stripes -eq 3 ] ||
3306 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3308 [ $stripes -eq 2 ] ||
3309 error "(5.3.2) expect the stripe count is 2, but got $stripes"
# The size must equal the $bcount blocks of 4096 bytes written earlier.
3312 local size=$(stat $name | awk '/Size:/ { print $2 }')
3313 [ $size -eq $((4096 * $bcount)) ] ||
3314 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
# Read, append, chown, touch and unlink must all succeed.
3316 cat $name > /dev/null || error "(5.5) cannot read $name"
3318 echo "dummy" >> $name || error "(5.6) cannot write $name"
3320 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3322 touch $name || error "(5.8) cannot touch $name"
3324 rm -f $name || error "(5.9) cannot unlink $name"
# test_20a, part 4: f1 lost its MDT-object and OST-object0, so the
# recovered file must carry the hole pattern: I/O hitting stripe0 fails,
# I/O to the surviving stripes and metadata operations succeed.
3327 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3329 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3330 if [ $OSTCOUNT -gt 2 ]; then
3331 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3333 echo "Check $name, it contains the old f1's stripe1"
3336 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3338 pattern=$($LFS getstripe -L $name)
3339 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3340 error "(6.2) expect pattern flag hole, but got $pattern"
# The stripe count is unchanged; the lost stripe is kept as a hole slot.
3342 stripes=$($LFS getstripe -c $name)
3343 if [ $OSTCOUNT -gt 2 ]; then
3344 [ $stripes -eq 3 ] ||
3345 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3347 [ $stripes -eq 2 ] ||
3348 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3351 size=$(stat $name | awk '/Size:/ { print $2 }')
3352 [ $size -eq $((4096 * $bcount)) ] ||
3353 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
3355 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# dd with conv=sync,noerror keeps going past read errors; count the
# "Input/output error" lines it prints. 256 failed 4K blocks = 1M, the
# stripe size.
3357 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3358 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3361 [ $failures -eq 256 ] ||
3362 error "(6.6) expect 256 IO failures, but get $failures"
3364 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3365 [ $size -eq $((4096 * $bcount)) ] ||
3366 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Block 0 lies in the lost stripe0; block 300 (past 1M) lies in stripe1.
3368 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3369 error "(6.8) write to the LOV EA hole should fail"
3371 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3372 error "(6.9) write to normal stripe should NOT fail"
3374 echo "foo" >> $name && error "(6.10) append write $name should fail"
3376 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3378 touch $name || error "(6.12) cannot touch $name"
3380 rm -f $name || error "(6.13) cannot unlink $name"
# test_20a, part 5: f2 lost its MDT-object and OST-object1. The checks
# differ between the 3-stripe case (OSTs > 2) and the 2-stripe case.
3383 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3385 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3386 if [ $OSTCOUNT -gt 2 ]; then
3387 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3389 echo "Check $name, it contains the old f2's stripe0"
3392 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3394 pattern=$($LFS getstripe -L $name)
3395 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3396 error "(7.2) expect pattern flag hole, but got $pattern"
3398 stripes=$($LFS getstripe -c $name)
3399 size=$(stat $name | awk '/Size:/ { print $2 }')
# 3-stripe case: checks (7.3.1) through (7.10.1).
3400 if [ $OSTCOUNT -gt 2 ]; then
3401 [ $stripes -eq 3 ] ||
3402 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3404 [ $size -eq $((4096 * $bcount)) ] ||
3405 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3407 cat $name > /dev/null &&
3408 error "(7.5.1) normal read $name should fail"
# Count the read errors dd reports while copying past them.
3410 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3411 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3413 [ $failures -eq 256 ] ||
3414 error "(7.6) expect 256 IO failures, but get $failures"
3416 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3417 [ $size -eq $((4096 * $bcount)) ] ||
3418 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
# Block 300 (past 1M) lies in the lost stripe1; block 0 lies in stripe0.
3420 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3421 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3423 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3424 error "(7.8.1) write to normal stripe should NOT fail"
3426 echo "foo" >> $name &&
3427 error "(7.8.3) append write $name should fail"
3429 chown $RUNAS_ID:$RUNAS_GID $name ||
3430 error "(7.9.1) cannot chown on $name"
3432 touch $name || error "(7.10.1) cannot touch $name"
# 2-stripe case: checks (7.3.2) through (7.10.2). The expected size is
# 256 blocks, i.e. only the data on stripe0.
3434 [ $stripes -eq 2 ] ||
3435 error "(7.3.2) expect the stripe count is 2, but got $stripes"
3438 [ $size -eq $((4096 * (256 + 0))) ] ||
3439 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3441 cat $name > /dev/null &&
3442 error "(7.5.2) normal read $name should fail"
3444 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3445 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3446 [ $failures -eq 256 ] ||
3447 error "(7.6.2) expect 256 IO failures, but get $failures"
3450 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3451 [ $size -eq $((4096 * $bcount)) ] ||
3452 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
# Block 256 is the first block past 1M, i.e. the start of the lost stripe1.
3454 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3455 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3457 chown $RUNAS_ID:$RUNAS_GID $name ||
3458 error "(7.9.2) cannot chown on $name"
3460 touch $name || error "(7.10.2) cannot touch $name"
3463 rm -f $name || error "(7.11) cannot unlink $name"
# test_20a, part 6: f3 only exists with more than two OSTs; it lost its
# MDT-object and OST-object2.
3465 [ $OSTCOUNT -le 2 ] && return
3468 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
3470 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3471 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3473 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3475 pattern=$($LFS getstripe -L $name)
3476 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3477 error "(8.2) expect pattern flag hole, but got $pattern"
3479 stripes=$($LFS getstripe -c $name)
3480 [ $stripes -eq 3 ] ||
3481 error "(8.3) expect the stripe count is 3, but got $stripes"
# The expected size counts 256 blocks each for stripe0 and stripe1 and
# nothing for the lost stripe2.
3483 size=$(stat $name | awk '/Size:/ { print $2 }')
3485 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3486 error "(8.4) expect the size $((4096 * 512)), but got $size"
3488 cat $name > /dev/null &&
3489 error "(8.5) normal read $name should fail"
# Count the read errors dd reports while copying past them.
3491 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3492 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3494 [ $failures -eq 256 ] ||
3495 error "(8.6) expect 256 IO failures, but get $failures"
3498 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3499 [ $size -eq $((4096 * $bcount)) ] ||
3500 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
# Block 512 is the first block past 2M, i.e. the start of the lost stripe2.
3502 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3503 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3505 chown $RUNAS_ID:$RUNAS_GID $name ||
3506 error "(8.9) cannot chown on $name"
3508 touch $name || error "(8.10) cannot touch $name"
3510 rm -f $name || error "(8.11) cannot unlink $name"
3512 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# test_20b, part 1: PFL variant of test_20a. Create three two-component
# files, inject the loss of MDT-objects and OST-objects, run layout LFSCK
# and check the number of repaired orphans.
3515 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3516 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
3517 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3518 skip "MDS older than 2.5.55, LU-4887"
3521 echo "The target MDT-object and some of its OST-object are lost."
3522 echo "The LFSCK should find out the left OST-objects and re-create"
3523 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3524 echo "with the partial OST-objects (LOV EA hole)."
3526 echo "New client can access the file with LOV EA hole via normal"
3527 echo "system tools or commands without crash the system - PFL case."
3530 check_mount_and_prep
# Each file has two components, [0, 2M) and [2M, EOF), each with two
# stripes of 1M.
3532 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3533 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3534 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3535 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3536 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3537 error "(2) Fail to create PFL file $DIR/$tdir/f2"
# 769 blocks of 4K = 3M + 4K: three stripes are filled completely and the
# second stripe of the second component receives a single block.
3539 local bcount=$((256 * 3 + 1))
3541 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3542 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3543 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
# The FIDs name the recovered files (<fid>-R-0) in the later checks.
3545 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3546 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3547 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3550 $LFS getstripe $DIR/$tdir/f0
3552 $LFS getstripe $DIR/$tdir/f1
3554 $LFS getstripe $DIR/$tdir/f2
3556 cancel_lru_locks mdc
3557 cancel_lru_locks osc
3559 echo "Inject failure..."
3560 echo "To simulate f0 lost MDT-object"
3561 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3562 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3565 echo "To simulate the case of f1 lost MDT-object and "
3566 echo "the first OST-object in each PFL component"
3567 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3568 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3571 echo "To simulate the case of f2 lost MDT-object and "
3572 echo "the second OST-object in each PFL component"
3573 do_facet $SINGLEMDS $LCTL set_param fail_val=1
3578 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
3580 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3581 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
# Every MDT must report 'completed' within 32 seconds.
3583 for k in $(seq $MDSCOUNT); do
3584 # The LFSCK status query interval is 30 seconds. For the case
3585 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3586 # time to guarantee the status sync up.
3587 wait_update_facet mds${k} "$LCTL get_param -n \
3588 mdd.$(facet_svc mds${k}).lfsck_layout |
3589 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3590 error "(4) MDS${k} is not the expected 'completed'"
# Every OST is sampled once and must already be 'completed'.
3593 for k in $(seq $OSTCOUNT); do
3594 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3595 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3596 awk '/^status/ { print $2 }')
3597 [ "$cur_status" == "completed" ] ||
3598 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report exactly 8 repaired orphans.
3601 local repaired=$(do_facet mds1 $LCTL get_param -n \
3602 mdd.$(facet_svc mds1).lfsck_layout |
3603 awk '/^repaired_orphan/ { print $2 }')
3604 [ $repaired -eq 8 ] ||
3605 error "(6) Expect 8 fixed on mds1, but got: $repaired"
# test_20b, part 2: f0 lost only its MDT-object, so both components of
# the recovered file must be complete (no hole flag), keep their extents
# and behave like a normal file.
3608 # ${fid0}-R-0 is the old f0
3610 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3611 echo "Check $name, which is the old f0"
3613 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
# -I1 / -I2 select component 1 and component 2.
3615 local pattern=$($LFS getstripe -L -I1 $name)
3616 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3617 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3619 pattern=$($LFS getstripe -L -I2 $name)
3620 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3621 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3623 local stripes=$($LFS getstripe -c -I1 $name)
3624 [ $stripes -eq 2 ] ||
3625 error "(7.3.1) expect 2 stripes, but got $stripes"
3627 stripes=$($LFS getstripe -c -I2 $name)
3628 [ $stripes -eq 2 ] ||
3629 error "(7.3.2) expect 2 stripes, but got $stripes"
# Component extents must be [0, 2097152) and [2097152, EOF).
3631 local e_start=$($LFS getstripe -I1 $name |
3632 awk '/lcme_extent.e_start:/ { print $2 }')
3633 [ $e_start -eq 0 ] ||
3634 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3636 local e_end=$($LFS getstripe -I1 $name |
3637 awk '/lcme_extent.e_end:/ { print $2 }')
3638 [ $e_end -eq 2097152 ] ||
3639 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3641 e_start=$($LFS getstripe -I2 $name |
3642 awk '/lcme_extent.e_start:/ { print $2 }')
3643 [ $e_start -eq 2097152 ] ||
3644 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
3646 e_end=$($LFS getstripe -I2 $name |
3647 awk '/lcme_extent.e_end:/ { print $2 }')
3648 [ "$e_end" = "EOF" ] ||
3649 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3651 local size=$(stat $name | awk '/Size:/ { print $2 }')
3652 [ $size -eq $((4096 * $bcount)) ] ||
3653 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
# Read, append, chown, touch and unlink must all succeed.
3655 cat $name > /dev/null || error "(7.7) cannot read $name"
3657 echo "dummy" >> $name || error "(7.8) cannot write $name"
3659 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3661 touch $name || error "(7.10) cannot touch $name"
3663 rm -f $name || error "(7.11) cannot unlink $name"
# test_20b, part 3: f1 lost its MDT-object and the first OST-object of
# each component. Both components of the recovered file must carry the
# hole pattern and keep their stripe counts and extents; I/O hitting a
# lost stripe fails, I/O to a surviving stripe and metadata operations
# succeed.
3666 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3668 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3669 echo "Check $name, it contains f1's second OST-object in each COMP"
3671 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3673 pattern=$($LFS getstripe -L -I1 $name)
3674 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3675 error "(8.2.1) expect pattern flag hole, but got $pattern"
3677 pattern=$($LFS getstripe -L -I2 $name)
3678 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3679 error "(8.2.2) expect pattern flag hole, but got $pattern"
# The COMP1 check is labelled (8.3.1) so that a failure can be told apart
# from the COMP2 check (8.3.2) below.
3681 stripes=$($LFS getstripe -c -I1 $name)
3682 [ $stripes -eq 2 ] ||
3683 error "(8.3.1) expect 2 stripes, but got $stripes"
3685 stripes=$($LFS getstripe -c -I2 $name)
3686 [ $stripes -eq 2 ] ||
3687 error "(8.3.2) expect 2 stripes, but got $stripes"
# Component extents must be [0, 2097152) and [2097152, EOF).
3689 e_start=$($LFS getstripe -I1 $name |
3690 awk '/lcme_extent.e_start:/ { print $2 }')
3691 [ $e_start -eq 0 ] ||
3692 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3694 e_end=$($LFS getstripe -I1 $name |
3695 awk '/lcme_extent.e_end:/ { print $2 }')
3696 [ $e_end -eq 2097152 ] ||
3697 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3699 e_start=$($LFS getstripe -I2 $name |
3700 awk '/lcme_extent.e_start:/ { print $2 }')
3701 [ $e_start -eq 2097152 ] ||
3702 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3704 e_end=$($LFS getstripe -I2 $name |
3705 awk '/lcme_extent.e_end:/ { print $2 }')
3706 [ "$e_end" = "EOF" ] ||
3707 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3709 size=$(stat $name | awk '/Size:/ { print $2 }')
3710 [ $size -eq $((4096 * $bcount)) ] ||
3711 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
3713 cat $name > /dev/null && error "(8.7) normal read $name should fail"
# dd with conv=sync,noerror keeps going past read errors; count the
# "Input/output error" lines it prints.
3715 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3716 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3718 # The first stripe in each COMP was lost
3719 [ $failures -eq 512 ] ||
3720 error "(8.8) expect 512 IO failures, but get $failures"
3722 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3723 [ $size -eq $((4096 * $bcount)) ] ||
3724 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
# Block 0 lies in the lost first stripe of COMP1; block 300 (past 1M)
# lies in its surviving second stripe.
3726 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3727 error "(8.10) write to the LOV EA hole should fail"
3729 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3730 error "(8.11) write to normal stripe should NOT fail"
3732 echo "foo" >> $name && error "(8.12) append write $name should fail"
3734 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3736 touch $name || error "(8.14) cannot touch $name"
3738 rm -f $name || error "(8.15) cannot unlink $name"
# test_20b, part 4: f2 lost its MDT-object and the second OST-object of
# each component. Both components of the recovered file must carry the
# hole pattern and keep their stripe counts and extents; I/O hitting a
# lost stripe fails, I/O to a surviving stripe and metadata operations
# succeed.
3741 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3743 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3744 echo "Check $name, it contains f2's first stripe in each COMP"
3746 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3748 pattern=$($LFS getstripe -L -I1 $name)
3749 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3750 error "(9.2.1) expect pattern flag hole, but got $pattern"
3752 pattern=$($LFS getstripe -L -I2 $name)
3753 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3754 error "(9.2.2) expect pattern flag hole, but got $pattern"
# The COMP1 check is labelled (9.3.1) so that a failure can be told apart
# from the COMP2 check (9.3.2) below.
3756 stripes=$($LFS getstripe -c -I1 $name)
3757 [ $stripes -eq 2 ] ||
3758 error "(9.3.1) expect 2 stripes, but got $stripes"
3760 stripes=$($LFS getstripe -c -I2 $name)
3761 [ $stripes -eq 2 ] ||
3762 error "(9.3.2) expect 2 stripes, but got $stripes"
# Component extents must be [0, 2097152) and [2097152, EOF).
3764 e_start=$($LFS getstripe -I1 $name |
3765 awk '/lcme_extent.e_start:/ { print $2 }')
3766 [ $e_start -eq 0 ] ||
3767 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3769 e_end=$($LFS getstripe -I1 $name |
3770 awk '/lcme_extent.e_end:/ { print $2 }')
3771 [ $e_end -eq 2097152 ] ||
3772 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3774 e_start=$($LFS getstripe -I2 $name |
3775 awk '/lcme_extent.e_start:/ { print $2 }')
3776 [ $e_start -eq 2097152 ] ||
3777 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3779 e_end=$($LFS getstripe -I2 $name |
3780 awk '/lcme_extent.e_end:/ { print $2 }')
3781 [ "$e_end" = "EOF" ] ||
3782 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3784 size=$(stat $name | awk '/Size:/ { print $2 }')
3785 # The second stripe in COMP was lost, so we do not know there
3786 # have ever been some data before. 'stat' will regard it as
3787 # no data on the lost stripe.
3789 [ $size -eq $((4096 * $bcount)) ] ||
3790 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3792 cat $name > /dev/null &&
3793 error "(9.7) normal read $name should fail"
# dd with conv=sync,noerror reads straight through both lost stripes (one
# per component), so 2 * 256 failed 4K blocks are expected, as in the f1
# case. The message now reports the same number the condition checks.
3795 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3796 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3797 [ $failures -eq 512 ] ||
3798 error "(9.8) expect 512 IO failures, but get $failures"
3800 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3801 # The second stripe in COMP was lost, so we do not know there
3802 # have ever been some data before. Since 'dd' skip failure,
3803 # it will regard the lost stripe contains data.
3805 [ $size -eq $((4096 * $bcount)) ] ||
3806 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
# Block 300 (past 1M) lies in the lost second stripe of COMP1; block 0
# lies in its surviving first stripe.
3808 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3809 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3811 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3812 error "(9.11) write to normal stripe should NOT fail"
3814 echo "foo" >> $name &&
3815 error "(9.12) append write $name should fail"
3817 chown $RUNAS_ID:$RUNAS_GID $name ||
3818 error "(9.13) cannot chown on $name"
3820 touch $name || error "(9.14) cannot touch $name"
# Labelled (9.15) to stay within this file's 9.x check series.
3822 rm -f $name || error "(9.15) cannot unlink $name"
3824 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# test_21: start LFSCK on MDT0000 without a -t type argument and verify
# that both the namespace and the layout component are running, then
# stop them again.
3827 (( $MDS1_VERSION > $(version_code 2.5.59) )) ||
3828 skip "MDS older than 2.5.59, LU-4887"
3830 check_mount_and_prep
3831 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
3833 echo "Start all LFSCK components by default (-s 1)"
# NOTE(review): '-s 1' presumably throttles the scan so that it is still
# in phase 1 when the status is sampled below - confirm.
3834 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3835 error "Fail to start LFSCK"
3837 echo "namespace LFSCK should be in 'scanning-phase1' status"
3838 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3839 [ "$STATUS" == "scanning-phase1" ] ||
3840 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3842 echo "layout LFSCK should be in 'scanning-phase1' status"
3843 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3844 [ "$STATUS" == "scanning-phase1" ] ||
3845 error "Expect layout 'scanning-phase1', but got '$STATUS'"
3847 echo "Stop all LFSCK components by default"
3848 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3849 error "Fail to stop LFSCK"
3851 run_test 21 "run all LFSCK components by default"
# test_22a: a directory's ".." entry points at a parent that no longer
# exists; namespace LFSCK must repair the entry so that the directory can
# be listed again.
3854 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3855 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3856 skip "MDS older than 2.6.50, LU-5511"
# The inner double quotes are escaped: unescaped, they would end the
# string early and the quotes around .. would not be printed.
3859 echo "The parent_A references the child directory via some name entry,"
3860 echo "but the child directory back references another parent_B via its"
3861 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
3862 echo "LFSCK will repair the child directory's \"..\" name entry."
3865 check_mount_and_prep
3867 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3868 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3870 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3871 echo "The dummy's dotdot name entry references the guard."
3872 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
# foo/dummy is created on MDT0 while the fail_loc is armed.
3873 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3874 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3875 error "(3) Fail to mkdir on MDT0"
3876 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Remove guard so that the parent referenced by dummy's ".." is gone.
3878 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3880 echo "Trigger namespace LFSCK to repair unmatched pairs"
3881 $START_NAMESPACE -A -r ||
3882 error "(5) Fail to start LFSCK for namespace"
3884 wait_all_targets_blocked namespace completed 6
# Exactly one unmatched pair must have been repaired on $SINGLEMDS.
3886 local repaired=$($SHOW_NAMESPACE |
3887 awk '/^unmatched_pairs_repaired/ { print $2 }')
3888 [ $repaired -eq 1 ] ||
3889 error "(7) Fail to repair unmatched pairs: $repaired"
3891 echo "'ls' should success after namespace LFSCK repairing"
3892 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3893 error "(8) ls should success."
3895 run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b: a directory's ".." entry and linkEA point at an existing
# parent that holds no name entry for it; namespace LFSCK must repair
# both so that fid2path resolves the directory's FID to its real path.
3898 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3899 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3900 skip "MDS older than 2.6.50, LU-5511"
# The inner double quotes are escaped: unescaped, they would end the
# string early and the quotes around .. would not be printed.
3903 echo "The parent_A references the child directory via the name entry_B,"
3904 echo "but the child directory back references another parent_C via its"
3905 echo "\"..\" name entry. The parent_C exists, but there is no the name"
3906 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
3907 echo "the child directory's \"..\" name entry and its linkEA."
3910 check_mount_and_prep
3912 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3913 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3915 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3916 echo "and bad linkEA. The dummy's dotdot name entry references the"
3917 echo "guard. The dummy's linkEA references n non-exist name entry."
3918 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
# foo/dummy is created on MDT0 while the fail_loc is armed.
3919 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3920 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3921 error "(3) Fail to mkdir on MDT0"
3922 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Before the repair, fid2path must NOT map the FID to the real path.
3924 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3925 echo "fid2path should NOT work on the dummy's FID $dummyfid"
3926 local dummyname=$($LFS fid2path $DIR $dummyfid)
3927 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3928 error "(4) fid2path works unexpectedly."
3930 echo "Trigger namespace LFSCK to repair unmatched pairs"
3931 $START_NAMESPACE -A -r ||
3932 error "(5) Fail to start LFSCK for namespace"
3934 wait_all_targets_blocked namespace completed 6
# Exactly one unmatched pair must have been repaired on $SINGLEMDS.
3936 local repaired=$($SHOW_NAMESPACE |
3937 awk '/^unmatched_pairs_repaired/ { print $2 }')
3938 [ $repaired -eq 1 ] ||
3939 error "(7) Fail to repair unmatched pairs: $repaired"
# dummyname is already declared local above; assign it again without
# re-declaring it.
3941 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
3942 dummyname=$($LFS fid2path $DIR $dummyfid)
3943 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3944 error "(8) fid2path does not work"
3946 run_test 22b "LFSCK can repair unmatched pairs (2)"
3949 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3950 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3951 skip "MDS older than 2.6.50, LU-5512"
3954 echo "The name entry is there, but the MDT-object for such name "
3955 echo "entry does not exist. The namespace LFSCK should find out "
3956 echo "and repair the inconsistency as required."
3959 check_mount_and_prep
3961 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3962 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3964 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3965 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
3966 do_facet mds2 $LCTL set_param fail_loc=0x1620
3967 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3968 do_facet mds2 $LCTL set_param fail_loc=0
3970 echo "'ls' should fail because of dangling name entry"
3971 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3973 echo "Trigger namespace LFSCK to find out dangling name entry"
3974 $START_NAMESPACE -A -r ||
3975 error "(5) Fail to start LFSCK for namespace"
3977 wait_all_targets_blocked namespace completed 6
3979 local repaired=$($SHOW_NAMESPACE |
3980 awk '/^dangling_repaired/ { print $2 }')
3981 [ $repaired -eq 1 ] ||
3982 error "(7) Fail to repair dangling name entry: $repaired"
3984 echo "'ls' should fail because not re-create MDT-object by default"
3985 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
3987 echo "Trigger namespace LFSCK again to repair dangling name entry"
3988 $START_NAMESPACE -A -r -C ||
3989 error "(9) Fail to start LFSCK for namespace"
3991 wait_all_targets_blocked namespace completed 10
3993 repaired=$($SHOW_NAMESPACE |
3994 awk '/^dangling_repaired/ { print $2 }')
3995 [ $repaired -eq 1 ] ||
3996 error "(11) Fail to repair dangling name entry: $repaired"
3998 echo "'ls' should success after namespace LFSCK repairing"
3999 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
4001 run_test 23a "LFSCK can repair dangling name entry (1)"
4004 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4005 skip "MDS older than 2.6.50, LU-5512"
4008 echo "The objectA has multiple hard links, one of them corresponding"
4009 echo "to the name entry_B. But there is something wrong for the name"
4010 echo "entry_B and cause entry_B to references non-exist object_C."
4011 echo "In the first-stage scanning, the LFSCK will think the entry_B"
4012 echo "as dangling, and re-create the lost object_C. When the LFSCK"
4013 echo "comes to the second-stage scanning, it will find that the"
4014 echo "former re-creating object_C is not proper, and will try to"
4015 echo "replace the object_C with the real object_A."
4018 check_mount_and_prep
4020 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
4021 $LFS path2fid $DIR/$tdir/d0
4023 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
4025 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
4026 $LFS path2fid $DIR/$tdir/d0/f0
4028 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
4029 $LFS path2fid $DIR/$tdir/d0/f1
4031 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
4032 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
4034 if [ "$SEQ0" != "$SEQ1" ]; then
4035 # To guarantee that the f0 and f1 are in the same FID seq
4036 rm -f $DIR/$tdir/d0/f0 ||
4037 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
4038 echo "dummy" > $DIR/$tdir/d0/f0 ||
4039 error "(3.2) Fail to touch on MDT0"
4040 $LFS path2fid $DIR/$tdir/d0/f0
4043 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
4044 OID=$(printf %d $OID)
4046 echo "Inject failure stub on MDT0 to simulate dangling name entry"
4047 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
4048 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
4049 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4050 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4052 # If there is creation after the dangling injection, it may re-use
4053 # the just released local object (inode) that is referenced by the
4054 # dangling name entry. It will fail the dangling injection.
4055 # So before deleting the target object for the dangling name entry,
4056 # remove some other objects to avoid the target object being reused
4057 # by some potential creations. LU-7429
4058 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4060 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
4062 echo "'ls' should fail because of dangling name entry"
4063 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4064 error "(6) ls should fail."
4066 echo "Trigger namespace LFSCK to find out dangling name entry"
4067 $START_NAMESPACE -r -C ||
4068 error "(7) Fail to start LFSCK for namespace"
4070 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4071 mdd.${MDT_DEV}.lfsck_namespace |
4072 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4074 error "(8) unexpected status"
4077 local repaired=$($SHOW_NAMESPACE |
4078 awk '/^dangling_repaired/ { print $2 }')
4079 [ $repaired -eq 1 ] ||
4080 error "(9) Fail to repair dangling name entry: $repaired"
4082 repaired=$($SHOW_NAMESPACE |
4083 awk '/^multiple_linked_repaired/ { print $2 }')
4084 [ $repaired -eq 1 ] ||
4085 error "(10) Fail to drop the former created object: $repaired"
4087 local data=$(cat $DIR/$tdir/d0/foo)
4088 [ "$data" == "dummy" ] ||
4089 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
4091 run_test 23b "LFSCK can repair dangling name entry (2)"
4094 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4095 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4096 mdd.${MDT_DEV}.lfsck_namespace |
4097 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4099 error "(10) unexpected status"
4102 stop_full_debug_logging
4106 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4107 skip "MDS older than 2.6.50, LU-5512"
4110 echo "The objectA has multiple hard links, one of them corresponding"
4111 echo "to the name entry_B. But there is something wrong for the name"
4112 echo "entry_B and cause entry_B to references non-exist object_C."
4113 echo "In the first-stage scanning, the LFSCK will think the entry_B"
4114 echo "as dangling, and re-create the lost object_C. And then others"
4115 echo "modified the re-created object_C. When the LFSCK comes to the"
4116 echo "second-stage scanning, it will find that the former re-creating"
4117 echo "object_C maybe wrong and try to replace the object_C with the"
4118 echo "real object_A. But because object_C has been modified, so the"
4119 echo "LFSCK cannot replace it."
4122 start_full_debug_logging
4123 stack_trap stop_full_debug_logging
4125 check_mount_and_prep
4127 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
4128 parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
4129 echo "parent_fid=$parent_fid"
4131 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
4133 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
4134 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4135 echo "f0_fid=$f0_fid"
4137 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
4138 f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
4139 echo "f1_fid=$f1_fid"
4141 if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
4142 # f0 and f1 must share one FID seq (the text before the first ':'); the injection below passes only f1's OID via fail_val, so re-create f0 if the seqs differ
4143 rm -f $DIR/$tdir/d0/f0 ||
4144 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
4145 echo "dummy" > $DIR/$tdir/d0/f0 ||
4146 error "(3.2) Fail to touch on MDT0"
4147 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4148 echo "f0_fid=$f0_fid (replaced)"
4151 local oid=$(awk -F':' '{ printf $2 }' <<< $f1_fid)
4153 echo "Inject failure stub on MDT0 to simulate dangling name entry"
4154 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
4155 do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
4156 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4157 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4159 # If there is creation after the dangling injection, it may re-use
4160 # the just released local object (inode) that is referenced by the
4161 # dangling name entry. It will fail the dangling injection.
4162 # So before deleting the target object for the dangling name entry,
4163 # remove some other objects to avoid the target object being reused
4164 # by some potential creations. LU-7429
4165 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4167 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
4169 echo "'ls' should fail because of dangling name entry"
4170 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4171 error "(6) ls should fail."
4173 #define OBD_FAIL_LFSCK_DELAY3 0x1602
4174 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
4176 echo "Trigger namespace LFSCK to find out dangling name entry"
4177 $START_NAMESPACE -r -C ||
4178 error "(7) Fail to start LFSCK for namespace"
4180 wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
4181 # While unexpected by the test, it is valid for LFSCK to repair
4182 # the link to the original object before any data is written.
4183 local size=$(stat -c %s $DIR/$tdir/d0/foo)
4185 if [ "$size" = "6" -a "$(<$DIR/$tdir/d0/foo)" = "dummy" ]; then
4186 log "LFSCK repaired file prematurely"
4191 stat $DIR/$tdir/d0/foo
4193 error "(8) unexpected size"
4196 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
4197 cancel_lru_locks osc
4201 local repaired=$($SHOW_NAMESPACE |
4202 awk '/^dangling_repaired/ { print $2 }')
4203 [ $repaired -eq 1 ] ||
4204 error "(11) Fail to repair dangling name entry: $repaired"
4206 local data=$(cat $DIR/$tdir/d0/foo)
4207 [ "$data" != "dummy" ] ||
4208 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
4210 run_test 23c "LFSCK can repair dangling name entry (3)"
4213 (( MDSCOUNT >= 2 )) || skip "needs >= 2 MDTs"
4214 [[ $mds1_FSTYPE == ldiskfs ]] ||
4215 skip "ldiskfs-only test due to a low-level mds fs access"
4218 $LFS mkdir -i 0 $DIR/$tdir/mdt0dir
4219 $LFS mkdir -i 1 $DIR/$tdir/mdt1dir
4221 echo "b-a-r" > $DIR/$tdir/mdt0dir/foo
4222 local foofid=$($LFS path2fid $DIR/$tdir/mdt0dir/foo | sed -E 's/^.(.*).$/\1/')
4224 mv $DIR/$tdir/mdt0dir/foo $DIR/$tdir/mdt1dir/
4228 local devname=$(mdsdevname 1)
4229 local cmd="$DEBUGFS -w -R \\\"rm /REMOTE_PARENT_DIR/${foofid}\\\" $devname"
4230 do_facet mds1 "$cmd"
4232 start mds1 $devname $MDS_MOUNT_OPTS || error "start mds1 failed"
4234 cat $DIR/$tdir/mdt1dir/foo && error "file read should fail"
4236 do_facet mds2 $LCTL lfsck_start -M ${FSNAME}-MDT0001 -t namespace -C ||
4237 error "lfsck namespace failed to start"
4238 wait_update_facet mds2 "$LCTL get_param -n \
4239 mdd.${FSNAME}-MDT0001.lfsck_namespace |
4240 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
4241 error " unexpected lfsck status"
4243 cat $DIR/$tdir/mdt1dir/foo || error "file read should succeed"
4245 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -o ||
4246 error "lfsck layout failed to start"
4248 # lfsck -t layout -o broadcasts all MDTs to perform lfsck layout,
4250 local count=$(do_facet mds1 $LCTL lfsck_query -t layout -w |
4251 awk '/layout_mdts_completed:/ { print $2 }')
4252 (( ${count:-0} == MDSCOUNT )) ||
4253 error "Only $count/$MDSCOUNT lfsck completed"
4255 cmp $DIR/$tdir/mdt1dir/foo <(echo "b-a-r") || error "file body has changed"
4257 run_test 23d "LFSCK can repair a dangling name entry to a remote object"
4260 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4261 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4262 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4263 skip "MDS older than 2.6.50, LU-5513"
4266 echo "Two MDT-objects back reference the same name entry via their"
4267 echo "each own linkEA entry, but the name entry only references one"
4268 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
4269 echo "for the MDT-object that is not recognized. If such MDT-object"
4270 echo "has no other linkEA entry after the removing, then the LFSCK"
4271 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
4274 check_mount_and_prep
4276 mkdir_on_mdt -i1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4278 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
4279 $LFS path2fid $DIR/$tdir/d0/guard
4281 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
4282 $LFS path2fid $DIR/$tdir/d0/dummy
4285 if [ $mds1_FSTYPE != ldiskfs ]; then
4286 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
4288 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
4291 touch $DIR/$tdir/d0/guard/foo ||
4292 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
4294 echo "Inject failure stub on MDT0 to simulate the case that"
4295 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
4296 echo "that references $DIR/$tdir/d0/guard/foo."
4297 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
4298 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
4299 echo "there with the same linkEA entry as another MDT-object"
4300 echo "$DIR/$tdir/d0/guard/foo has"
4302 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
4303 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
4304 mkdir_on_mdt -i0 $DIR/$tdir/d0/dummy/foo ||
4305 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
4306 $LFS path2fid $DIR/$tdir/d0/dummy/foo
4307 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
4308 rmdir $DIR/$tdir/d0/dummy/foo ||
4309 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
4310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4312 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4313 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4314 error "(6) stat successfully unexpectedly"
4316 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4317 $START_NAMESPACE -A -r ||
4318 error "(7) Fail to start LFSCK for namespace"
4320 wait_all_targets_blocked namespace completed 8
4322 local repaired=$($SHOW_NAMESPACE |
4323 awk '/^multiple_referenced_repaired/ { print $2 }')
4324 [ $repaired -eq 1 ] ||
4325 error "(9) Fail to repair multiple referenced name entry: $repaired"
4327 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4328 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4329 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4331 local cname="$cfid-$pfid-D-0"
4332 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4333 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4335 run_test 24 "LFSCK can repair multiple-referenced name entry"
4338 [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs fixes dirent type"
4339 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4340 skip "MDS older than 2.6.50, LU-5515"
4343 echo "The file type in the name entry does not match the file type"
4344 echo "claimed by the referenced object. Then the LFSCK will update"
4345 echo "the file type in the name entry."
4348 check_mount_and_prep
4350 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4352 echo "Inject failure stub on MDT0 to simulate the case that"
4353 echo "the file type stored in the name entry is wrong."
4355 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
4356 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4357 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4358 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4360 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4361 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
4363 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4364 mdd.${MDT_DEV}.lfsck_namespace |
4365 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4367 error "(4) unexpected status"
4370 local repaired=$($SHOW_NAMESPACE |
4371 awk '/^bad_file_type_repaired/ { print $2 }')
4372 [ $repaired -eq 1 ] ||
4373 error "(5) Fail to repair bad file type in name entry: $repaired"
4375 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4377 run_test 25 "LFSCK can repair bad file type in the name entry"
4380 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4381 skip "MDS older than 2.6.50, LU-5516"
4384 echo "The local name entry back referenced by the MDT-object is lost."
4385 echo "The namespace LFSCK will add the missing local name entry back"
4386 echo "to the normal namespace."
4389 check_mount_and_prep
4391 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4392 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4393 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4395 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4396 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4398 echo "Inject failure stub on MDT0 to simulate the case that"
4399 echo "foo's name entry will be removed, but the foo's object"
4400 echo "and its linkEA are kept in the system."
4402 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4403 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4404 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4405 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4407 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4408 error "(5) 'ls' should fail"
4410 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4411 $START_NAMESPACE -r -A ||
4412 error "(6) Fail to start LFSCK for namespace"
4414 wait_all_targets_blocked namespace completed 7
4416 local repaired=$($SHOW_NAMESPACE |
4417 awk '/^lost_dirent_repaired/ { print $2 }')
4418 [ $repaired -eq 1 ] ||
4419 error "(8) Fail to repair lost dirent: $repaired"
4421 ls -ail $DIR/$tdir/d0/foo ||
4422 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
4424 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4425 [ "$foofid" == "$foofid2" ] ||
4426 error "(10) foo's FID changed: $foofid, $foofid2"
4428 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
4431 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4432 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4433 skip "MDS older than 2.6.50, LU-5516"
4436 echo "The remote name entry back referenced by the MDT-object is lost."
4437 echo "The namespace LFSCK will add the missing remote name entry back"
4438 echo "to the normal namespace."
4441 check_mount_and_prep
4443 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4444 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
4445 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4447 echo "Inject failure stub on MDT0 to simulate the case that"
4448 echo "foo's name entry will be removed, but the foo's object"
4449 echo "and its linkEA are kept in the system."
4451 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4452 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4453 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4454 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4456 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4457 error "(4) 'ls' should fail"
4459 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4460 $START_NAMESPACE -r -A ||
4461 error "(5) Fail to start LFSCK for namespace"
4463 wait_all_targets_blocked namespace completed 6
4465 local repaired=$($SHOW_NAMESPACE |
4466 awk '/^lost_dirent_repaired/ { print $2 }')
4467 [ $repaired -eq 1 ] ||
4468 error "(7) Fail to repair lost dirent: $repaired"
4470 ls -ail $DIR/$tdir/d0/foo ||
4471 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
4473 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4474 [ "$foofid" == "$foofid2" ] ||
4475 error "(9) foo's FID changed: $foofid, $foofid2"
4477 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
4480 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4481 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4482 skip "MDS older than 2.6.50, LU-5516"
4485 echo "The local parent referenced by the MDT-object linkEA is lost."
4486 echo "The namespace LFSCK will re-create the lost parent as orphan."
4489 check_mount_and_prep
4491 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4492 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4493 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4494 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4496 echo "Inject failure stub on MDT0 to simulate the case that"
4497 echo "foo's name entry will be removed, but the foo's object"
4498 echo "and its linkEA are kept in the system. And then remove"
4499 echo "another hard link and the parent directory."
4501 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4502 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4503 rm -f $DIR/$tdir/d0/foo ||
4504 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4505 rm -f $DIR/$tdir/d0/dummy ||
4506 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4507 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4509 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
4510 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
4512 echo "Trigger namespace LFSCK to repair the lost parent"
4513 $START_NAMESPACE -r -A ||
4514 error "(6) Fail to start LFSCK for namespace"
4516 wait_all_targets_blocked namespace completed 7
4518 local repaired=$($SHOW_NAMESPACE |
4519 awk '/^lost_dirent_repaired/ { print $2 }')
4520 [ $repaired -eq 1 ] ||
4521 error "(8) Fail to repair lost dirent: $repaired"
4523 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4524 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4525 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4527 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
4529 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*') # pattern quoted so find, not the shell, expands it
4530 [ -n "$cname" ] ||
4531 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4533 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
4536 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4537 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4538 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4539 skip "MDS older than 2.6.50, LU-5516"
4542 echo "The remote parent referenced by the MDT-object linkEA is lost."
4543 echo "The namespace LFSCK will re-create the lost parent as orphan."
4546 check_mount_and_prep
4548 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4549 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
4551 $LFS path2fid $DIR/$tdir/d0
4553 echo "Inject failure stub on MDT0 to simulate the case that"
4554 echo "foo's name entry will be removed, but the foo's object"
4555 echo "and its linkEA are kept in the system. And then remove"
4556 echo "the parent directory."
4558 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4559 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4560 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4561 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4563 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4564 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
4566 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4567 $START_NAMESPACE -r -A ||
4568 error "(6) Fail to start LFSCK for namespace"
4570 wait_all_targets_blocked namespace completed 7
4572 local repaired=$($SHOW_NAMESPACE |
4573 awk '/^lost_dirent_repaired/ { print $2 }')
4574 [ $repaired -eq 1 ] ||
4575 error "(8) Fail to repair lost dirent: $repaired"
4577 ls -ail $MOUNT/.lustre/lost+found/
4579 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4580 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4581 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4583 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
4585 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*') # pattern quoted so find, not the shell, expands it
4586 [ -n "$cname" ] ||
4587 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4589 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
4592 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4593 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4594 skip "MDS older than 2.6.50, LU-5506"
4597 echo "The target name entry is lost. The LFSCK should insert the"
4598 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4599 echo "the MDT (on which the orphan MDT-object resides) has ever"
4600 echo "failed to respond some name entry verification during the"
4601 echo "first stage-scanning, then the LFSCK should skip to handle"
4602 echo "orphan MDT-object on this MDT. But other MDTs should not"
4606 check_mount_and_prep
4607 $LFS mkdir -i 0 $DIR/$tdir/d1
4608 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4609 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
4611 $LFS mkdir -i 1 $DIR/$tdir/d2
4612 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4613 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4615 echo "Inject failure stub on MDT0 to simulate the case that"
4616 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4617 echo "and its linkEA are kept in the system. And the case that"
4618 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4619 echo "and its linkEA are kept in the system."
4621 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4622 do_facet mds1 $LCTL set_param fail_loc=0x1624
4623 do_facet mds2 $LCTL set_param fail_loc=0x1624
4624 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4625 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4626 do_facet mds1 $LCTL set_param fail_loc=0
4627 do_facet mds2 $LCTL set_param fail_loc=0
4629 cancel_lru_locks mdc
4630 cancel_lru_locks osc
4632 echo "Inject failure, to simulate the MDT0 fail to handle"
4633 echo "MDT1 LFSCK request during the first-stage scanning."
4634 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
4635 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4637 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4638 $START_NAMESPACE -r -A ||
4639 error "(3) Fail to start LFSCK for namespace"
4641 wait_update_facet mds1 "$LCTL get_param -n \
4642 mdd.$(facet_svc mds1).lfsck_namespace |
4643 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4644 error "(4) mds1 is not the expected 'partial'"
4647 wait_update_facet mds2 "$LCTL get_param -n \
4648 mdd.$(facet_svc mds2).lfsck_namespace |
4649 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4650 error "(5) mds2 is not the expected 'completed'"
4653 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
4655 local repaired=$(do_facet mds1 $LCTL get_param -n \
4656 mdd.$(facet_svc mds1).lfsck_namespace |
4657 awk '/^lost_dirent_repaired/ { print $2 }')
4658 [ $repaired -eq 0 ] ||
4659 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4661 repaired=$(do_facet mds2 $LCTL get_param -n \
4662 mdd.$(facet_svc mds2).lfsck_namespace |
4663 awk '/^lost_dirent_repaired/ { print $2 }')
4664 [ $repaired -eq 1 ] ||
4665 error "(7) Expect 1 fixed on mds2, but got: $repaired"
4667 echo "Trigger namespace LFSCK on all devices again to cleanup"
4668 $START_NAMESPACE -r -A ||
4669 error "(8) Fail to start LFSCK for namespace"
4671 wait_all_targets_blocked namespace completed 9
4673 local repaired=$(do_facet mds1 $LCTL get_param -n \
4674 mdd.$(facet_svc mds1).lfsck_namespace |
4675 awk '/^lost_dirent_repaired/ { print $2 }')
4676 [ $repaired -eq 1 ] ||
4677 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4679 repaired=$(do_facet mds2 $LCTL get_param -n \
4680 mdd.$(facet_svc mds2).lfsck_namespace |
4681 awk '/^lost_dirent_repaired/ { print $2 }')
4682 [ $repaired -eq 0 ] ||
4683 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4685 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
4688 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4689 skip "MDS older than 2.6.50, LU-5517"
4692 echo "The object's nlink attribute is larger than the object's known"
4693 echo "name entries count. The LFSCK will repair the object's nlink"
4694 echo "attribute to match the known name entries count"
4697 check_mount_and_prep
4699 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4700 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4702 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4703 echo "nlink attribute is larger than its name entries count."
4705 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
4706 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4707 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4708 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4709 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4711 cancel_lru_locks mdc
4712 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4713 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4715 echo "Trigger namespace LFSCK to repair the nlink count"
4716 $START_NAMESPACE -r -A ||
4717 error "(5) Fail to start LFSCK for namespace"
4719 wait_all_targets_blocked namespace completed 6
4721 local repaired=$($SHOW_NAMESPACE |
4722 awk '/^nlinks_repaired/ { print $2 }')
4723 [ $repaired -eq 1 ] ||
4724 error "(7) Fail to repair nlink count: $repaired"
4726 cancel_lru_locks mdc
4727 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4728 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4730 # Disable 29a: we only allow nlink to be updated if the number of known
4731 # linkEA entries is larger than the nlink count.
4733 #run_test 29a "LFSCK can repair bad nlink count (1)"
# test 29b: the namespace LFSCK must raise an nlink count that is smaller
# than the number of name entries. fail_loc 0x1626 is armed on $SINGLEMDS
# while hard link h1 is created; the checks below expect nlink 1 before the
# scan, and nlinks_repaired == 1 plus nlink 2 after it.
4736 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4737 skip "MDS older than 2.6.50, LU-5517"
4740 echo "The object's nlink attribute is smaller than the object's known"
4741 echo "name entries count. The LFSCK will repair the object's nlink"
4742 echo "attribute to match the known name entries count"
4745 check_mount_and_prep
4747 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4748 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4750 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4751 echo "nlink attribute is smaller than its name entries count."
# The fail_loc is armed only around the ln call and cleared right after it.
4753 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
4754 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4755 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4756 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4757 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Confirm the injection worked: two names exist but stat must report nlink 1.
# NOTE(review): cancel_lru_locks presumably forces stat to refetch
# attributes from the MDS - confirm against test-framework.sh.
4759 cancel_lru_locks mdc
4760 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4761 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4763 echo "Trigger namespace LFSCK to repair the nlink count"
4764 $START_NAMESPACE -r -A ||
4765 error "(5) Fail to start LFSCK for namespace"
4767 wait_all_targets_blocked namespace completed 6
# Exactly one nlink repair is expected in the namespace LFSCK statistics.
4769 local repaired=$($SHOW_NAMESPACE |
4770 awk '/^nlinks_repaired/ { print $2 }')
4771 [ $repaired -eq 1 ] ||
4772 error "(7) Fail to repair nlink count: $repaired"
# After the repair the client must see nlink 2 (foo and h1).
4774 cancel_lru_locks mdc
4775 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4776 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4778 run_test 29b "LFSCK can repair bad nlink count (2)"
# test 29c: checks how the linkEA overflow mark interacts with migration and
# the namespace LFSCK. 150 hard links to guard/f0 are created under foo/,
# migration of guard is attempted, 100 links are removed, migration is
# attempted again, and then the namespace LFSCK is expected to report
# linkea_overflow_cleared == 1 and nlinks_repaired == 0. With >= 2 MDTs a
# final migration must change f0's FID.
4782 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4783 skip "MDS older than 2.6.50, LU-5517"
4786 echo "The namespace LFSCK will create many hard links to the target"
4787 echo "file as to exceed the linkEA size limitation. Under such case"
4788 echo "the linkEA will be marked as overflow that will prevent the"
4789 echo "target file to be migrated. Then remove some hard links to"
4790 echo "make the left hard links to be held within the linkEA size"
4791 echo "limitation. But before the namespace LFSCK adding all the"
4792 echo "missed linkEA entries back, the overflow mark (timestamp)"
4793 echo "will not be cleared."
4796 check_mount_and_prep
4798 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
4799 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4800 error "(0.2) Fail to mkdir"
4801 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# Remember the FID of f0; every later check compares against it to decide
# whether the file object itself was migrated.
4802 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4804 # define MAX_LINKEA_SIZE 4096
4805 # sizeof(link_ea_header) = 24
4806 # sizeof(link_ea_entry) = 18
4807 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4808 # (sizeof(link_ea_entry) + name_length))
4809 # If the average name length is 12 bytes, then 150 hard links
4810 # is totally enough to overflow the linkEA
4811 echo "Create 150 hard links should succeed although the linkEA overflow"
4812 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4813 error "(2) Fail to hard link"
4815 cancel_lru_locks mdc
# linked_file_migrate holds the command name "true" or "false" and is
# executed directly in the conditions below. It is "true" only when the MDS
# version is at least 2.16.50.
4817 local linked_file_migrate=false
4818 (( $MDS1_VERSION >= $(version_code 2.16.50) )) &&
4819 linked_file_migrate=true
# Servers >= 2.16.50: the migrate command is expected to succeed while f0
# keeps its FID.
4821 if (( $MDSCOUNT >= 2 )) && $linked_file_migrate; then
4822 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4823 error "(3.1) Migrate should succeed"
4825 echo "The object with linkEA overflow should NOT be migrated"
4826 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4827 [ "$newfid" == "$oldfid" ] ||
4828 error "(3.2) The file with overflowed LinkEA should not migrate: $newfid != $oldfid"
# Older servers: the migrate command itself is expected to fail.
4830 if (( $MDSCOUNT >= 2 )) && ! $linked_file_migrate; then
4831 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4832 error "(3.1) Migrate should fail"
4834 echo "The object with linkEA overflow should NOT be migrated"
4835 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4836 [ "$newfid" == "$oldfid" ] ||
4837 error "(3.2) Migrate should fail: $newfid != $oldfid"
4840 # Remove 100 hard links, then the linkEA should have space
4841 # to hold the missed linkEA entries.
4842 echo "Remove 100 hard links to save space for the missed linkEA entries"
4843 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
# Same two version-dependent expectations as above, repeated after the
# unlink: f0 must still carry its original FID in both cases.
4845 if (( $MDSCOUNT >= 2 )) && $linked_file_migrate; then
4846 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4847 error "(5.1) Migrate should succeed"
4849 # The overflow timestamp is still there, so migration
4850 # should not migrate the file with LinkEA overflow timestamp
4851 # but migrate only name
4852 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4853 [ "$newfid" == "$oldfid" ] ||
4854 error "(5.2) The file should not migrate: $newfid != $oldfid"
4856 if (( $MDSCOUNT >= 2 )) && ! $linked_file_migrate; then
4857 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4858 error "(5.1) Migrate should fail"
4860 # The overflow timestamp is still there, so migration will fail.
4861 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4862 [ "$newfid" == "$oldfid" ] ||
4863 error "(5.2) Migrate should fail: $newfid != $oldfid"
# NOTE(review): no command performing the delay described on the next line
# is visible in this listing - confirm it is present in the real script.
4866 # sleep 3 seconds to guarantee that the overflow is recognized
4869 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4870 $START_NAMESPACE -r -A ||
4871 error "(6) Fail to start LFSCK for namespace"
4873 wait_all_targets_blocked namespace completed 7
# The scan must clear exactly one overflow mark ...
4875 local repaired=$($SHOW_NAMESPACE |
4876 awk '/^linkea_overflow_cleared/ { print $2 }')
4877 [ $repaired -eq 1 ] ||
4878 error "(8) Fail to clear linkea overflow: $repaired"
# ... and must not report any nlink repair.
4880 repaired=$($SHOW_NAMESPACE |
4881 awk '/^nlinks_repaired/ { print $2 }')
4882 [ $repaired -eq 0 ] ||
4883 error "(9) Unexpected nlink repaired: $repaired"
4885 if [ $MDSCOUNT -ge 2 ]; then
4886 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4887 error "(10.1) Migrate failure"
4889 # Migration should succeed after clear the overflow timestamp.
4890 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4891 [ "$newfid" != "$oldfid" ] ||
4892 error "(10.2) Migrate should succeed"
4894 ls -l $DIR/$tdir/foo > /dev/null ||
4895 error "(11) 'ls' failed after migration"
4898 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4899 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4901 run_test 29c "verify linkEA size limitation"
# test 29d: ldiskfs only. Creates hard link h1 with fail_loc 0x1626 armed on
# $SINGLEMDS, removes h1 again, then stats foo and verifies that a new file
# can still be created in the same directory. The name entry for foo is
# finally dropped with "lfs rm_entry".
4904 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4905 skip "MDS older than 2.6.50, LU-5517"
4906 [[ $mds1_FSTYPE == ldiskfs ]] || skip "ldiskfs only problem"
4909 echo "The object's nlink attribute is smaller than the object's known"
4910 echo "name entries count. The LFSCK will repair the object's nlink"
4911 echo "attribute to match the known name entries count"
4914 check_mount_and_prep
4916 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4917 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4919 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4920 echo "nlink attribute is smaller than its name entries count."
4922 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
4923 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4924 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4925 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4926 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing h1 after the injected link is the step that leaves foo's name
# entry pointing at a missing inode (per the comment before the stat below).
4927 rm $DIR/$tdir/d0/h1 || error "can't remove link"
4929 cancel_lru_locks mdc
4930 # try to access non-existing inode
# The exit status of stat is deliberately not checked; the touch that
# follows is the actual assertion that the filesystem is still writable.
4931 stat $DIR/$tdir/d0/foo
4932 touch $DIR/$tdir/d0/foo0 || error "can't create new file"
# Clean up the name entry for foo; the result is not checked.
4934 $LFS rm_entry $DIR/$tdir/d0/foo
4937 run_test 29d "accessing non-existing inode shouldn't turn fs read-only (ldiskfs)"
# test 30: ldiskfs only, and not with FILESET. Verifies that the namespace
# LFSCK recovers orphans that e2fsck placed in the backend /lost+found.
# Steps: create foo/f0 and foo/d0 (d0 with fail_loc 0x161d armed), populate
# d0, remove everything with fail_loc 0x1624 armed, stop the MDT, run
# e2fsck, restart the MDT, run the namespace LFSCK and expect
# local_lost_found_moved >= 4, f0 back in place, and d0 with its children
# under .lustre/lost+found/MDT0000/.
4940 [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs has lost+found"
4941 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4942 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4943 skip "MDS older than 2.6.50, LU-5518"
4946 echo "The namespace LFSCK will move the orphans from backend"
4947 echo "/lost+found directory to normal client visible namespace"
4948 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
4951 check_mount_and_prep
4953 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
# The message names f0, the file actually created here (was "f1").
4954 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
4956 echo "Inject failure stub on MDT0 to simulate the case that"
4957 echo "directory d0 has no linkEA entry, then the LFSCK will"
4958 echo "move it into .lustre/lost+found/MDTxxxx/ later."
4960 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4961 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4962 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4963 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Parent and child FIDs are recorded now; they are used to build the
# expected lost+found name of d0 further down.
4965 local pfid=$($LFS path2fid $DIR/$tdir/foo)
4966 local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)
4968 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4969 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4971 echo "Inject failure stub on MDT0 to simulate the case that the"
4972 echo "object's name entry will be removed, but not destroy the"
4973 echo "object. Then backend e2fsck will handle it as orphan and"
4974 echo "add them into the backend /lost+found directory."
4976 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4977 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4978 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4979 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4980 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4981 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4982 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The client and the MDT are stopped before e2fsck is run on the MDT device.
4984 umount_client $MOUNT || error "(10) Fail to stop client!"
4986 stop $SINGLEMDS || error "(11) Fail to stop $SINGLEMDS"
4988 local dev=$(facet_device $SINGLEMDS)
4990 echo "run e2fsck on $SINGLEMDS"
4991 run_e2fsck $(facet_active_host $SINGLEMDS) $dev "-y" ||
4992 error "(12) Fail to run e2fsck"
4994 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 13
4996 echo "Trigger namespace LFSCK to recover backend orphans"
4997 $START_NAMESPACE -r -A ||
4998 error "(14) Fail to start LFSCK for namespace"
5000 wait_all_targets_blocked namespace completed 15
# Four objects were removed above (d1, f1, d0, f0), hence the lower bound 4.
5002 local repaired=$($SHOW_NAMESPACE |
5003 awk '/^local_lost_found_moved/ { print $2 }')
5004 [ $repaired -ge 4 ] ||
5005 error "(16) Fail to recover backend orphans: $repaired"
5007 mount_client $MOUNT || error "(17) Fail to start client!"
5009 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
5011 ls -ail $MOUNT/.lustre/lost+found/
5013 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
5014 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
5015 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
5017 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
5019 local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
# Check that the recovered directory really exists. The previous test,
# [ ! -z "$cname" ], was always true because cname is a freshly built,
# non-empty path string.
5020 [ -d "$cname" ] || error "(20) d0 is not recovered"
5022 stat ${cname}/d1 || error "(21) d1 is not recovered"
5023 stat ${cname}/f1 || error "(22) f1 is not recovered"
5025 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test 31a: needs >= 2 MDTs. With fail_loc 0x1628 / fail_val=0 set on the
# client, $MDSCOUNT sub-directories are created in a striped directory. The
# namespace LFSCK is then expected to report name_hash_repaired >= 1, and
# every dN must be stat-able and removable after a client remount.
5028 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5029 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5030 skip "MDS older than 2.6.50, LU-5519"
5033 echo "For the name entry under a striped directory, if the name"
5034 echo "hash does not match the shard, then the LFSCK will repair"
5035 echo "the bad name entry"
5038 check_mount_and_prep
5040 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5041 error "(1) Fail to create striped directory"
5043 echo "Inject failure stub on client to simulate the case that"
5044 echo "some name entry should be inserted into other non-first"
5045 echo "shard, but inserted into the first shard by wrong"
# Unlike most tests here, set_param runs locally (no do_facet): the failure
# is injected on the client.
5047 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
5048 $LCTL set_param fail_loc=0x1628 fail_val=0
5049 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
5050 error "(2) Fail to create file under striped directory"
5051 $LCTL set_param fail_loc=0 fail_val=0
5053 echo "Trigger namespace LFSCK to repair bad name hash"
5054 $START_NAMESPACE -r -A ||
5055 error "(3) Fail to start LFSCK for namespace"
5057 wait_all_targets_blocked namespace completed 4
5059 local repaired=$($SHOW_NAMESPACE |
5060 awk '/^name_hash_repaired/ { print $2 }')
5061 [ $repaired -ge 1 ] ||
5062 error "(5) Fail to repair bad name hash: $repaired"
# rc counts the entries that "lfs find -H badtype" reports for striped_dir.
# NOTE(review): the condition that guards the following error call is not
# visible in this listing - confirm the expected value of rc.
5064 local rc=$($LFS find -H badtype $DIR/$tdir/striped_dir | wc -l)
5066 error "Fail to find flag bad type: $rc"
# Remount the client before the entries are accessed again.
5068 umount_client $MOUNT || error "(6) umount failed"
5069 mount_client $MOUNT || error "(7) mount failed"
5071 for ((i = 0; i < $MDSCOUNT; i++)); do
5072 stat $DIR/$tdir/striped_dir/d$i ||
5073 error "(8) Fail to stat d$i after LFSCK"
5074 rmdir $DIR/$tdir/striped_dir/d$i ||
5075 error "(9) Fail to unlink d$i after LFSCK"
5078 rmdir $DIR/$tdir/striped_dir ||
5079 error "(10) Fail to remove the striped directory after LFSCK"
5081 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test 31b: needs >= 2 MDTs. Variant of 31a with fail_val=1 and
# MDSCOUNT * 5 sub-directories. The name_hash_repaired counter is read from
# mds2 and must be >= 1; afterwards every dN must be stat-able and removable
# after a client remount.
5084 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5085 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5086 skip "MDS older than 2.6.50, LU-5519"
5089 echo "For the name entry under a striped directory, if the name"
5090 echo "hash does not match the shard, then the LFSCK will repair"
5091 echo "the bad name entry"
5094 check_mount_and_prep
5096 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5097 error "(1) Fail to create striped directory"
5099 echo "Inject failure stub on client to simulate the case that"
5100 echo "some name entry should be inserted into other non-second"
5101 echo "shard, but inserted into the secod shard by wrong"
# set_param runs locally (no do_facet): the failure is injected on the client.
5103 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
5104 $LCTL set_param fail_loc=0x1628 fail_val=1
5105 createmany -d $DIR/$tdir/striped_dir/d $((MDSCOUNT * 5)) ||
5106 error "(2) Fail to create file under striped directory"
5107 $LCTL set_param fail_loc=0 fail_val=0
5109 echo "Trigger namespace LFSCK to repair bad name hash"
5110 $START_NAMESPACE -r -A ||
5111 error "(3) Fail to start LFSCK for namespace"
5113 wait_all_targets_blocked namespace completed 4
# The repair counter is queried on mds2 here, not through $SHOW_NAMESPACE.
5115 local repaired=$(do_facet mds2 $LCTL get_param -n \
5116 mdd.$(facet_svc mds2).lfsck_namespace |
5117 awk '/^name_hash_repaired/ { print $2 }')
5118 echo "repaired $repaired name entries with bad hash"
5119 [ $repaired -ge 1 ] ||
5120 error "(5) Fail to repair bad name hash: $repaired"
5122 umount_client $MOUNT || error "(6) umount failed"
5123 mount_client $MOUNT || error "(7) mount failed"
5125 for ((i = 0; i < $((MDSCOUNT * 5)); i++)); do
5126 stat $DIR/$tdir/striped_dir/d$i ||
5127 error "(8) Fail to stat d$i after LFSCK"
5128 rmdir $DIR/$tdir/striped_dir/d$i ||
5129 error "(9) Fail to unlink d$i after LFSCK"
5132 rmdir $DIR/$tdir/striped_dir ||
5133 error "(10) Fail to remove the striped directory after LFSCK"
5135 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test 31c: needs >= 2 MDTs. A striped directory is created while fail_loc
# 0x1629 is armed on $SINGLEMDS. The namespace LFSCK is expected to report
# striped_dirs_repaired == 1, "lfs find -H lostlmv" must list one entry, and
# after a client remount the directory must be empty and removable.
5138 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5139 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5140 skip "MDS older than 2.6.50, LU-5519"
5143 echo "For some reason, the master MDT-object of the striped directory"
5144 echo "may lost its master LMV EA. If nobody created files under the"
5145 echo "master directly after the master LMV EA lost, then the LFSCK"
5146 echo "should re-generate the master LMV EA."
5149 check_mount_and_prep
5151 echo "Inject failure stub on MDT0 to simulate the case that the"
5152 echo "master MDT-object of the striped directory lost the LMV EA."
5154 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
5155 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
5156 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5157 error "(1) Fail to create striped directory"
5158 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
5160 echo "Trigger namespace LFSCK to re-generate master LMV EA"
5161 $START_NAMESPACE -r -A ||
5162 error "(2) Fail to start LFSCK for namespace"
5164 wait_all_targets_blocked namespace completed 3
5166 local repaired=$($SHOW_NAMESPACE |
5167 awk '/^striped_dirs_repaired/ { print $2 }')
5168 [ $repaired -eq 1 ] ||
5169 error "(4) Fail to re-generate master LMV EA: $repaired"
# Exactly one entry is expected from the lostlmv flag search.
5171 local rc=$($LFS find -H lostlmv $DIR/$tdir/striped_dir | wc -l)
5172 [ $rc -eq 1 ] || error "Fail to find flag lost LMV: $rc"
5174 umount_client $MOUNT || error "(5) umount failed"
5175 mount_client $MOUNT || error "(6) mount failed"
# Nothing was created inside striped_dir, so ls must print nothing.
5177 local empty=$(ls $DIR/$tdir/striped_dir/)
5178 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
5180 rmdir $DIR/$tdir/striped_dir ||
5181 error "(8) Fail to remove the striped directory after LFSCK"
5183 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# test 31d: needs >= 2 MDTs. Like 31c, but a file is created inside the
# striped directory before the LFSCK runs. The first scan is expected to
# report striped_dirs_repaired == 0 and to leave the directory unwritable
# (touch must fail). After "chattr -i" and removal of the file, a second
# scan is run and the directory must finally be removable.
5186 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5187 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5188 skip "MDS older than 2.6.50, LU-5519"
5191 echo "For some reason, the master MDT-object of the striped directory"
5192 echo "may lost its master LMV EA. If somebody created files under the"
5193 echo "master directly after the master LMV EA lost, then the LFSCK"
5194 echo "should NOT re-generate the master LMV EA, instead, it should"
5195 echo "change the broken striped dirctory as read-only to prevent"
5196 echo "further damage"
5199 check_mount_and_prep
5201 echo "Inject failure stub on MDT0 to simulate the case that the"
5202 echo "master MDT-object of the striped directory lost the LMV EA."
5204 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
5205 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
5206 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5207 error "(1) Fail to create striped directory"
5208 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
5210 umount_client $MOUNT || error "(2) umount failed"
# NOTE(review): mds1 is started here, but no matching stop is visible in
# this listing - confirm the MDT is stopped first in the real script.
5212 start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS
5213 mount_client $MOUNT || error "(3) mount failed"
# Modify the broken directory so that the LFSCK must not regenerate the LMV EA.
5215 touch $DIR/$tdir/striped_dir/dummy ||
5216 error "(4) Fail to touch under broken striped directory"
5218 echo "Trigger namespace LFSCK to find out the inconsistency"
5219 $START_NAMESPACE -r -A ||
5220 error "(5) Fail to start LFSCK for namespace"
5222 wait_all_targets_blocked namespace completed 6
5224 local repaired=$($SHOW_NAMESPACE |
5225 awk '/^striped_dirs_repaired/ { print $2 }')
5226 [ $repaired -eq 0 ] ||
5227 error "(7) Re-generate master LMV EA unexpected: $repaired"
5229 stat $DIR/$tdir/striped_dir/dummy ||
5230 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# Inverted check: a successful touch means the directory is still writable.
5232 touch $DIR/$tdir/striped_dir/foo &&
5233 error "(9) The broken striped directory should be read-only"
# Clear the immutable attribute so the test can clean up after itself.
5235 chattr -i $DIR/$tdir/striped_dir ||
5236 error "(10) Fail to chattr on the broken striped directory"
5238 rm -f $DIR/$tdir/striped_dir/dummy || error "(11) Fail to remove dummy"
5240 # LFSCK again to regenerate master LMV
5241 echo "Trigger namespace LFSCK to find out the inconsistency"
5242 $START_NAMESPACE -r -A ||
5243 error "(12) Fail to start LFSCK for namespace"
5245 wait_all_targets_blocked namespace completed 6
5247 # reload striped_dir to parse newly generated LMV
# NOTE(review): as above, only the start of mds1 is visible here.
5249 start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS
5251 rmdir $DIR/$tdir/striped_dir ||
5252 error "(13) Fail to remove the striped directory after LFSCK"
5254 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test 31e: needs >= 2 MDTs. A striped directory is created while fail_loc
# 0x162a / fail_val=0 is armed on $SINGLEMDS. The namespace LFSCK is expected
# to report striped_shards_repaired == 1 and the directory must be removable
# afterwards.
5257 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5258 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5259 skip "MDS older than 2.6.50, LU-5519"
5262 echo "For some reason, the slave MDT-object of the striped directory"
5263 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5264 echo "slave LMV EA."
5267 check_mount_and_prep
5269 echo "Inject failure stub on MDT0 to simulate the case that the"
5270 echo "slave MDT-object (that resides on the same MDT as the master"
5271 echo "MDT-object resides on) lost the LMV EA."
# The fail_loc is armed only around the setdirstripe call.
5273 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5274 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
5275 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5276 error "(1) Fail to create striped directory"
5277 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5279 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5280 $START_NAMESPACE -r -A ||
5281 error "(2) Fail to start LFSCK for namespace"
5283 wait_all_targets_blocked namespace completed 3
# The repair counter is read on $SINGLEMDS via $SHOW_NAMESPACE.
5285 local repaired=$($SHOW_NAMESPACE |
5286 awk '/^striped_shards_repaired/ { print $2 }')
5287 [ $repaired -eq 1 ] ||
5288 error "(4) Fail to re-generate slave LMV EA: $repaired"
5290 rmdir $DIR/$tdir/striped_dir ||
5291 error "(5) Fail to remove the striped directory after LFSCK"
5293 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test 31f: needs >= 2 MDTs. Variant of 31e with fail_val=1. The
# striped_shards_repaired counter is read from mds2 and must be 1; the
# striped directory must be removable afterwards.
# The trailing "&& return" after skip was dropped for consistency with the
# sibling tests; the version check below already relies on skip not
# returning.
5296 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5297 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5298 skip "MDS older than 2.6.50, LU-5519"
5301 echo "For some reason, the slave MDT-object of the striped directory"
5302 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5303 echo "slave LMV EA."
5306 check_mount_and_prep
5308 echo "Inject failure stub on MDT0 to simulate the case that the"
5309 echo "slave MDT-object (that resides on different MDT as the master"
5310 echo "MDT-object resides on) lost the LMV EA."
# The fail_loc is armed only around the setdirstripe call.
5312 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5313 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
5314 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5315 error "(1) Fail to create striped directory"
5316 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5318 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5319 $START_NAMESPACE -r -A ||
5320 error "(2) Fail to start LFSCK for namespace"
5322 wait_all_targets_blocked namespace completed 3
# The repair counter is queried on mds2 here, not through $SHOW_NAMESPACE.
5324 local repaired=$(do_facet mds2 $LCTL get_param -n \
5325 mdd.$(facet_svc mds2).lfsck_namespace |
5326 awk '/^striped_shards_repaired/ { print $2 }')
5327 [ $repaired -eq 1 ] ||
5328 error "(4) Fail to re-generate slave LMV EA: $repaired"
5330 rmdir $DIR/$tdir/striped_dir ||
5331 error "(5) Fail to remove the striped directory after LFSCK"
5333 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test 31g: needs >= 2 MDTs. A striped directory is created while fail_loc
# 0x162b / fail_val=0 is armed on $SINGLEMDS. The namespace LFSCK is expected
# to report striped_shards_repaired == 1; after a client remount a file must
# be creatable and removable inside the directory, and the directory itself
# must be removable.
# The trailing "&& return" after skip was dropped for consistency with the
# sibling tests; the version check below already relies on skip not
# returning.
5336 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5337 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5338 skip "MDS older than 2.6.50, LU-5519"
5341 echo "For some reason, the stripe index in the slave LMV EA is"
5342 echo "corrupted. The LFSCK should repair the slave LMV EA."
5345 check_mount_and_prep
5347 echo "Inject failure stub on MDT0 to simulate the case that the"
5348 echo "slave LMV EA on the first shard of the striped directory"
5349 echo "claims the same index as the second shard claims"
# The fail_loc is armed only around the setdirstripe call.
5351 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
5352 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
5353 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5354 error "(1) Fail to create striped directory"
5355 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5357 echo "Trigger namespace LFSCK to repair the slave LMV EA"
5358 $START_NAMESPACE -r -A ||
5359 error "(2) Fail to start LFSCK for namespace"
5361 wait_all_targets_blocked namespace completed 3
5363 local repaired=$($SHOW_NAMESPACE |
5364 awk '/^striped_shards_repaired/ { print $2 }')
5365 [ $repaired -eq 1 ] ||
5366 error "(4) Fail to repair slave LMV EA: $repaired"
# Remount the client, then prove the repaired directory is usable.
5368 umount_client $MOUNT || error "(5) umount failed"
5369 mount_client $MOUNT || error "(6) mount failed"
5371 touch $DIR/$tdir/striped_dir/foo ||
5372 error "(7) Fail to touch file after the LFSCK"
5374 rm -f $DIR/$tdir/striped_dir/foo ||
5375 error "(8) Fail to unlink file after the LFSCK"
5377 rmdir $DIR/$tdir/striped_dir ||
5378 error "(9) Fail to remove the striped directory after LFSCK"
5380 run_test 31g "Repair the corrupted slave LMV EA"
# test 31h: needs >= 2 MDTs. A striped directory is created while fail_loc
# 0x162c / fail_val=0 is armed on $SINGLEMDS. The namespace LFSCK is expected
# to report dirent_repaired == 1; after a client remount a file must be
# creatable and removable inside the directory, and the directory itself
# must be removable.
# The trailing "&& return" after skip was dropped for consistency with the
# sibling tests; the version check below already relies on skip not
# returning.
5383 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5384 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5385 skip "MDS older than 2.6.50, LU-5519"
5388 echo "For some reason, the shard's name entry in the striped"
5389 echo "directory may be corrupted. The LFSCK should repair the"
5390 echo "bad shard's name entry."
5393 check_mount_and_prep
5395 echo "Inject failure stub on MDT0 to simulate the case that the"
5396 echo "first shard's name entry in the striped directory claims"
5397 echo "the same index as the second shard's name entry claims."
# The fail_loc is armed only around the setdirstripe call.
5399 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
5400 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5401 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5402 error "(1) Fail to create striped directory"
5403 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5405 echo "Trigger namespace LFSCK to repair the shard's name entry"
5406 $START_NAMESPACE -r -A ||
5407 error "(2) Fail to start LFSCK for namespace"
5409 wait_all_targets_blocked namespace completed 3
5411 local repaired=$($SHOW_NAMESPACE |
5412 awk '/^dirent_repaired/ { print $2 }')
5413 [ $repaired -eq 1 ] ||
5414 error "(4) Fail to repair shard's name entry: $repaired"
# Remount the client, then prove the repaired directory is usable.
5416 umount_client $MOUNT || error "(5) umount failed"
5417 mount_client $MOUNT || error "(6) mount failed"
5419 touch $DIR/$tdir/striped_dir/foo ||
5420 error "(7) Fail to touch file after the LFSCK"
5422 rm -f $DIR/$tdir/striped_dir/foo ||
5423 error "(8) Fail to unlink file after the LFSCK"
5425 rmdir $DIR/$tdir/striped_dir ||
5426 error "(9) Fail to remove the striped directory after LFSCK"
5428 run_test 31h "Repair the corrupted shard's name entry"
# test 32a: with fail_loc 0x162d / fail_val=3 armed on $SINGLEMDS, starts the
# layout LFSCK and expects status "scanning-phase1". ost1 is then stopped,
# the fail_loc is cleared, and lfsck_stop must still succeed before ost1 is
# started again.
5433 umount_client $MOUNT
5435 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5436 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5437 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
# The scan must still be in its first phase when the OST is taken down.
5439 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5440 [ "$STATUS" == "scanning-phase1" ] ||
5441 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
5444 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
5446 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The actual assertion of this test: stopping the LFSCK works with ost1 down.
5450 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
5452 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5453 error "(5) Fail to start ost1"
5455 run_test 32a "stop LFSCK when some OST failed"
# test 32b: needs >= 2 MDTs. Creates dp on MDT index 1 with two striped
# sub-directories, then, with fail_loc 0x162d / fail_val=3 armed on
# $SINGLEMDS, starts the namespace LFSCK and waits for status
# "scanning-phase1". mds2 is then stopped, the fail_loc is cleared, and
# lfsck_stop must still succeed before mds2 is started again.
# The trailing "&& return" after skip was dropped for consistency with the
# other tests in this file, which call skip without a following return.
5459 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5462 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5463 error "(1) Fail to create $DIR/$tdir/dp"
5464 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5465 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5466 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5467 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5468 umount_client $MOUNT
5470 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5471 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5472 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
# Poll the namespace LFSCK status on $SINGLEMDS for up to 32 seconds.
5474 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5475 mdd.${MDT_DEV}.lfsck_namespace |
5476 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5478 error "(5) unexpected status"
5482 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
5484 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The actual assertion of this test: stopping the LFSCK works with mds2 down.
5488 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
5490 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5491 error "(8) Fail to start MDT2"
5493 run_test 32b "stop LFSCK when some MDT failed"
# test 33: checks that the options given to lfsck_start are reflected in the
# "param" line of the LFSCK status output. The layout scan is started with
# "--dryrun -o -r" and must show "dryrun,all_targets,orphan"; the namespace
# scan is started with "-e abort -A -r" and must show "failout,all_targets".
5499 $START_LAYOUT --dryrun -o -r ||
5500 error "(1) Fail to start layout LFSCK"
5501 wait_all_targets_blocked layout completed 2
5503 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5504 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5505 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
5507 $START_NAMESPACE -e abort -A -r ||
5508 error "(4) Fail to start namespace LFSCK"
5509 wait_all_targets_blocked namespace completed 5
# PARAMS is reused for the namespace scan's parameter string.
5511 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5512 [ "$PARAMS" == "failout,all_targets" ] ||
5513 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5515 run_test 33 "check LFSCK paramters"
# test 34: needs >= 2 MDTs and a ZFS MDT backend. A remote directory is
# created on MDT index 1 while fail_loc 0x1630 is armed on $SINGLEMDS. The
# first namespace LFSCK must report dirent_repaired == 1; a second run must
# report dirent_repaired == 0, i.e. nothing is left to repair.
5519 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5520 [ "$mds1_FSTYPE" != zfs ] && skip "Only valid for ZFS backend"
5524 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
5525 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5526 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5527 error "(1) Fail to create $DIR/$tdir/dummy"
5529 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
5530 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
# Poll the namespace LFSCK status on $SINGLEMDS for up to 32 seconds.
5531 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5532 mdd.${MDT_DEV}.lfsck_namespace |
5533 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5535 error "(3) unexpected status"
5538 local repaired=$($SHOW_NAMESPACE |
5539 awk '/^dirent_repaired/ { print $2 }')
5540 [ $repaired -eq 1 ] ||
5541 error "(4) Fail to repair the lost agent object: $repaired"
# Second pass: the same scan must now find nothing to repair.
5543 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5544 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5545 mdd.${MDT_DEV}.lfsck_namespace |
5546 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5548 error "(6) unexpected status"
5551 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5552 [ $repaired -eq 0 ] ||
5553 error "(7) Unexpected repairing: $repaired"
5555 run_test 34 "LFSCK can rebuild the lost agent object"
# test 35: needs >= 2 MDTs. A remote directory is created on MDT index 1
# while fail_loc 0x1631 is armed on mds2. The first namespace LFSCK must
# report agent_entries_repaired == 1 on mds2; after the filesystem is set up
# again, a second run must report agent_entries_repaired == 0.
# Fixes: the "not completed" error messages used ${k}, which is never set in
# this test, so they printed an empty MDS index; they now name MDS2, the
# facet that is actually polled. The dead "&& return" after skip was also
# dropped for consistency with the other tests in this file.
5559 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5563 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
5564 do_facet mds2 $LCTL set_param fail_loc=0x1631
5565 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5566 error "(1) Fail to create $DIR/$tdir/dummy"
5569 do_facet mds2 $LCTL set_param fail_loc=0
5570 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
# Poll the namespace LFSCK status on mds2 for up to $LTIME seconds.
5571 wait_update_facet mds2 "$LCTL get_param -n \
5572 mdd.$(facet_svc mds2).lfsck_namespace |
5573 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5574 error "(3) MDS2 is not the expected 'completed'"
5576 local repaired=$(do_facet mds2 $LCTL get_param -n \
5577 mdd.$(facet_svc mds2).lfsck_namespace |
5578 awk '/^agent_entries_repaired/ { print $2 }')
5579 [ $repaired -eq 1 ] ||
5580 error "(4) Fail to repair the lost agent entry: $repaired"
# NOTE(review): only the setupall half of the restart announced below is
# visible in this listing - confirm the stop step in the real script.
5582 echo "stopall to cleanup object cache"
5585 setupall > /dev/null
# Second pass: the same scan must now find nothing to repair.
5587 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
5588 wait_update_facet mds2 "$LCTL get_param -n \
5589 mdd.$(facet_svc mds2).lfsck_namespace |
5590 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5591 error "(6) MDS2 is not the expected 'completed'"
5593 repaired=$(do_facet mds2 $LCTL get_param -n \
5594 mdd.$(facet_svc mds2).lfsck_namespace |
5595 awk '/^agent_entries_repaired/ { print $2 }')
5596 [ $repaired -eq 0 ] ||
5597 error "(7) Unexpected repairing: $repaired"
5599 run_test 35 "LFSCK can rebuild the lost agent entry"
5602 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5605 echo "The target MDT-object's LOV EA corrupted as to lose one of the "
5606 echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
5607 echo "with the PFID EA of related OST-object(s) belong to the mirror."
5610 check_mount_and_prep
5614 lctl get_param osc.*.*grant*
5615 stack_trap "lfs df $DIR; lfs df -i $DIR; lctl get_param osc.*.*grant*"
5617 $LFS setstripe -N -E 2M -S1M -o 0,1 -E -1 -o 2 \
5618 -N -E 2M -S1M -o 1,2 -E -1 -o 0 \
5619 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5620 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5621 $LFS setstripe -N -E 2M -S1M -o 0,1 -E -1 -o 2 \
5622 -N -E 2M -S1M -o 1,2 -E -1 -o 0 \
5623 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
5624 error "(1) Fail to create mirror file $DIR/$tdir/f1"
5625 $LFS setstripe -N -E 2M -S1M -o 0,1 -E -1 -o 2 \
5626 -N -E 2M -S1M -o 1,2 -E -1 -o 0 \
5627 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
5628 error "(2) Fail to create mirror file $DIR/$tdir/f2"
5630 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5631 error "(3) Fail to write $DIR/$tdir/f0"
5632 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
5633 error "(4) Fail to write $DIR/$tdir/f1"
5634 dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
5635 error "(5) Fail to write $DIR/$tdir/f2"
5637 $LFS mirror resync $DIR/$tdir/f0 ||
5638 error "(6) Fail to resync $DIR/$tdir/f0"
5639 $LFS mirror resync $DIR/$tdir/f1 ||
5640 error "(7) Fail to resync $DIR/$tdir/f1"
5641 $LFS mirror resync $DIR/$tdir/f2 ||
5642 error "(8) Fail to resync $DIR/$tdir/f2"
5644 cancel_lru_locks mdc
5645 cancel_lru_locks osc
5647 $LFS getstripe $DIR/$tdir/f0 ||
5648 error "(9) Fail to getstripe for $DIR/$tdir/f0"
5649 $LFS getstripe $DIR/$tdir/f1 ||
5650 error "(10) Fail to getstripe for $DIR/$tdir/f1"
5651 $LFS getstripe $DIR/$tdir/f2 ||
5652 error "(11) Fail to getstripe for $DIR/$tdir/f2"
5654 echo "Inject failure, to simulate the case of missing one mirror in LOV"
5655 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5656 do_facet mds1 $LCTL set_param fail_loc=0x1616
5658 $LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
5659 error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
5660 $LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
5661 error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
5662 $LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
5663 error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"
5667 do_facet mds1 $LCTL set_param fail_loc=0
5669 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
5670 error "(15) The 1st of mirror is not destroyed"
5671 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
5672 error "(16) The 2nd of mirror is not destroyed"
5673 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
5674 error "(17) The 3rd of mirror is not destroyed"
5678 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5679 [ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
5680 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5681 [ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
5682 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5683 [ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"
5685 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5686 $START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"
5688 for k in $(seq $MDSCOUNT); do
5689 # The LFSCK status query interval is 30 seconds. For the case
5690 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5691 # time to guarantee the status sync up.
5692 wait_update_facet mds${k} "$LCTL get_param -n \
5693 mdd.$(facet_svc mds${k}).lfsck_layout |
5694 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5695 error "(22) MDS${k} is not the expected 'completed'"
5698 for k in $(seq $OSTCOUNT); do
5699 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5700 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5701 awk '/^status/ { print $2 }')
5702 [ "$cur_status" == "completed" ] ||
5703 error "(23) OST${k} Expect 'completed', but got '$cur_status'"
5706 local repaired=$(do_facet mds1 $LCTL get_param -n \
5707 mdd.$(facet_svc mds1).lfsck_layout |
5708 awk '/^repaired_orphan/ { print $2 }')
5709 [ $repaired -eq 9 ] ||
5710 error "(24) Expect 9 fixed on mds1, but got: $repaired"
5712 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5713 [ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
5714 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5715 [ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
5716 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5717 [ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"
5719 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
5720 $LFS getstripe $DIR/$tdir/f0
5721 error "(28) The 1st of mirror is not recovered"
5724 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
5725 $LFS getstripe $DIR/$tdir/f1
5726 error "(29) The 2nd of mirror is not recovered"
5729 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
5730 $LFS getstripe $DIR/$tdir/f2
5731 error "(30) The 3rd of mirror is not recovered"
5734 run_test 36a "rebuild LOV EA for mirrored file (1)"
# Test 36b (registered by the run_test line closing this block): a mirrored
# file loses its MDT-object while the related OST-objects survive; the layout
# LFSCK is expected to rebuild the LOV EA and expose the file in lost+found.
# Skipped when FILESET is set or when fewer than 3 OSTs exist (the layout
# below pins objects to OST indices 0, 1 and 2).
5737 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5738 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5741 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5742 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5743 echo "with the PFID EA of related OST-object(s) belong to the file. "
5746 check_mount_and_prep
# Create f0 with three mirrors (-N), two components (-E) per mirror, each
# component placed on explicit OST indices (-o).
5748 $LFS setstripe -N -E 2M -S1M -o 0,1 -E -1 -o 2 \
5749 -N -E 2M -S1M -o 1,2 -E -1 -o 0 \
5750 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5751 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# Save the FID now: the recovered file is looked up by a FID-based name later.
5753 local fid=$($LFS path2fid $DIR/$tdir/f0)
# Write 4MB of zeroes, then resync the mirrors.
5755 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5756 error "(1) Fail to write $DIR/$tdir/f0"
5757 $LFS mirror resync $DIR/$tdir/f0 ||
5758 error "(2) Fail to resync $DIR/$tdir/f0"
# Cancel the client's mdc/osc LRU locks before dumping the layout.
5760 cancel_lru_locks mdc
5761 cancel_lru_locks osc
5763 $LFS getstripe $DIR/$tdir/f0 ||
5764 error "(3) Fail to getstripe for $DIR/$tdir/f0"
# Unlink f0 while fail_loc 0x1616 is set on mds1 (per the echo below this
# simulates losing only the MDT-object), then clear fail_loc again.
5766 echo "Inject failure, to simulate the case of missing the MDT-object"
5767 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5768 do_facet mds1 $LCTL set_param fail_loc=0x1616
5769 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
5773 do_facet mds1 $LCTL set_param fail_loc=0
# Start the layout LFSCK with -r -o (presumably reset + orphan handling;
# confirm against the lctl lfsck_start documentation).
5775 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5776 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
# Wait for every MDT to report status 'completed'; 32 is the limit handed to
# wait_update_facet (presumably seconds, just above the 30s noted below).
5778 for k in $(seq $MDSCOUNT); do
5779 # The LFSCK status query interval is 30 seconds. For the case
5780 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5781 # time to guarantee the status sync up.
5782 wait_update_facet mds${k} "$LCTL get_param -n \
5783 mdd.$(facet_svc mds${k}).lfsck_layout |
5784 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5785 error "(6) MDS${k} is not the expected 'completed'"
# Every OST is checked once (no waiting) and must already be 'completed'.
5788 for k in $(seq $OSTCOUNT); do
5789 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5790 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5791 awk '/^status/ { print $2 }')
5792 [ "$cur_status" == "completed" ] ||
5793 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report exactly 9 repaired orphans (presumably one per OST object:
# 3 mirrors x 3 objects in the layout created above).
5796 local count=$(do_facet mds1 $LCTL get_param -n \
5797 mdd.$(facet_svc mds1).lfsck_layout |
5798 awk '/^repaired_orphan/ { print $2 }')
5799 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# The rebuilt file is expected under lost+found/MDT0000 as <fid>-R-0; it must
# again have 3 mirrors and 6 components. Note the messages still print the
# original path $DIR/$tdir/f0 although $name is what gets inspected.
5801 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5802 count=$($LFS getstripe --mirror-count $name)
5803 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5805 count=$($LFS getstripe --component-count $name)
5806 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
# Each mirror id (1, 2, 3) must show up in the recovered layout; on a miss the
# full layout is dumped before failing.
5808 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5809 $LFS getstripe $name
5810 error "(11) The 1st of mirror is not recovered"
5813 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5814 $LFS getstripe $name
5815 error "(12) The 2nd of mirror is not recovered"
5818 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5819 $LFS getstripe $name
5820 error "(13) The 3rd of mirror is not recovered"
5823 run_test 36b "rebuild LOV EA for mirrored file (2)"
# Test 36c (registered by the run_test line closing this block): like 36b,
# but the mirrored file is modified again after the resync, so one mirror is
# out of date when the MDT-object is lost. The layout LFSCK must rebuild the
# LOV EA and the per-component flags must match what was recorded beforehand.
5826 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5827 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5830 echo "The mirrored file has been modified, not resynced yet, then "
5831 echo "lost its MDT-object, but relatd OST-objects are still there. "
5832 echo "The layout LFSCK should rebuild the LOV EA and relatd status "
5833 echo "with the PFID EA of related OST-object(s) belong to the file. "
5836 check_mount_and_prep
# Create f0 with two mirrors (-N), two components (-E) per mirror, on explicit
# OST indices (-o).
5838 $LFS setstripe -N -E 2M -S1M -o 0,1 -E -1 -o 2 \
5839 -N -E 2M -S1M -o 1,2 -E -1 -o 0 $DIR/$tdir/f0 ||
5840 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# Save the FID now: the recovered file is looked up by a FID-based name later.
5842 local fid=$($LFS path2fid $DIR/$tdir/f0)
5844 # The 1st dd && resync makes all related OST-objects have been written
5845 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5846 error "(1.1) Fail to write $DIR/$tdir/f0"
5847 $LFS mirror resync $DIR/$tdir/f0 ||
5848 error "(1.2) Fail to resync $DIR/$tdir/f0"
5849 # The 2nd dd makes one mirror to be stale
5850 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5851 error "(1.3) Fail to write $DIR/$tdir/f0"
# Cancel the client's mdc/osc LRU locks before dumping the layout.
5853 cancel_lru_locks mdc
5854 cancel_lru_locks osc
5856 $LFS getstripe $DIR/$tdir/f0 ||
5857 error "(2) Fail to getstripe for $DIR/$tdir/f0"
# Record the lcme_flags values found in the first and in the last 10 lines of
# the getstripe output; they are compared with the recovered file below.
5859 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5860 awk '/lcme_flags/ { print $2 }')
5861 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5862 awk '/lcme_flags/ { print $2 }')
# Unlink f0 while fail_loc 0x1616 is set on mds1 (per the echo below this
# simulates losing only the MDT-object), then clear fail_loc again.
5864 echo "Inject failure, to simulate the case of missing the MDT-object"
5865 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5866 do_facet mds1 $LCTL set_param fail_loc=0x1616
5867 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
5871 do_facet mds1 $LCTL set_param fail_loc=0
# Start the layout LFSCK with -r -o (presumably reset + orphan handling;
# confirm against the lctl lfsck_start documentation).
5873 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5874 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
# Wait for every MDT to report status 'completed'; 32 is the limit handed to
# wait_update_facet (presumably seconds, just above the 30s noted below).
5876 for k in $(seq $MDSCOUNT); do
5877 # The LFSCK status query interval is 30 seconds. For the case
5878 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5879 # time to guarantee the status sync up.
5880 wait_update_facet mds${k} "$LCTL get_param -n \
5881 mdd.$(facet_svc mds${k}).lfsck_layout |
5882 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5883 error "(5) MDS${k} is not the expected 'completed'"
# Every OST is checked once (no waiting) and must already be 'completed'.
5886 for k in $(seq $OSTCOUNT); do
5887 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5888 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5889 awk '/^status/ { print $2 }')
5890 [ "$cur_status" == "completed" ] ||
5891 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report exactly 6 repaired orphans (presumably one per OST object:
# 2 mirrors x 3 objects in the layout created above).
5894 local count=$(do_facet mds1 $LCTL get_param -n \
5895 mdd.$(facet_svc mds1).lfsck_layout |
5896 awk '/^repaired_orphan/ { print $2 }')
5897 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
# The rebuilt file is expected under lost+found/MDT0000 as <fid>-R-0 with
# 2 mirrors and 4 components.
5899 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5900 count=$($LFS getstripe --mirror-count $name)
5901 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5903 count=$($LFS getstripe --component-count $name)
5904 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
# The flags seen at the head and at the tail of the recovered layout must
# equal the values saved before the MDT-object was removed.
5906 local flags=$($LFS getstripe $name | head -n 10 |
5907 awk '/lcme_flags/ { print $2 }')
5908 [ "$flags" == "$saved_flags1" ] || {
5909 $LFS getstripe $name
5910 error "(10) expect flags $saved_flags1, got $flags"
5913 flags=$($LFS getstripe $name | tail -n 10 |
5914 awk '/lcme_flags/ { print $2 }')
5915 [ "$flags" == "$saved_flags2" ] || {
5916 $LFS getstripe $name
5917 error "(11) expect flags $saved_flags2, got $flags"
5920 run_test 36c "rebuild LOV EA for mirrored file (3)"
# Test 37 (registered by the run_test line closing this block): runs a
# namespace LFSCK while a background multiop holds directory d0, per the test
# title to verify that LFSCK skips an orphan.
5926 local t_dir="$DIR/$tdir/d0"
5927 check_mount_and_prep
# Create d0 on MDT index 0, then start a paused background multiop on it.
5929 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
5930 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
# Start the namespace LFSCK with -r -A.
# NOTE(review): in the failure branch the multiop is signalled only after
# error(); if error() terminates the test the kill is never reached - confirm.
5934 $START_NAMESPACE -r -A || {
5935 error "(3) Fail to start LFSCK for namespace!"; kill -USR1 $PID; }
# Wait for the namespace LFSCK to report 'completed'.
5937 wait_all_targets_blocked namespace completed 4
5942 run_test 37 "LFSCK must skip a ORPHAN"
# Test 38 (registered by the run_test line closing this block): creates a file
# with a foreign LOV EA, runs namespace and layout LFSCK, and verifies that
# neither reports repairs and that the foreign layout is unchanged afterwards.
# Skipped when the MDS version is <= 2.12.51.
5946 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
5947 skip "Need MDS version newer than 2.12.51"
5949 # skip basic ops on file with foreign LOV tests on 5.12-6.2 kernels
5950 # until the filemap_read() issue is fixed by v6.2-rc4-61-g5956592ce337
5951 (( $LINUX_VERSION_CODE < $(version_code 5.12.0) ||
5952 $LINUX_VERSION_CODE >= $(version_code 6.2.0) )) ||
5953 skip "Need kernel < 5.12.0 or >= 6.2.0 for filemap_read() fix"
# Two kernel-generated UUIDs form the foreign EA value "<uuid1>@<uuid2>".
5955 test_mkdir $DIR/$tdir
5956 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5957 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5959 # create foreign file
5960 $LFS setstripe --foreign=none --flags 0xda05 \
5961 -x "${uuid1}@${uuid2}" $DIR/$tdir/$tfile ||
5962 error "$DIR/$tdir/$tfile: create failed"
# Pre-LFSCK checks: magic, length, type, flags and value of the foreign EA.
5964 $LFS getstripe -v $DIR/$tdir/$tfile |
5965 grep "lfm_magic:.*0x0BD70BD0" ||
5966 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5967 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
# 73 presumably = 36 + 1 + 36, the length of "<uuid1>@<uuid2>".
5968 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5969 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5970 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
5971 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5972 $LFS getstripe -v $DIR/$tdir/$tfile |
5973 grep "lfm_flags:.*0x0000DA05" ||
5974 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5975 $LFS getstripe $DIR/$tdir/$tfile |
5976 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5977 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5979 # modify striping should fail
5980 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5981 error "$DIR/$tdir/$tfile: setstripe should fail"
# Run the namespace LFSCK and wait until it is 'completed'.
5983 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5985 wait_all_targets_blocked namespace completed 1
5987 # check that "global" namespace_repaired == 0 !!!
5988 local repaired=$(do_facet mds1 \
5989 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5990 awk '/^namespace_repaired/ { print \\\$2 }'")
5991 [ $repaired -eq 0 ] ||
5992 error "(2) Expect no namespace repair, but got: $repaired"
# Run the layout LFSCK and wait until it is 'completed'.
5994 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5996 wait_all_targets_blocked layout completed 2
5998 # check that "global" layout_repaired == 0 !!!
# ('repaired' is declared local a second time here; the "(2)" tag is reused.)
5999 local repaired=$(do_facet mds1 \
6000 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
6001 awk '/^layout_repaired/ { print \\\$2 }'")
6002 [ $repaired -eq 0 ] ||
6003 error "(2) Expect no layout repair, but got: $repaired"
# Post-LFSCK: repeat the same foreign EA checks as before the scans.
6005 echo "post-lfsck checks of foreign file"
6007 $LFS getstripe -v $DIR/$tdir/$tfile |
6008 grep "lfm_magic:.*0x0BD70BD0" ||
6009 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
6010 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
6011 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
6012 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
6013 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
6014 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
6015 $LFS getstripe -v $DIR/$tdir/$tfile |
6016 grep "lfm_flags:.*0x0000DA05" ||
6017 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
6018 $LFS getstripe $DIR/$tdir/$tfile |
6019 grep "lfm_value:.*${uuid1}@${uuid2}" ||
6020 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
6022 # modify striping should fail
6023 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
6024 error "$DIR/$tdir/$tfile: setstripe should fail"
6026 # R/W should fail, but filemap fix v6.2-rc4-61-g5956592ce337 may be
6027 # missing in some kernels, skip read failure check
# The read result is deliberately not checked; only the write must fail.
6028 cat $DIR/$tdir/$tfile
6029 cat /etc/passwd > $DIR/$tdir/$tfile &&
6030 error "$DIR/$tdir/$tfile: write should fail"
6032 #remove foreign file
6033 rm $DIR/$tdir/$tfile ||
6034 error "$DIR/$tdir/$tfile: remove of foreign file has failed"
6036 run_test 38 "LFSCK does not break foreign file and reverse is also true"
# Test 39 (registered by the run_test line closing this block): creates a
# directory with a foreign LMV EA, runs namespace and layout LFSCK, and
# verifies that neither reports repairs and that the foreign directory keeps
# its EA and its restrictions afterwards.
# Skipped when the MDS version is <= 2.12.51.
6040 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
6041 skip "Need MDS version newer than 2.12.51"
# Two kernel-generated UUIDs form the foreign EA value "<uuid1>@<uuid2>".
6043 test_mkdir $DIR/$tdir
6044 local uuid1=$(cat /proc/sys/kernel/random/uuid)
6045 local uuid2=$(cat /proc/sys/kernel/random/uuid)
6047 # create foreign dir
6048 $LFS mkdir --foreign=none --xattr="${uuid1}@${uuid2}" --flags=0xda05 \
6049 $DIR/$tdir/${tdir}2 ||
6050 error "$DIR/$tdir/${tdir}2: create failed"
# Pre-LFSCK checks: magic, length, type, flags and value of the foreign EA.
6052 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
6053 grep "lfm_magic:.*0x0CD50CD0" ||
6054 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
6055 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
6056 # - sizeof(lfm_type) - sizeof(lfm_flags)
# 73 presumably = 36 + 1 + 36, the length of "<uuid1>@<uuid2>".
6057 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
6058 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
6059 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
6060 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
6061 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
6062 grep "lfm_flags:.*0x0000DA05" ||
6063 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
6064 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
6065 grep "lfm_value.*${uuid1}@${uuid2}" ||
6066 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
6068 # file create in dir should fail
# The message must be passed to error(); a bare string after && would only be
# executed as a (non-existent) command and the test would carry on.
6069 touch $DIR/$tdir/${tdir}2/$tfile &&
6070 error "$DIR/$tdir/${tdir}2: file create should fail"
# chmod and chown on the foreign directory are expected to succeed.
6073 chmod 777 $DIR/$tdir/${tdir}2 ||
6074 error "$DIR/$tdir/${tdir}2: chmod failed"
6077 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
6078 error "$DIR/$tdir/${tdir}2: chown failed"
# Run the namespace LFSCK and wait until it is 'completed'.
6080 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
6082 wait_all_targets_blocked namespace completed 1
6084 # check that "global" namespace_repaired == 0 !!!
6085 local repaired=$(do_facet mds1 \
6086 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
6087 awk '/^namespace_repaired/ { print \\\$2 }'")
6088 [ $repaired -eq 0 ] ||
6089 error "(2) Expect nothing to be repaired, but got: $repaired"
# Run the layout LFSCK and wait until it is 'completed'.
6091 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
6093 wait_all_targets_blocked layout completed 2
6095 # check that "global" layout_repaired == 0 !!!
6096 local repaired=$(do_facet mds1 \
6097 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
6098 awk '/^layout_repaired/ { print \\\$2 }'")
6099 [ $repaired -eq 0 ] ||
6100 error "(2) Expect no layout repair, but got: $repaired"
# Post-LFSCK: repeat the same foreign EA and access checks as before the scans.
6102 echo "post-lfsck checks of foreign dir"
6104 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
6105 grep "lfm_magic:.*0x0CD50CD0" ||
6106 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
6107 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
6108 # - sizeof(lfm_type) - sizeof(lfm_flags)
6109 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
6110 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
6111 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
6112 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
6113 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
6114 grep "lfm_flags:.*0x0000DA05" ||
6115 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
6116 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
6117 grep "lfm_value.*${uuid1}@${uuid2}" ||
6118 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
6120 # file create in dir should fail
6121 touch $DIR/$tdir/${tdir}2/$tfile &&
6122 error "$DIR/$tdir/${tdir}2: file create should fail"
# chmod and chown must still succeed after the scans.
6125 chmod 777 $DIR/$tdir/${tdir}2 ||
6126 error "$DIR/$tdir/${tdir}2: chmod failed"
6129 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
6130 error "$DIR/$tdir/${tdir}2: chown failed"
# Finally the foreign directory must be removable with rmdir.
6133 rmdir $DIR/$tdir/${tdir}2 ||
6134 error "$DIR/$tdir/${tdir}2: remove of foreign dir has failed"
6136 run_test 39 "LFSCK does not break foreign dir and reverse is also true"
# Test 40a (registered by the run_test line closing this block): a file that
# inherits a composite layout from its directory must keep the same layout
# after the directory is migrated to another MDT and a layout LFSCK has run.
# Needs at least 2 MDTs.
6139 [[ $MDSCOUNT -ge 2 ]] || skip "needs >= 2 MDTs"
6141 check_mount_and_prep
# Every setup step is checked: if one of them failed silently the final
# layout comparison could pass without the scenario having been exercised.
6142 $LFS mkdir -i 1 $DIR/$tdir/dir1 ||
error "(0) Fail to mkdir $DIR/$tdir/dir1 on MDT1"
# Give dir1 a three-component default layout for its files to inherit.
6143 $LFS setstripe -E 1M -c1 -S 1M -E 128M -c2 -S 4M -E eof $DIR/$tdir/dir1 ||
error "(0) Fail to set layout on $DIR/$tdir/dir1"
6145 touch $DIR/$tdir/dir1/f1 || error "(0) Fail to create $DIR/$tdir/dir1/f1"
# Layout of f1 before migration; compared with layout2 at the end.
6146 local layout1=$(get_layout_param $DIR/$tdir/dir1/f1)
6148 echo "Migrate $DIR/$tdir/dir1 from MDT1 to MDT0"
6149 $LFS migrate -m 0 $DIR/$tdir/dir1 ||
error "(1) Fail to migrate $DIR/$tdir/dir1 to MDT0"
6151 echo "trigger LFSCK for layout"
6152 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -t layout -r ||
error "(1) Fail to start LFSCK for layout"
# Wait for the layout LFSCK on $SINGLEMDS to report 'completed'; 32 is the
# limit handed to wait_update_facet (presumably seconds).
6154 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6155 mdd.${MDT_DEV}.lfsck_layout |
6156 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6158 error "(2) unexpected status"
# The layout must be unchanged by migration + LFSCK.
6161 local layout2=$(get_layout_param $DIR/$tdir/dir1/f1)
6163 [[ "$layout1" == "$layout2" ]] || error "layout lost after lfsck"
6165 run_test 40a "LFSCK correctly fixes lmm_oi in composite layout"
# Test 41 (registered by the run_test line closing this block): runs LFSCK
# over a file created with -z (per the test title, a SEL layout) and fails if
# the MDS debug log mentions lfsck_layout_verify_header.
# Remember the MDS debug mask so it can be restored at the end.
6169 local old_debug=$(do_facet $SINGLEMDS $LCTL get_param -n debug)
# Add lfsck to the MDS debug mask and create the two-component test file.
6171 do_facet $SINGLEMDS $LCTL set_param debug=+lfsck
6172 $LFS setstripe -E 1G -z 64M -E -1 -z 128M $DIR/$tfile
# Dump the MDS debug log and discard it (presumably so the later dump only
# holds messages produced from here on - confirm).
6173 do_facet $SINGLEMDS $LCTL dk > /dev/null
6175 echo "trigger LFSCK for SEL layout"
6176 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -A -t all -r -n on
# Wait for lfsck_layout on $SINGLEMDS to report 'completed'; 32 is the limit
# handed to wait_update_facet (presumably seconds).
6177 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6178 mdd.${MDT_DEV}.lfsck_layout |
6179 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6181 error "(2) unexpected status"
# Any debug-log line naming lfsck_layout_verify_header counts as a failure.
6184 local errors=$(do_facet $SINGLEMDS $LCTL dk |
6185 grep "lfsck_layout_verify_header")
6187 [[ "x$errors" == "x" ]] || {
6189 error "lfsck failed"
# Restore the saved debug mask.
# NOTE(review): only reached when none of the error paths above ended the test.
6192 do_facet $SINGLEMDS "$LCTL set_param debug='$old_debug'"
6194 run_test 41 "SEL support in LFSCK"
# Fragments of the encryption-key helper functions; their headers and closing
# lines are not visible here, so the notes below describe statements only.
#
# Build the key blob: 4 zero bytes ('mode'), the 64 bytes 0x00..0x3f ('raw'),
# and a 4-byte size field. All parts are kept as \xNN escape text and are only
# turned into bytes by 'echo -e' below.
6197 local mode='\x00\x00\x00\x00'
6198 local raw="$(printf ""\\\\x%02x"" {0..63})"
# 0x40 (= 64) is written in the byte order that lscpu reports for this host.
6202 [[ $(lscpu) =~ Byte\ Order.*Little ]] && size='\x40\x00\x00\x00' ||
6203 size='\x00\x00\x00\x40'
6204 key="${mode}${raw}${size}"
# Add the blob as a 'logon' key named fscrypt:4242424242424242 to the session
# keyring (@s).
6205 echo -n -e "${key}" | keyctl padd logon fscrypt:4242424242424242 @s
# Flush dirty data and drop the page/dentry/inode caches.
6210 sync ; echo 3 > /proc/sys/vm/drop_caches
# Clear the LDLM LRU of every namespace, flush and drop caches again, then
# revoke the fscrypt: key listed by 'keyctl show', if there is one.
6217 $LCTL set_param -n ldlm.namespaces.*.lru_size=clear
6218 sync ; echo 3 > /proc/sys/vm/drop_caches
6219 dummy_key=$(keyctl show | awk '$7 ~ "^fscrypt:" {print $1}')
6220 if [ -n "$dummy_key" ]; then
6221 keyctl revoke $dummy_key
# Remount the client mount point(s) with the plain $MOUNT_OPTS, i.e. without
# the test_dummy_encryption option. Takes no arguments; reads MOUNT, MOUNT2,
# MOUNT_2 and MOUNT_OPTS. Calls error() if an umount or mount fails.
6226 remount_client_normally() {
6227 # remount client without dummy encryption key
6228 if is_mounted $MOUNT; then
6229 umount_client $MOUNT || error "umount $MOUNT failed"
6231 mount_client $MOUNT ${MOUNT_OPTS} ||
6232 error "remount failed"
# The second mount point is unmounted if it is mounted, and mounted again
# only when MOUNT_2 is non-empty.
6234 if is_mounted $MOUNT2; then
6235 umount_client $MOUNT2 || error "umount $MOUNT2 failed"
6237 if [ "$MOUNT_2" ]; then
6238 mount_client $MOUNT2 ${MOUNT_OPTS} ||
6239 error "remount failed"
# Remount $MOUNT with test_dummy_encryption appended to $MOUNT_OPTS.
# Takes no arguments; calls error() if the umount or the mount fails.
6245 remount_client_dummykey() {
6248 # remount client with dummy encryption key
6249 if is_mounted $MOUNT; then
6250 umount_client $MOUNT || error "umount $MOUNT failed"
6252 mount_client $MOUNT ${MOUNT_OPTS},test_dummy_encryption ||
6253 error "remount failed"
# Prepare the client for the encryption tests: remove leftover d*/f* entries
# directly under $DIR, remount $MOUNT with test_dummy_encryption, then create
# $DIR/$tdir on MDT index 0 with a single-stripe default layout on OST index 0.
6256 setup_for_enc_tests() {
6257 rm -rf $DIR/[df][0-9]* || error "Fail to cleanup env"
6259 # remount client with test_dummy_encryption option
6260 if is_mounted $MOUNT; then
6261 umount_client $MOUNT || error "umount $MOUNT failed"
6263 mount_client $MOUNT ${MOUNT_OPTS},test_dummy_encryption ||
6264 error "mount with '-o test_dummy_encryption' failed"
6266 # this directory will be encrypted, because of dummy mode
# (the results of the two commands below are not checked)
6267 $LFS setdirstripe -c 1 -i 0 $DIR/$tdir
6268 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# Undo setup_for_enc_tests: remove $DIR/$tdir plus any extra paths passed as
# arguments ($* is unquoted, so arguments are word-split), then remount the
# client without the dummy encryption option.
6271 cleanup_for_enc_tests() {
6272 rm -rf $DIR/$tdir $*
6274 remount_client_normally
# Test 42 (registered by the run_test line closing this block): files created
# while fail_loc 0x1632 is set are expected to end up with OST-objects lacking
# the encryption flag; the layout LFSCK must then report exactly 2 repairs.
# Skipped on ZFS OSTs.
6278 [[ $(facet_fstype ost1) == zfs ]] && skip "skip ZFS backend"
# NOTE(review): the check is a strict '>' while the message says "at least".
6280 (( $MDS1_VERSION > $(version_code 2.15.51) )) ||
6281 skip "Need MDS version at least 2.15.51"
6284 echo "If the MDT-object has the encryption flag but the OST-object"
6285 echo "does not, add it to the OST-object."
6288 check_mount_and_prep
# Both client-side encryption and the dummy-encryption mount option are needed.
6290 $LCTL get_param mdc.*.import | grep -q client_encryption ||
6291 skip "client encryption not supported"
6293 mount.lustre --help |& grep -q "test_dummy_encryption:" ||
6294 skip "need dummy encryption support"
# Register the encryption cleanup to run when the test exits.
6296 stack_trap cleanup_for_enc_tests EXIT
# Files _1 and _2 are created without fault injection.
6299 $LFS setstripe -c 1 -i 0 $DIR/$tdir
6300 touch $DIR/$tdir/${tfile}_1 || error "touch ${tfile}_1 failed"
6301 dd if=/dev/zero of=$DIR/$tdir/${tfile}_2 bs=1 count=1 conv=fsync ||
6302 error "dd ${tfile}_2 failed"
# Files _3 and _4 are created while fail_loc 0x1632 is set on all nodes.
6304 #define OBD_FAIL_LFSCK_NO_ENCFLAG 0x1632
6305 do_nodes $(comma_list $(all_nodes)) "$LCTL set_param fail_loc=0x1632"
6306 touch $DIR/$tdir/${tfile}_3 || error "touch ${tfile}_3 failed"
6307 dd if=/dev/zero of=$DIR/$tdir/${tfile}_4 bs=1 count=1 conv=fsync ||
6308 error "dd ${tfile}_4 failed"
6309 do_nodes $(comma_list $(all_nodes)) "$LCTL set_param fail_loc=0x0"
6310 cancel_lru_locks osc
6312 echo "Trigger layout LFSCK to find out inconsistent OST-object enc flag"
6314 $START_LAYOUT -r || error "Fail to start LFSCK for layout!"
# Wait for the layout LFSCK on $SINGLEMDS to report 'completed'; 32 is the
# limit handed to wait_update_facet (presumably seconds).
6316 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6317 mdd.${MDT_DEV}.lfsck_layout |
6318 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6320 error "unexpected lfsck status"
# Exactly two 'repaired_others' are expected (presumably files _3 and _4).
6323 local repaired=$($SHOW_LAYOUT |
6324 awk '/^repaired_others/ { print $2 }')
6325 [ $repaired -eq 2 ] ||
6326 error "Fail to repair inconsistent enc flag: $repaired"
6328 run_test 42 "LFSCK can repair inconsistent MDT-object/OST-object encryption flags"
# Test 43 (registered by the run_test line closing this block): with
# fail_loc 0x1e2 set on mds2, a namespace LFSCK on MDT0001 must still reach
# 'completed' instead of looping forever. ldiskfs MDTs only; needs >= 2 MDTs.
6332 [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs uses iterate_dir"
6333 [[ $MDSCOUNT -lt 2 ]] && skip "needs >= 2 MDTs"
# Create ten 2-stripe directories starting on MDT index 1, then remount mds2
# with abort_recov.
6335 $LFS mkdir -i 1 -c 2 $DIR/$tdir-{1..10} || error "(1) Fail to mkdir"
6337 remount_facet mds2 "-o abort_recov"
6339 #define OBD_FAIL_OFD_IGET_FAIL_TO_START 0x1e2
6340 do_facet mds2 $LCTL set_param fail_loc=0x1e2
6341 do_facet mds2 $LCTL lfsck_start -M ${FSNAME}-MDT0001 -t namespace
# Wait for lfsck_namespace on mds2 to report 'completed'; 32 is the limit
# handed to wait_update_facet (presumably seconds).
6343 wait_update_facet mds2 \
6344 "$LCTL get_param -n mdd.$(facet_svc mds2).lfsck_namespace |
6345 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6346 error "(5) mds2 is not the expected 'completed'"
# Wait until the mds1 -> mds2 OSP import is FULL again before finishing.
6348 wait_osp_import mds1 mds2 FULL
6350 run_test 43 "LFSCK does not loop endlessly on iget failure in scanning-phase1"
# Test 44 (registered by the run_test line closing this block): per its title
# it covers unmounting the MDT while LFSCK is stopping; the stop/umount steps
# themselves are not among the lines visible here.
# Start a namespace LFSCK with fail_loc 0x1600 / fail_val=3 set on $SINGLEMDS.
6355 #define OBD_FAIL_LFSCK_DELAY1 0x1600
6356 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
6357 $START_NAMESPACE -r || error "(31) Fail to start LFSCK for namespace!"
# Clear the fault injection and start the MDS facet again.
6361 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
6362 start_facet $SINGLEMDS
# After the restart lfsck_namespace must report 'stopped'. $status is
# unquoted, so an empty value makes the '[' test itself fail, which also ends
# in the error branch.
6364 local status=$(do_facet mds1 $LCTL get_param \
6365 -n mdd.${MDT_DEV}.lfsck_namespace |
6366 awk '/^status/ { print $2 }')
6367 [ $status == "stopped" ] || {
6369 error "(32) unexpected status"
6372 run_test 44 "umount while lfsck is stopping"
# Test 45 (registered by the run_test line closing this block): the ownership
# of an OST object is reset directly on the OST backing filesystem, and the
# layout LFSCK is then expected to restore UID/GID/PROJID so that quota usage
# is charged to the original user, group and project again.
6375 (( $OST1_VERSION >= $(version_code 2.16.53) )) ||
6376 skip "Need OST version at least 2.16.53"
# The test file lives in the per-test directory $tdir (the variable name was
# previously misspelled, which silently placed the file in $DIR itself).
6379 local testfile="$DIR/$tdir/$tfile-0"
# No directory creation is visible in this test, so make sure it exists.
mkdir -p $DIR/$tdir || error "mkdir $DIR/$tdir failed"
# Single-stripe file on OST index 0, owned by the RUNAS user/group and, when
# project quota is supported, tagged with project $prjid.
6383 $LFS setstripe $testfile -i 0 -c 1 || error "setstripe $testfile failed"
6384 chown $RUNAS_ID:$RUNAS_GID $testfile || error "chown $testfile failed"
6385 is_project_quota_supported && change_project -p $prjid $testfile
6386 $RUNAS $DD of=$testfile count=$cnt || error "write $testfile failed"
# Push the data out before reading quota usage.
6388 cancel_lru_locks osc
6389 sync; sync_all_data || true
# Baseline: user, group and project usage must each exceed 90% of cnt * 1024
# (presumably cnt is in MB and getquota reports KB - confirm).
6392 echo "check the quota usage after initial write"
6393 usage=$(getquota -u $RUNAS_ID global curspace)
6394 ((usage > cnt * 1024 * 9 / 10)) ||
6395 error "quota USR usage $usage for $RUNAS_ID is wrong"
6397 usage=$(getquota -g $RUNAS_GID global curspace)
6398 ((usage > cnt * 1024 * 9 / 10)) ||
6399 error "quota GRP usage $usage for $RUNAS_GID is wrong"
6401 is_project_quota_supported && {
6402 usage=$(getquota -p $prjid global curspace)
6403 ((usage > cnt * 1024 * 9 / 10)) ||
6404 error "quota PRJ usage $usage for $prjid is wrong"
# Build the object FID from the getstripe line containing "0x": field 3 and
# field 2 of that line (presumably the group and the hex object id).
6407 local fids=($($LFS getstripe $testfile | grep 0x))
6408 local fid="${fids[3]}:${fids[2]}:0"
6409 local objpath=$(ost_fid2_objpath ost1 $fid)
# Stop ost1, mount its backing filesystem directly and reset the object's
# owner to root:root and (if supported) its project id to 0.
6411 stop ost1 || error "failed to stop ost1"
6413 echo "clear the UID/GID/PROJID of the test file"
6414 mount_fstype ost1 || return 1
6415 do_facet ost1 chown root:root $(facet_mntpt ost1)/$objpath
6416 do_facet ost1 ls -l $(facet_mntpt ost1)/$objpath
6417 is_project_quota_supported && {
6418 do_facet ost1 chattr -p 0 $(facet_mntpt ost1)/$objpath
6419 do_facet ost1 lsattr -p $(facet_mntpt ost1)/$objpath
6423 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
6424 error "failed to start ost1"
# With the ownership cleared, the original IDs must be charged less than 10%.
6426 echo "check the quota usage after UID/GID/PROJID is cleared"
6427 usage=$(getquota -u $RUNAS_ID global curspace)
6428 ((usage < cnt * 1024 * 1 / 10)) ||
6429 error "quota USR usage $usage for $RUNAS_ID is wrong"
6431 usage=$(getquota -g $RUNAS_GID global curspace)
6432 ((usage < cnt * 1024 * 1 / 10)) ||
6433 error "quota GRP usage $usage for $RUNAS_GID is wrong"
6435 is_project_quota_supported && {
6436 usage=$(getquota -p $prjid global curspace)
6437 ((usage < cnt * 1024 * 1 / 10)) ||
6438 error "quota PRJ usage $usage for $prjid is wrong"
# ... and root / project 0 must now be charged more than 90% instead.
6441 echo "the quota usage should be transferred to root"
6442 usage=$(getquota -u root global curspace)
6443 ((usage > cnt * 1024 * 9 / 10)) ||
6444 error "quota USR usage $usage for root is wrong"
6446 usage=$(getquota -g root global curspace)
6447 ((usage > cnt * 1024 * 9 / 10)) ||
6448 error "quota GRP usage $usage for root is wrong"
6450 is_project_quota_supported && {
6451 usage=$(getquota -p 0 global curspace)
6452 ((usage > cnt * 1024 * 9 / 10)) ||
6453 error "quota PRJ usage $usage for 0 is wrong"
# Run the layout LFSCK and wait for ost1 to report 'completed'; 32 is the
# limit handed to wait_update_facet (presumably seconds).
6456 echo "fix the UID/GID/PROJID by LFSCK"
6457 $START_LAYOUT -r -A || error "failed to start LFSCK"
6459 wait_update_facet ost1 \
6460 "$LCTL get_param -n obdfilter.$ost1_svc.lfsck_layout |
6461 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6463 error "unexpected status of LFSCK on OST1"
# After the repair the original IDs must be charged more than 90% again.
6466 echo "the quota usage should be fixed"
6467 usage=$(getquota -u $RUNAS_ID global curspace)
6468 ((usage > cnt * 1024 * 9 / 10)) ||
6469 error "quota USR usage $usage for $RUNAS_ID is wrong"
6471 usage=$(getquota -g $RUNAS_GID global curspace)
6472 ((usage > cnt * 1024 * 9 / 10)) ||
6473 error "quota GRP usage $usage for $RUNAS_GID is wrong"
6475 is_project_quota_supported && {
6476 usage=$(getquota -p $prjid global curspace)
6477 ((usage > cnt * 1024 * 9 / 10)) ||
6478 error "quota PRJ usage $usage for $prjid is wrong"
6481 run_test 45 "LFSCK should fix UID/GID/PROJID of OST object"
# End of the suite: put back the MDSSIZE/OSTSIZE/OSTCOUNT values that were
# saved into the SAVED_* variables at the top of the script.
6483 # restore MDS/OST size
6484 MDSSIZE=${SAVED_MDSSIZE}
6485 OSTSIZE=${SAVED_OSTSIZE}
6486 OSTCOUNT=${SAVED_OSTCOUNT}
6488 # cleanup the system at last
# REFORMAT="yes" applies to this one command only.
6489 REFORMAT="yes" cleanup_and_setup_lustre
# Report completion with the elapsed seconds, then run the final cleanup.
6491 complete_test $SECONDS
6492 check_and_cleanup_lustre