3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 LUSTRE=${LUSTRE:-$(dirname $0)/..}
12 . $LUSTRE/tests/test-framework.sh
16 # bug number for skipped test:
17 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT "
19 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
21 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
24 require_dsh_mds || exit 0
28 if ! check_versions; then
29 skip "It is NOT necessary to test lfsck under interoperation mode"
33 (( $MDS1_VERSION >= $(version_code 2.3.60) )) ||
34 skip "Need MDS version at least 2.3.60"
38 SAVED_MDSSIZE=${MDSSIZE}
39 SAVED_OSTSIZE=${OSTSIZE}
40 SAVED_OSTCOUNT=${OSTCOUNT}
41 # Use small MDS and OST sizes to speed up formatting.
42 # Do not make MDSSIZE/OSTSIZE too small: they affect the default journal size.
44 [ "$mds1_FSTYPE" == zfs ] && MDSSIZE=300000
46 [ "$ost1_FSTYPE" == zfs ] && OSTSIZE=300000
48 # Not many OSTs are needed; limit them to reduce format/start/stop overhead.
50 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
52 # build up a clean test environment.
53 REFORMAT="yes" check_and_setup_lustre
# Target names used by the LFSCK commands below: MDT_DEV is the devicelabel
# output for the $SINGLEMDS device, OST_DEV is the fixed name of OST0000.
55 MDT_DEV=$(devicelabel $SINGLEMDS $(facet_device $SINGLEMDS))
56 OST_DEV="${FSNAME}-OST0000"
# Command strings that start LFSCK (namespace or layout) via lctl lfsck_start.
# They are expanded unquoted by the tests, so extra options such as -r can be
# appended at the call site (e.g. "$START_NAMESPACE -r").
57 START_NAMESPACE="do_facet $SINGLEMDS \
58 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
59 START_LAYOUT="do_facet $SINGLEMDS \
60 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
61 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# Command string that stops LFSCK on MDT_DEV.
62 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Command strings that print the lfsck_namespace / lfsck_layout parameters
# (lctl get_param -n) from the MDT (mdd.*) or from ost1 (obdfilter.*).
63 SHOW_NAMESPACE="do_facet $SINGLEMDS \
64 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
65 SHOW_LAYOUT="do_facet $SINGLEMDS \
66 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
67 SHOW_LAYOUT_ON_OST="do_facet ost1 \
68 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# MDS mount option sets passed to start_facet: all add user_xattr to
# $MDS_MOUNT_OPTS; the NOSCRUB and SKIP_LFSCK variants additionally pass the
# noscrub and skip_lfsck mount options respectively.
69 MOUNT_OPTS_SCRUB="$MDS_MOUNT_OPTS -o user_xattr"
70 MOUNT_OPTS_NOSCRUB="$MDS_MOUNT_OPTS -o user_xattr,noscrub"
71 MOUNT_OPTS_SKIP_LFSCK="$MDS_MOUNT_OPTS -o user_xattr,skip_lfsck"
80 echo "preparing... $nfiles * $ndirs files will be created $(date)."
81 if [ ! -z $igif ]; then
82 #define OBD_FAIL_FID_IGIF 0x1504
83 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
86 cp $LUSTRE/tests/*.sh $DIR/$tdir/
87 if [ $ndirs -gt 0 ]; then
88 createmany -d $DIR/$tdir/d $ndirs
89 createmany -m $DIR/$tdir/f $ndirs
90 if [ $nfiles -gt 0 ]; then
91 for ((i = 0; i < $ndirs; i++)); do
92 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
93 /dev/null || error "createmany $nfiles"
96 createmany -d $DIR/$tdir/e $ndirs
99 if [ ! -z $igif ]; then
100 touch $DIR/$tdir/dummy
101 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
104 echo "prepared $(date)."
111 local dev=$(facet_device $facet)
113 start $facet $dev $opts > /dev/null ||
114 error "($err) Fail to start $facet!"
117 run_e2fsck_on_mds_facet() {
118 [ $mds1_FSTYPE == ldiskfs ] || return 0
122 stop $mds > /dev/null || error "(0) Fail to the stop $mds"
123 local host=$(facet_active_host $mds)
124 local dev=$(facet_device $mds)
126 run_e2fsck $host $dev "-n" |
128 run_e2fsck $host $dev "-n"
129 error "(2) Detected inconsistency on $mds"
131 start_facet $mds "$MOUNT_OPTS_NOSCRUB" 3
134 wait_all_targets_blocked() {
138 # Wait here to simulate a blocking wait, so that the status can be observed
139 local timeout=${4:-600}
140 local lfsck_query="$LCTL lfsck_query -t $com -M $FSNAME-MDT0000"
142 wait_update_facet --quiet mds1 \
143 "$lfsck_query | awk '/^${com}_mdts_$status/ { print \\\$2 }'" \
144 "$MDSCOUNT" $timeout || {
145 local mdts=$(comma_list $(mdts_nodes))
146 local count=$(do_facet mds1 "$lfsck_query" |
147 awk '/^${com}_mdts_$status/ { print $2 }')
149 do_facet mds1 "$lfsck_query"
150 echo "==== MDT LOGS ===="
151 do_nodes $mdts "$LCTL get_param mdd.*.lfsck_$com"
152 do_nodes $mdts "$LCTL get_param osd*.*.oi_scrub"
153 if [[ "$com" == "layout" ]]; then
154 local osts=$(comma_list $(osts_nodes))
155 echo "==== OST LOGS ===="
157 do_nodes $osts "$LCTL get_param obdfilter.*.lfsck_$com"
158 do_nodes $osts "$LCTL get_param osd*.*.oi_scrub"
162 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
171 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
172 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
173 "$MDSCOUNT" $LTIME || {
174 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
175 error "($err) some MDTs are not in ${status}"
182 #define OBD_FAIL_LFSCK_DELAY1 0x1600
183 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
184 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
186 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
188 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
189 [ "$STATUS" == "scanning-phase1" ] ||
190 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
192 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
194 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
195 [ "$STATUS" == "stopped" ] ||
196 error "(6) Expect 'stopped', but got '$STATUS'"
198 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
200 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
201 [ "$STATUS" == "scanning-phase1" ] ||
202 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
204 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
205 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
206 mdd.${MDT_DEV}.lfsck_namespace |
207 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
209 error "(9) unexpected status"
212 local repaired=$($SHOW_NAMESPACE |
213 awk '/^updated_phase1/ { print $2 }')
214 [ $repaired -eq 0 ] ||
215 error "(10) Expect nothing to be repaired, but got: $repaired"
217 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
218 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
219 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
220 mdd.${MDT_DEV}.lfsck_namespace |
221 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
223 error "(12) unexpected status"
226 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
227 [ $((scanned1 + 1)) -eq $scanned2 ] ||
228 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
230 echo "stopall, should NOT crash LU-3649"
231 stopall || error "(14) Fail to stopall"
233 run_test 0 "Control LFSCK manually"
238 #define OBD_FAIL_FID_INDIR 0x1501
239 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
240 touch $DIR/$tdir/dummy
242 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
244 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
245 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
246 mdd.${MDT_DEV}.lfsck_namespace |
247 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
249 error "(4) unexpected status"
252 local repaired=$($SHOW_NAMESPACE |
253 awk '/^dirent_repaired/ { print $2 }')
254 # for interop with old server
255 [ -z "$repaired" ] &&
256 repaired=$($SHOW_NAMESPACE |
257 awk '/^updated_phase1/ { print $2 }')
259 [ $repaired -eq 1 ] ||
260 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
262 run_e2fsck_on_mds_facet $SINGLEMDS
264 mount_client $MOUNT || error "(6) Fail to start client!"
266 #define OBD_FAIL_FID_LOOKUP 0x1505
267 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
268 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
270 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
272 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
276 [ "$mds1_FSTYPE" != ldiskfs ] &&
277 skip "OI Scrub not implemented for ZFS"
281 #define OBD_FAIL_FID_INLMA 0x1502
282 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
283 touch $DIR/$tdir/dummy
285 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
287 #define OBD_FAIL_FID_NOLMA 0x1506
288 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
289 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
290 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
291 mdd.${MDT_DEV}.lfsck_namespace |
292 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
294 error "(4) unexpected status"
297 local repaired=$($SHOW_NAMESPACE |
298 awk '/^dirent_repaired/ { print $2 }')
299 # for interop with old server
300 [ -z "$repaired" ] &&
301 repaired=$($SHOW_NAMESPACE |
302 awk '/^updated_phase1/ { print $2 }')
304 [ $repaired -eq 1 ] ||
305 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
308 run_e2fsck_on_mds_facet $SINGLEMDS
310 mount_client $MOUNT || error "(6) Fail to start client!"
312 #define OBD_FAIL_FID_LOOKUP 0x1505
313 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
314 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
316 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
318 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
323 #define OBD_FAIL_FID_IGIF 0x1504
324 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
325 touch $DIR/$tdir/dummy
327 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
329 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
330 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
331 mdd.${MDT_DEV}.lfsck_namespace |
332 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
334 error "(4) unexpected status"
337 local repaired=$($SHOW_NAMESPACE |
338 awk '/^dirent_repaired/ { print $2 }')
339 # for interop with old server
340 [ -z "$repaired" ] &&
341 repaired=$($SHOW_NAMESPACE |
342 awk '/^updated_phase1/ { print $2 }')
344 [ $repaired -eq 1 ] ||
345 error "(5) Fail to repair lost FID-in-dirent: $repaired"
347 run_e2fsck_on_mds_facet $SINGLEMDS
349 mount_client $MOUNT || error "(6) Fail to start client!"
351 #define OBD_FAIL_FID_LOOKUP 0x1505
352 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
353 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
355 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
357 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
362 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
363 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
364 touch $DIR/$tdir/dummy
366 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
368 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
369 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
370 mdd.${MDT_DEV}.lfsck_namespace |
371 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
373 error "(4) unexpected status"
376 local repaired=$($SHOW_NAMESPACE |
377 awk '/^linkea_repaired/ { print $2 }')
378 # for interop with old server
379 [ -z "$repaired" ] &&
380 repaired=$($SHOW_NAMESPACE |
381 awk '/^updated_phase2/ { print $2 }')
383 [ $repaired -eq 1 ] ||
384 error "(5) Fail to repair crashed linkEA: $repaired"
386 run_e2fsck_on_mds_facet $SINGLEMDS
388 mount_client $MOUNT || error "(6) Fail to start client!"
390 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
391 error "(7) Fail to stat $DIR/$tdir/dummy"
393 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
394 local dummyname=$($LFS fid2path $DIR $dummyfid)
395 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
396 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
398 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
404 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
405 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
406 touch $DIR/$tdir/dummy
408 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
410 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
411 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
412 mdd.${MDT_DEV}.lfsck_namespace |
413 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
415 error "(4) unexpected status"
418 local repaired=$($SHOW_NAMESPACE |
419 awk '/^updated_phase2/ { print $2 }')
420 [ $repaired -eq 1 ] ||
421 error "(5) Fail to repair crashed linkEA: $repaired"
423 run_e2fsck_on_mds_facet $SINGLEMDS
425 mount_client $MOUNT || error "(6) Fail to start client!"
427 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
428 error "(7) Fail to stat $DIR/$tdir/dummy"
430 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
431 local dummyname=$($LFS fid2path $DIR $dummyfid)
432 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
433 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
435 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
439 (( $MDS1_VERSION > $(version_code 2.4.90) )) ||
440 skip "MDS older than 2.4.90"
444 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
445 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
446 touch $DIR/$tdir/dummy
448 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
450 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
451 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
452 mdd.${MDT_DEV}.lfsck_namespace |
453 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
455 error "(4) unexpected status"
458 local repaired=$($SHOW_NAMESPACE |
459 awk '/^updated_phase2/ { print $2 }')
460 [ $repaired -eq 1 ] ||
461 error "(5) Fail to repair crashed linkEA: $repaired"
463 run_e2fsck_on_mds_facet $SINGLEMDS
465 mount_client $MOUNT || error "(6) Fail to start client!"
467 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
468 error "(7) Fail to stat $DIR/$tdir/dummy"
470 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
471 local dummyname=$($LFS fid2path $DIR $dummyfid)
472 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
473 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
475 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
479 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
480 skip "MDS older than 2.6.50, LU-4788"
484 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
485 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
486 touch $DIR/$tdir/dummy
488 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
490 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
491 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
492 mdd.${MDT_DEV}.lfsck_namespace |
493 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
495 error "(4) unexpected status"
498 local repaired=$($SHOW_NAMESPACE |
499 awk '/^linkea_repaired/ { print $2 }')
500 [ $repaired -eq 1 ] ||
501 error "(5) Fail to repair crashed linkEA: $repaired"
503 run_e2fsck_on_mds_facet $SINGLEMDS
505 mount_client $MOUNT || error "(6) Fail to start client!"
507 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
508 error "(7) Fail to stat $DIR/$tdir/dummy"
510 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
511 local dummyname=$($LFS fid2path $DIR $dummyfid)
512 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
513 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
515 run_test 2d "LFSCK can recover the missing linkEA entry"
519 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
520 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
521 skip "MDS older than 2.6.50, LU-5511"
525 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
527 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
528 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
529 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
530 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
532 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
534 wait_all_targets_blocked namespace completed 4
536 local repaired=$($SHOW_NAMESPACE |
537 awk '/^linkea_repaired/ { print $2 }')
538 [ $repaired -eq 1 ] ||
539 error "(5) Fail to repair crashed linkEA: $repaired"
541 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
542 local name=$($LFS fid2path $DIR $fid)
543 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
544 error "(6) Fail to repair linkEA: $fid $name"
546 run_test 2e "namespace LFSCK can verify remote object linkEA"
550 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
551 skip "MDS older than 2.6.50, LU-4788"
555 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
556 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
557 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
559 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
560 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
561 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
563 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
564 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
565 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
567 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
568 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
569 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
571 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
573 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
574 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
575 mdd.${MDT_DEV}.lfsck_namespace |
576 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
578 error "(10) unexpected status"
581 local checked=$($SHOW_NAMESPACE |
582 awk '/^checked_phase2/ { print $2 }')
583 [ $checked -ge 4 ] ||
584 error "(11) Fail to check multiple-linked object: $checked"
586 local repaired=$($SHOW_NAMESPACE |
587 awk '/^multiple_linked_repaired/ { print $2 }')
588 [ $repaired -ge 2 ] ||
589 error "(12) Fail to repair multiple-linked object: $repaired"
591 run_test 3 "LFSCK can verify multiple-linked objects"
595 [ "$mds1_FSTYPE" != ldiskfs ] &&
596 skip "OI Scrub not implemented for ZFS"
599 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
600 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop $SINGLEMDS!"
602 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
603 echo "start $SINGLEMDS with disabling OI scrub"
604 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 2
606 #define OBD_FAIL_LFSCK_DELAY2 0x1601
607 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
608 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
609 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
610 mdd.${MDT_DEV}.lfsck_namespace |
611 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
613 error "(5) unexpected status"
616 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
617 [ "$STATUS" == "scanning-phase1" ] ||
618 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
620 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
621 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
622 mdd.${MDT_DEV}.lfsck_namespace |
623 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
625 error "(7) unexpected status"
628 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
629 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
631 local repaired=$($SHOW_NAMESPACE |
632 awk '/^dirent_repaired/ { print $2 }')
633 # for interop with old server
634 [ -z "$repaired" ] &&
635 repaired=$($SHOW_NAMESPACE |
636 awk '/^updated_phase1/ { print $2 }')
638 [ $repaired -ge 9 ] ||
639 error "(9) Fail to re-generate FID-in-dirent: $repaired"
641 run_e2fsck_on_mds_facet $SINGLEMDS
643 mount_client $MOUNT || error "(10) Fail to start client!"
645 #define OBD_FAIL_FID_LOOKUP 0x1505
646 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
647 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
648 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
650 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
654 [ "$mds1_FSTYPE" != ldiskfs ] &&
655 skip "OI Scrub not implemented for ZFS"
658 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
659 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop $SINGLEMDS!"
661 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
662 echo "start $SINGLEMDS with disabling OI scrub"
663 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 2
665 #define OBD_FAIL_LFSCK_DELAY2 0x1601
666 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
667 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
668 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
669 mdd.${MDT_DEV}.lfsck_namespace |
670 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
672 error "(5) unexpected status"
675 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
676 [ "$STATUS" == "scanning-phase1" ] ||
677 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
679 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
680 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
681 mdd.${MDT_DEV}.lfsck_namespace |
682 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
684 error "(7) unexpected status"
687 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
688 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
690 local repaired=$($SHOW_NAMESPACE |
691 awk '/^dirent_repaired/ { print $2 }')
692 # for interop with old server
693 [ -z "$repaired" ] &&
694 repaired=$($SHOW_NAMESPACE |
695 awk '/^updated_phase1/ { print $2 }')
697 [ $repaired -ge 2 ] ||
698 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
700 run_e2fsck_on_mds_facet $SINGLEMDS
702 mount_client $MOUNT || error "(10) Fail to start client!"
704 #define OBD_FAIL_FID_LOOKUP 0x1505
705 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
706 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
708 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
710 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
711 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
712 local dummyname=$($LFS fid2path $DIR $dummyfid)
713 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
714 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
716 run_test 5 "LFSCK can handle IGIF object upgrading"
721 #define OBD_FAIL_LFSCK_DELAY1 0x1600
722 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
723 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
725 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
726 [ "$STATUS" == "scanning-phase1" ] ||
727 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
729 # Sleep 3 sec to guarantee at least one object is processed by LFSCK
731 # Fail the LFSCK to guarantee there is at least one checkpoint
732 #define OBD_FAIL_LFSCK_FATAL1 0x1608
733 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
734 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
735 mdd.${MDT_DEV}.lfsck_namespace |
736 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
738 error "(4) unexpected status"
741 local POS0=$($SHOW_NAMESPACE |
742 awk '/^last_checkpoint_position/ { print $2 }' |
745 #define OBD_FAIL_LFSCK_DELAY1 0x1600
746 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
747 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
749 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
750 [ "$STATUS" == "scanning-phase1" ] ||
751 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
753 local POS1=$($SHOW_NAMESPACE |
754 awk '/^latest_start_position/ { print $2 }' |
756 [[ $POS0 -lt $POS1 ]] ||
757 error "(7) Expect larger than: $POS0, but got $POS1"
759 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
760 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
761 mdd.${MDT_DEV}.lfsck_namespace |
762 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
764 error "(8) unexpected status"
767 run_test 6a "LFSCK resumes from last checkpoint (1)"
772 #define OBD_FAIL_LFSCK_DELAY2 0x1601
773 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
774 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
776 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
777 [ "$STATUS" == "scanning-phase1" ] ||
778 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
780 # Sleep 5 sec to guarantee that we are in the directory scanning
782 # Fail the LFSCK to guarantee there is at least one checkpoint
783 #define OBD_FAIL_LFSCK_FATAL2 0x1609
784 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
785 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
786 mdd.${MDT_DEV}.lfsck_namespace |
787 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
789 error "(4) unexpected status"
792 local O_POS0=$($SHOW_NAMESPACE |
793 awk '/^last_checkpoint_position/ { print $2 }' |
796 local D_POS0=$($SHOW_NAMESPACE |
797 awk '/^last_checkpoint_position/ { print $4 }')
799 #define OBD_FAIL_LFSCK_DELAY2 0x1601
800 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
801 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
803 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
804 [ "$STATUS" == "scanning-phase1" ] ||
805 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
807 local O_POS1=$($SHOW_NAMESPACE |
808 awk '/^latest_start_position/ { print $2 }' |
810 local D_POS1=$($SHOW_NAMESPACE |
811 awk '/^latest_start_position/ { print $4 }')
813 echo "Additional debug for 6b"
815 if [ "$D_POS0" == "N/A" -o "$D_POS0" == "0x0" \
816 -o "$D_POS1" == "0x0" -o "$D_POS1" == "N/A" ]; then
817 [[ $O_POS0 -lt $O_POS1 ]] ||
818 error "(7.1) $O_POS1 is not larger than $O_POS0"
820 [[ $D_POS0 -lt $D_POS1 ]] ||
821 error "(7.2) $D_POS1 is not larger than $D_POS0"
824 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
825 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
826 mdd.${MDT_DEV}.lfsck_namespace |
827 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
829 error "(8) unexpected status"
832 run_test 6b "LFSCK resumes from last checkpoint (2)"
839 #define OBD_FAIL_LFSCK_DELAY2 0x1601
840 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
841 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
843 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
844 [ "$STATUS" == "scanning-phase1" ] ||
845 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
847 # Sleep 3 sec to guarantee at least one object is processed by LFSCK
849 echo "stop $SINGLEMDS"
850 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop $SINGLEMDS!"
852 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
853 echo "start $SINGLEMDS"
854 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 5
856 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
857 mdd.${MDT_DEV}.lfsck_namespace |
858 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
860 error "(6) unexpected status"
863 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
869 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
870 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
871 for ((i = 0; i < 20; i++)); do
872 touch $DIR/$tdir/dummy${i}
875 #define OBD_FAIL_LFSCK_DELAY3 0x1602
876 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
877 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
878 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
879 mdd.${MDT_DEV}.lfsck_namespace |
880 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
882 error "(4) unexpected status"
886 echo "stop $SINGLEMDS"
887 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop $SINGLEMDS!"
889 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
890 echo "start $SINGLEMDS"
891 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 6
893 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
894 mdd.${MDT_DEV}.lfsck_namespace |
895 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
897 error "(7) unexpected status"
900 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
911 formatall > /dev/null
917 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
918 [ "$STATUS" == "init" ] ||
919 namespace_error "(2) Expect 'init', but got '$STATUS'"
921 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
922 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
923 mkdir $DIR/$tdir/crashed
925 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
926 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
927 for ((i = 0; i < 5; i++)); do
928 touch $DIR/$tdir/dummy${i}
931 umount_client $MOUNT || error "(3) Fail to stop client!"
933 #define OBD_FAIL_LFSCK_DELAY2 0x1601
934 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
936 namespace_error "(4) Fail to start LFSCK for namespace!"
938 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
939 [ "$STATUS" == "scanning-phase1" ] ||
940 namespace_error "(5) Expect 'scanning-phase1', but got '$STATUS'"
942 $STOP_LFSCK || namespace_error "(6) Fail to stop LFSCK!"
944 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
945 [ "$STATUS" == "stopped" ] ||
946 namespace_error "(7) Expect 'stopped', but got '$STATUS'"
949 namespace_error "(8) Fail to start LFSCK for namespace!"
951 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
952 [ "$STATUS" == "scanning-phase1" ] ||
953 namespace_error "(9) Expect 'scanning-phase1', but got '$STATUS'"
955 #define OBD_FAIL_LFSCK_FATAL2 0x1609
956 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
957 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
958 mdd.${MDT_DEV}.lfsck_namespace |
959 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
961 namespace_error "(10) unexpected status"
964 #define OBD_FAIL_LFSCK_DELAY1 0x1600
965 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
967 namespace_error "(11) Fail to start LFSCK for namespace!"
969 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
970 [ "$STATUS" == "scanning-phase1" ] ||
971 namespace_error "(12) Expect 'scanning-phase1', but got '$STATUS'"
973 #define OBD_FAIL_LFSCK_CRASH 0x160a
974 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
977 echo "stop $SINGLEMDS"
978 stop $SINGLEMDS > /dev/null || namespace_error "(13) Fail to stop MDS!"
980 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
981 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
983 echo "start $SINGLEMDS"
984 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 14
986 local timeout=$(max_recovery_time)
989 while [ $timer -lt $timeout ]; do
990 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
991 mdt.${MDT_DEV}.recovery_status |
992 awk '/^status/ { print \\\$2 }'")
993 [ "$STATUS" != "RECOVERING" ] && break;
998 [ $timer != $timeout ] || (
999 do_facet $SINGLEMDS "$LCTL get_param -n \
1000 mdt.${MDT_DEV}.recovery_status"
1001 error "(14.1) recovery timeout"
1004 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1005 [ "$STATUS" == "crashed" ] ||
1006 namespace_error "(15) Expect 'crashed', but got '$STATUS'"
1008 #define OBD_FAIL_LFSCK_DELAY2 0x1601
1009 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
1011 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1012 mdd.${MDT_DEV}.lfsck_namespace |
1013 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
1015 namespace_error "(17) unexpected status"
1018 echo "stop $SINGLEMDS"
1019 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop $SINGLEMDS!"
1021 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1022 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1024 echo "start $SINGLEMDS"
1025 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 19
1028 while [ $timer -lt $timeout ]; do
1029 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1030 mdt.${MDT_DEV}.recovery_status |
1031 awk '/^status/ { print \\\$2 }'")
1032 [ "$STATUS" != "RECOVERING" ] && break;
1034 timer=$((timer + 1))
1037 [ $timer != $timeout ] || (
1038 do_facet $SINGLEMDS "$LCTL get_param -n \
1039 mdt.${MDT_DEV}.recovery_status"
1040 error "(19.1) recovery timeout"
1043 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1044 [ "$STATUS" == "paused" ] ||
1045 namespace_error "(20) Expect 'paused', but got '$STATUS'"
1047 echo "stop $SINGLEMDS"
1048 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1050 echo "start $SINGLEMDS without resume LFSCK"
1051 start_facet $SINGLEMDS "$MOUNT_OPTS_SKIP_LFSCK" 20.2
1054 while [ $timer -lt $timeout ]; do
1055 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1056 mdt.${MDT_DEV}.recovery_status |
1057 awk '/^status/ { print \\\$2 }'")
1058 [ "$STATUS" != "RECOVERING" ] && break;
1060 timer=$((timer + 1))
1063 [ $timer != $timeout ] || (
1064 do_facet $SINGLEMDS "$LCTL get_param -n \
1065 mdt.${MDT_DEV}.recovery_status"
1066 error "(20.3) recovery timeout"
1069 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1070 [ "$STATUS" == "paused" ] ||
1071 namespace_error "(20.4) Expect 'paused', but got '$STATUS'"
1073 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1074 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1077 namespace_error "(21) Fail to start LFSCK for namespace!"
1078 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1079 mdd.${MDT_DEV}.lfsck_namespace |
1080 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1082 namespace_error "(22) unexpected status"
1085 # wait to process one inode at least (OBD_FAIL_LFSCK_DELAY3)
1086 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1087 mdd.${MDT_DEV}.lfsck_namespace |
1088 awk '/^flags/ { print \\\$2 }'" "scanned-once,inconsistent" 32 || {
1090 namespace_error "(23) Expect 'scanned-once,inconsistent'"
1093 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1094 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1095 mdd.${MDT_DEV}.lfsck_namespace |
1096 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1098 namespace_error "(24) unexpected status"
1101 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1103 namespace_error "(25) Expect empty flags, but got '$FLAGS'"
1105 run_test 8 "LFSCK state machine"
# Test 9a (registered by the run_test line at the end of this block):
# check that the layout LFSCK obeys its speed limit.
# Visible steps: create 5000 files, start the layout LFSCK limited to
# BASE_SPEED1, compare average_speed_phase1 with an upper bound, change the
# limit to BASE_SPEED2 and compare the average with a lower and an upper
# bound, then clear the limit everywhere and wait for 'completed'.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are used below, but their
# assignments are not among the lines shown in this listing -- confirm them.
# Skip when /proc/cpuinfo has no "processor ... : 1" entry.
1108 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1109 skip "Testing on UP system, the speed may be inaccurate."
1113 check_mount_and_prep
1114 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1115 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1116 createmany -o $DIR/$tdir/lfsck/f 5000
# First speed limit passed to lfsck_start via -s.
1118 local BASE_SPEED1=100
1120 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# The scan must still be in phase 1 when the speed is sampled.
1123 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1124 [ "$STATUS" == "scanning-phase1" ] ||
1125 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1127 local SPEED=$($SHOW_LAYOUT |
1128 awk '/^average_speed_phase1/ { print $2 }')
1130 # The timing may be slightly off; normally the error is under 2 seconds.
1131 # An additional 20% scheduling error is tolerated.
1133 # MAX_MARGIN = 1.3 = 13 / 10
1134 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) /
1135 RUN_TIME1 * 13 / 10))
1136 [ $SPEED -lt $MAX_SPEED ] || {
1138 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1139 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1142 # adjust speed limit
1143 local BASE_SPEED2=300
1145 do_facet $SINGLEMDS \
1146 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Lower bound: time-weighted mean of both limits, scaled by MIN_MARGIN.
1149 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1150 # MIN_MARGIN = 0.7 = 7 / 10
1151 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) +
1152 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) /
1153 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# On a non-ldiskfs MDT a too-low speed is reported through error_ignore
# (LU-5624); the other branch is only partly visible in this listing.
1154 [ $SPEED -gt $MIN_SPEED ] || {
1155 if [ $mds1_FSTYPE != ldiskfs ]; then
1156 error_ignore LU-5624 \
1157 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1160 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
# Upper bound: time-weighted mean of both limits, scaled by MAX_MARGIN.
1164 # MAX_MARGIN = 1.3 = 13 / 10
1165 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) +
1166 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) /
1167 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1168 [ $SPEED -lt $MAX_SPEED ] || {
1170 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1171 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1172 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on all MDT and OST nodes, then wait for the
# layout LFSCK to report 'completed'.
1175 do_nodes $(comma_list $(mdts_nodes)) \
1176 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1177 do_nodes $(comma_list $(osts_nodes)) \
1178 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1180 wait_update_facet $SINGLEMDS \
1181 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1182 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1183 error "(7) Failed to get expected 'completed'"
1185 run_test 9a "LFSCK speed control (1)"
# Test 9b (registered by the run_test line at the end of this block):
# check the speed limit of the namespace LFSCK during scanning-phase2.
# Visible steps: create directories and files while fail_loc 0x1604 is set,
# run one namespace LFSCK with fail_loc 0x160c until it reports 'stopped',
# then start it again limited to BASE_SPEED1 and compare
# average_speed_phase2 with bounds before and after changing the limit to
# BASE_SPEED2.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are used below, but their
# assignments are not among the lines shown in this listing -- confirm them.
# Skip when /proc/cpuinfo has no "processor ... : 1" entry.
1188 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1189 skip "Testing on UP system, the speed may be inaccurate."
1195 echo "Preparing another 50 * 50 files (with error) at $(date)."
1196 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1197 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1198 createmany -d $DIR/$tdir/d 50
1199 createmany -m $DIR/$tdir/f 50
1200 for ((i = 0; i < 50; i++)); do
1201 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
1204 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1205 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1206 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1207 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1208 mdd.${MDT_DEV}.lfsck_namespace |
1209 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1211 error "(5) unexpected status"
1214 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1215 echo "Prepared at $(date)."
# First speed limit; note that this start does not pass -r.
1217 local BASE_SPEED1=50
1219 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
# The scan is expected to be in phase 2 when the speed is sampled.
1222 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1223 [ "$STATUS" == "scanning-phase2" ] ||
1224 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1226 local SPEED=$($SHOW_NAMESPACE |
1227 awk '/^average_speed_phase2/ { print $2 }')
1228 # The timing may be slightly off; normally the error is under 2 seconds.
1229 # An additional 20% scheduling error is tolerated.
1231 # MAX_MARGIN = 1.3 = 13 / 10
1232 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) /
1233 RUN_TIME1 * 13 / 10))
1234 [ $SPEED -lt $MAX_SPEED ] || {
1236 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1237 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1240 # adjust speed limit
1241 local BASE_SPEED2=150
1243 do_facet $SINGLEMDS \
1244 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Lower bound: time-weighted mean of both limits, scaled by MIN_MARGIN.
1247 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1248 # MIN_MARGIN = 0.7 = 7 / 10
1249 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) +
1250 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) /
1251 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# On a non-ldiskfs MDT a too-low speed is reported through error_ignore
# (LU-5624); the other branch is only partly visible in this listing.
1252 [ $SPEED -gt $MIN_SPEED ] || {
1253 if [ $mds1_FSTYPE != ldiskfs ]; then
1254 error_ignore LU-5624 \
1255 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1258 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
# Upper bound: time-weighted mean of both limits, scaled by MAX_MARGIN.
1262 # MAX_MARGIN = 1.3 = 13 / 10
1263 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) +
1264 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) /
1265 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1266 [ $SPEED -lt $MAX_SPEED ] || {
1268 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1269 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1270 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on all MDT and OST nodes, then wait for the
# namespace LFSCK to report 'completed'.
1273 do_nodes $(comma_list $(mdts_nodes)) \
1274 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1275 do_nodes $(comma_list $(osts_nodes)) \
1276 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1277 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1278 mdd.${MDT_DEV}.lfsck_namespace |
1279 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1281 error "(11) unexpected status"
1284 run_test 9b "LFSCK speed control (2)"
# Test 10 (registered by the run_test line at the end of this block):
# the filesystem must stay usable while the namespace LFSCK is scanning.
# Visible steps: create 1000 directory/file pairs with fail_loc 0x1603
# (even indices) or 0x1604 (odd indices) set, remount the client, start a
# namespace LFSCK limited with "-s 100", run ls/touch/mkdir/unlink/rm/mv/ln
# while the status is 'scanning-phase1', then clear the speed limit and wait
# for 'completed'.
# Runs only when the MDT backend is ldiskfs.
1288 [[ $mds1_FSTYPE == ldiskfs ]] || skip "lookup(..)/linkea on ZFS issue"
1292 echo "Preparing more files with error at $(date)."
1293 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1294 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1296 for ((i = 0; i < 1000; i = $((i+2)))); do
1297 mkdir -p $DIR/$tdir/d${i}
1298 touch $DIR/$tdir/f${i}
1299 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1302 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1303 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1305 for ((i = 1; i < 1000; i = $((i+2)))); do
1306 mkdir -p $DIR/$tdir/d${i}
1307 touch $DIR/$tdir/f${i}
1308 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1311 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1312 echo "Prepared at $(date)."
# Extra hard link to f200, created after fail_loc has been reset.
1314 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1316 umount_client $MOUNT
1317 mount_client $MOUNT || error "(3) Fail to start client!"
1319 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1322 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1323 [ "$STATUS" == "scanning-phase1" ] ||
1324 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Each of the following client operations must succeed during the scan.
1326 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1328 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1330 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1332 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1334 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1336 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1338 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1340 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1341 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after the operations above.
1343 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1344 [ "$STATUS" == "scanning-phase1" ] ||
1345 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 on all MDT and OST nodes and wait for completion.
1347 do_nodes $(comma_list $(mdts_nodes)) \
1348 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1349 do_nodes $(comma_list $(osts_nodes)) \
1350 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1351 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1352 mdd.${MDT_DEV}.lfsck_namespace |
1353 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1355 error "(16) unexpected status"
1358 run_test 10 "System is available during LFSCK scanning"
# ost_remove_lastid: delete the LAST_ID file (and the d0/0 entry) under
# O/<idx> on the backend filesystem of facet ost<ost>.
# The facet is mounted with mount_fstype, the files are removed with
# "rm -fv" through do_facet, and the facet is unmounted with unmount_fstype.
# Returns 1 if the mount fails and 2 if the unmount fails.
# NOTE(review): 'ost' and 'idx' are presumably the first and second
# arguments (the visible caller passes "1 0"); their assignments are not
# among the lines shown in this listing -- confirm.
1361 ost_remove_lastid() {
1364 local rcmd="do_facet ost${ost}"
1366 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1368 # step 1: local mount
1369 mount_fstype ost${ost} || return 1
1370 # step 2: remove the specified LAST_ID
1371 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1373 unmount_fstype ost${ost} || return 2
# Test 11a (registered by the run_test line at the end of this block):
# the layout LFSCK on an OST must rebuild a removed LAST_ID file.
# Visible steps: create 64 single-stripe files on OST index 0, remove
# LAST_ID through ost_remove_lastid, start ost1, run the layout LFSCK on
# ost1 with fail_loc 0x160e set, wait for the 'crashed_lastid' flag, clear
# fail_loc, wait for 'completed', and finally expect empty flags.
1377 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1378 skip "MDS older than 2.5.55, LU-1267"
1380 check_mount_and_prep
1381 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1382 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Arguments "1 0" are passed to ost_remove_lastid.
1387 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1389 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1390 error "(2) Fail to start ost1"
1392 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1393 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1395 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1396 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1398 wait_update_facet ost1 "$LCTL get_param -n \
1399 obdfilter.${OST_DEV}.lfsck_layout |
1400 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1402 error "(5) unexpected status"
# Reset the failure injection, then wait for the scan to complete.
1405 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1407 wait_update_facet ost1 "$LCTL get_param -n \
1408 obdfilter.${OST_DEV}.lfsck_layout |
1409 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1411 error "(6) unexpected status"
1414 echo "the LAST_ID(s) should have been rebuilt"
1415 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1416 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1418 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b (registered by the run_test line at the end of this block):
# the layout LFSCK on an OST must rebuild an out-of-date on-disk LAST_ID.
# Visible steps: with fail_loc 0x160d set on ost1, create more files than
# precreated_ost_obj_count reports, record prealloc_last_seq and
# prealloc_last_id from the MDT-side osp device, restart ost1 with fail_loc
# 0x215, check that the OST last_id for that sequence is smaller than the
# recorded id, run the layout LFSCK on ost1, restart ost1 again and wait
# until last_id >= the recorded id.
1421 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1422 skip "MDS older than 2.5.55, LU-1267"
1424 check_mount_and_prep
1425 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1427 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1428 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1429 do_facet ost1 $LCTL set_param fail_loc=0x160d
1431 local count=$(precreated_ost_obj_count 0 0)
# Create 32 files more than the count obtained above.
1433 createmany -o $DIR/$tdir/f $((count + 32))
# Record the sequence and last id that mds1 reports for OST0000.
1435 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1436 local seq=$(do_facet mds1 $LCTL get_param -n \
1437 osp.${proc_path}.prealloc_last_seq)
1438 local id_used=$(do_facet mds1 $LCTL get_param -n \
1439 osp.${proc_path}.prealloc_last_id)
1441 umount_client $MOUNT
1442 stop ost1 || error "(1) Fail to stop ost1"
1444 #define OBD_FAIL_OST_ENOSPC 0x215
1445 do_facet ost1 $LCTL set_param fail_loc=0x215
1447 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1448 error "(2) Fail to start ost1"
# Poll (at most 60 iterations) until ost1 reports a last_id for $seq.
1450 for ((i = 0; i < 60; i++)); do
1451 id_ost1=$(do_facet ost1 \
1452 "$LCTL get_param -n obdfilter.$ost1_svc.last_id" |
1453 awk -F: "/$seq/ { print \$2 }")
1454 [ -n "$id_ost1" ] && break
1458 echo "the on-disk LAST_ID should be smaller than the expected one"
1459 [ $id_used -gt $id_ost1 ] ||
1460 error "(4) expect id_used '$id_used' > id_ost1 '$id_ost1'"
1462 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1463 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1465 wait_update_facet ost1 \
1466 "$LCTL get_param -n obdfilter.$ost1_svc.lfsck_layout |
1467 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1469 error "(6) unexpected status"
# Restart ost1 before checking last_id again.
1472 stop ost1 || error "(7) Fail to stop ost1"
1474 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1475 error "(8) Fail to start ost1"
1477 echo "the on-disk LAST_ID should have been rebuilt"
1478 # last_id may be larger than $id_used if objects were created/skipped
1479 wait_update_facet_cond ost1 \
1480 "$LCTL get_param -n obdfilter.$ost1_svc.last_id |
1481 awk -F: '/$seq/ { print \\\$2 }'" "-ge" "$id_used" 60 || {
1482 do_facet ost1 $LCTL get_param obdfilter.$ost1_svc.last_id
1483 error "(9) expect last_id >= id_used $seq:$id_used"
# Reset the failure injection and stop all services.
1486 do_facet ost1 $LCTL set_param fail_loc=0
1487 stopall || error "(10) Fail to stopall"
1489 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12a (registered by the run_test line at the end of this block):
# a single lctl command with -A must start/stop LFSCK on all targets.
# Visible steps, done first for the namespace LFSCK and then for the layout
# LFSCK: start with "-A -s 1 -r" and expect 'scanning-phase1', stop with
# "-A" and expect 'stopped', restart with "-A -s 0 -r" and expect
# 'completed'. For the layout run each OST is also checked for 'stopped'.
# Needs at least two MDTs.
1492 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1493 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1494 skip "MDS older than 2.5.55, LU-3950"
1496 check_mount_and_prep
# One directory per MDT index (k - 1), each filled with 100 files.
1497 for k in $(seq $MDSCOUNT); do
1498 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1499 createmany -o $DIR/$tdir/${k}/f 100 ||
1500 error "(0) Fail to create 100 files."
1503 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1504 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1505 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1507 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1508 wait_all_targets namespace scanning-phase1 3
1510 echo "Stop namespace LFSCK on all targets by single lctl command."
1511 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1512 error "(4) Fail to stop LFSCK on all devices!"
1514 echo "All the LFSCK targets should be in 'stopped' status."
1515 wait_all_targets_blocked namespace stopped 5
1517 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1518 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1519 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1521 echo "All the LFSCK targets should be in 'completed' status."
1522 wait_all_targets_blocked namespace completed 7
# The layout half of the test runs between start_full_debug_logging and
# stop_full_debug_logging.
1524 start_full_debug_logging
1526 echo "Start layout LFSCK on all targets by single command (-s 1)."
1527 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1528 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1530 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1531 wait_all_targets layout scanning-phase1 9
1533 echo "Stop layout LFSCK on all targets by single lctl command."
1534 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1535 error "(10) Fail to stop LFSCK on all devices!"
1537 echo "All the LFSCK targets should be in 'stopped' status."
1538 wait_all_targets_blocked layout stopped 11
# Every OST must also report 'stopped' for its layout LFSCK.
1540 for k in $(seq $OSTCOUNT); do
1541 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1542 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1543 awk '/^status/ { print $2 }')
1544 [ "$STATUS" == "stopped" ] ||
1545 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1548 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1549 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1550 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1552 echo "All the LFSCK targets should be in 'completed' status."
1553 wait_all_targets_blocked layout completed 14
1555 stop_full_debug_logging
1557 run_test 12a "single command to trigger LFSCK on all devices"
# Test 12b (registered by the run_test line at the end of this block):
# lfsck_start must work without an explicit '-M' device.
# Visible steps: start LFSCK with only "-A -r" and wait until both the
# namespace and the layout LFSCK report 'completed'; then, if "lctl dl" on
# mds1 lists more than one 'mdt' entry, check that lfsck_start without
# '-M'/'-A' fails.
1560 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1561 skip "MDS older than 2.5.55, LU-3950"
1563 check_mount_and_prep
1565 echo "Start LFSCK without '-M' specified."
1566 do_facet mds1 $LCTL lfsck_start -A -r ||
1567 error "(0) Fail to start LFSCK without '-M'"
1569 wait_all_targets_blocked namespace completed 1
1570 wait_all_targets_blocked layout completed 2
# Count the lines of "lctl dl" whose third column contains "mdt".
1572 local count=$(do_facet mds1 $LCTL dl |
1573 awk '{ print $3 }' | grep mdt | wc -l)
1574 if [ $count -gt 1 ]; then
1576 echo "Start layout LFSCK on the node with multipe targets,"
1577 echo "but not specify '-M'/'-A' option. Should get failure."
# A successful start is the error case here; the trailing "|| true" keeps a
# failed start from being treated as a failure of this statement.
1579 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1580 error "(3) Start layout LFSCK should fail" || true
1583 run_test 12b "auto detect Lustre device"
# Test 13 (registered by the run_test line at the end of this block):
# the layout LFSCK must repair a bad lmm_oi in the layout EA.
# Visible steps: with fail_loc 0x160f set on the MDS, create one plain file
# and one PFL file, clear fail_loc, run the layout LFSCK to completion and
# expect repaired_others to equal 2.
1586 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1587 skip "MDS older than 2.5.55, LU-3593"
1590 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1591 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1592 echo "MDT-object FID."
1595 check_mount_and_prep
1597 echo "Inject failure stub to simulate bad lmm_oi"
1598 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1599 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1600 createmany -o $DIR/$tdir/f 1
1601 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1602 error "(0) Fail to create PFL $DIR/$tdir/f1"
1603 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1605 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1606 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1608 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1609 mdd.${MDT_DEV}.lfsck_layout |
1610 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1612 error "(2) unexpected status"
# Two files were created while the failure was injected, so exactly two
# repairs are expected.
1615 local repaired=$($SHOW_LAYOUT |
1616 awk '/^repaired_others/ { print $2 }')
1617 [ $repaired -eq 2 ] ||
1618 error "(3) Fail to repair crashed lmm_oi: $repaired"
1620 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14a (registered by the run_test line at the end of this block):
# the layout LFSCK must detect and, with '-c', repair MDT-objects whose LOV
# EA references a missing OST-object.
# Visible steps: with fail_loc 0x1610 set on ost1, create plain files, 16
# composite files and two guard files; run the layout LFSCK without '-c'
# and expect repaired_dangling >= 32 while stat on the guards still fails;
# run it again with '-c' and expect stat on both guards to report size 0.
1623 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1624 skip "MDS older than 2.5.55, LU-3590"
1627 echo "The OST-object referenced by the MDT-object should be there;"
1628 echo "otherwise, the LFSCK should re-create the missing OST-object."
1629 echo "without '--delay-create-ostobj' option."
1632 check_mount_and_prep
1633 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1635 echo "Inject failure stub to simulate dangling referenced MDT-object"
1636 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1637 do_facet ost1 $LCTL set_param fail_loc=0x1610
1638 local count=$(precreated_ost_obj_count 0 0)
# Create 16 files more than the count obtained above, then a guard file.
1640 createmany -o $DIR/$tdir/f $((count + 16)) ||
1641 error "(0.1) Fail to create $DIR/$tdir/fx"
1642 touch $DIR/$tdir/guard0
# Sixteen composite-layout files, followed by a second guard file.
1644 for ((i = 0; i < 16; i++)); do
1645 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1646 $DIR/$tdir/f_comp${i} ||
1647 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1649 touch $DIR/$tdir/guard1
1651 do_facet ost1 $LCTL set_param fail_loc=0
1653 start_full_debug_logging
1655 # exhaust other pre-created dangling cases
1656 count=$(precreated_ost_obj_count 0 0)
1657 createmany -o $DIR/$tdir/a $count ||
1658 error "(0.5) Fail to create $count files."
1660 echo "'ls' should fail because of dangling referenced MDT-object"
1661 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1663 echo "Trigger layout LFSCK to find out dangling reference"
1664 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1666 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1667 mdd.${MDT_DEV}.lfsck_layout |
1668 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1670 error "(3) unexpected status"
1673 local repaired=$($SHOW_LAYOUT |
1674 awk '/^repaired_dangling/ { print $2 }')
1675 [ $repaired -ge 32 ] ||
1676 error "(4) Fail to repair dangling reference: $repaired"
1678 echo "'stat' should fail because of not repair dangling by default"
1679 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1680 error "(5.1) stat should fail"
1681 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1682 error "(5.2) stat should fail"
# Second pass: the extra '-c' option is passed to lfsck_start.
1684 echo "Trigger layout LFSCK to repair dangling reference"
1685 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1687 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1688 mdd.${MDT_DEV}.lfsck_layout |
1689 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1691 error "(7) unexpected status"
1694 # Some asynchronous LFSCK updates may still be in progress; wait
1695 # until the repair on the target has finished. LU-4970.
1697 echo "'stat' should success after layout LFSCK repairing"
1698 wait_update_facet client "stat $DIR/$tdir/guard0 |
1699 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1700 stat $DIR/$tdir/guard0
1702 error "(8.1) unexpected size"
1705 wait_update_facet client "stat $DIR/$tdir/guard1 |
1706 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1707 stat $DIR/$tdir/guard1
1709 error "(8.2) unexpected size"
1712 repaired=$($SHOW_LAYOUT |
1713 awk '/^repaired_dangling/ { print $2 }')
1714 [ $repaired -ge 32 ] ||
1715 error "(9) Fail to repair dangling reference: $repaired"
1717 stop_full_debug_logging
1719 echo "stopall to cleanup object cache"
1722 setupall > /dev/null
1724 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Test 14b (registered by the run_test line at the end of this block):
# same scenario as the dangling-reference test above, but lfsck_start is
# given the additional options '-o' and '-d'.
# Visible steps: with fail_loc 0x1610 set on ost1, create files and a guard
# file; run the layout LFSCK with "-r -o -d" and expect
# repaired_dangling >= 32 while stat on the guard still fails; run it again
# with "-r -o -c -d" and expect stat on the guard to report size 0.
1727 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1728 skip "MDS older than 2.5.55, LU-3590"
1731 echo "The OST-object referenced by the MDT-object should be there;"
1732 echo "otherwise, the LFSCK should re-create the missing OST-object."
1733 echo "with '--delay-create-ostobj' option."
1736 check_mount_and_prep
1737 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1739 echo "Inject failure stub to simulate dangling referenced MDT-object"
1740 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1741 do_facet ost1 $LCTL set_param fail_loc=0x1610
1742 local count=$(precreated_ost_obj_count 0 0)
# Create 31 files more than the count obtained above, then a guard file.
1744 createmany -o $DIR/$tdir/f $((count + 31))
1745 touch $DIR/$tdir/guard
1746 do_facet ost1 $LCTL set_param fail_loc=0
1748 start_full_debug_logging
1750 # exhaust other pre-created dangling cases
1751 count=$(precreated_ost_obj_count 0 0)
1752 createmany -o $DIR/$tdir/a $count ||
1753 error "(0) Fail to create $count files."
1755 echo "'ls' should fail because of dangling referenced MDT-object"
1756 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1758 echo "Trigger layout LFSCK to find out dangling reference"
1759 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1761 wait_all_targets_blocked layout completed 3
1763 local repaired=$($SHOW_LAYOUT |
1764 awk '/^repaired_dangling/ { print $2 }')
1765 [ $repaired -ge 32 ] ||
1766 error "(4) Fail to repair dangling reference: $repaired"
1768 echo "'stat' should fail because of not repair dangling by default"
1769 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second pass: the extra '-c' option is passed to lfsck_start.
1771 echo "Trigger layout LFSCK to repair dangling reference"
1772 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1774 wait_all_targets_blocked layout completed 7
1776 # Some asynchronous LFSCK updates may still be in progress; wait
1777 # until the repair on the target has finished. LU-4970.
1779 echo "'stat' should success after layout LFSCK repairing"
1780 wait_update_facet client "stat $DIR/$tdir/guard |
1781 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1782 stat $DIR/$tdir/guard
1784 error "(8) unexpected size"
1787 repaired=$($SHOW_LAYOUT |
1788 awk '/^repaired_dangling/ { print $2 }')
1789 [ $repaired -ge 32 ] ||
1790 error "(9) Fail to repair dangling reference: $repaired"
1792 stop_full_debug_logging
1794 echo "stopall to cleanup object cache"
1797 setupall > /dev/null
1799 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Test 15a (registered by the run_test line at the end of this block):
# the layout LFSCK must repair OST-objects that point back to a
# non-existent MDT-object.
# Visible steps: with fail_loc 0x1611 set on all OST nodes, write f0 and a
# PFL file f1, clear fail_loc, run the layout LFSCK to completion and
# expect repaired_unmatched_pair >= 3.
1802 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1803 skip "MDS older than 2.5.55, LU-3591"
1806 echo "If the OST-object referenced by the MDT-object back points"
1807 echo "to some non-exist MDT-object, then the LFSCK should repair"
1808 echo "the OST-object to back point to the right MDT-object."
1811 check_mount_and_prep
1812 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1814 echo "Inject failure stub to make the OST-object to back point to"
1815 echo "non-exist MDT-object."
1816 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1818 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1819 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1820 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1822 error "(0) Fail to create PFL $DIR/$tdir/f1"
1823 # 'dd' first triggers a punch RPC on every OST-object, so even an
1824 # OST-object that 'dd' does not write to gets its layout
1825 # information set, as long as it is allocated (it may NOT be
1826 # allocated in pfl_3b).
1827 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1828 cancel_lru_locks osc
1829 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1831 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1832 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1834 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1835 mdd.${MDT_DEV}.lfsck_layout |
1836 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1838 error "(2) unexpected status"
1841 local repaired=$($SHOW_LAYOUT |
1842 awk '/^repaired_unmatched_pair/ { print $2 }')
1843 [ $repaired -ge 3 ] ||
1844 error "(3) Fail to repair unmatched pair: $repaired"
1846 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b (registered by the run_test line at the end of this block):
# the layout LFSCK must repair OST-objects that point back to another
# MDT-object which does not recognise them.
# Visible steps: write a guard file, then with fail_loc 0x1612 set on all
# OST nodes write 0/f0 and a PFL file f1, clear fail_loc, run the layout
# LFSCK to completion and expect repaired_unmatched_pair to equal 4.
# NOTE(review): 'stripes' is set to 2 when OSTCOUNT >= 2; its initial value
# is assigned on a line not shown in this listing -- confirm.
1849 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1850 skip "MDS older than 2.5.55, LU-3591"
1853 echo "If the OST-object referenced by the MDT-object back points"
1854 echo "to other MDT-object that doesn't recognize the OST-object,"
1855 echo "then the LFSCK should repair it to back point to the right"
1856 echo "MDT-object (the first one)."
1859 check_mount_and_prep
1860 mkdir -p $DIR/$tdir/0
1861 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1862 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1863 cancel_lru_locks osc
1865 echo "Inject failure stub to make the OST-object to back point to"
1866 echo "other MDT-object"
1869 [ $OSTCOUNT -ge 2 ] && stripes=2
1871 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1872 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1873 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1874 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1876 error "(0) Fail to create PFL $DIR/$tdir/f1"
1877 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1878 cancel_lru_locks osc
1879 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1881 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1882 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1884 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1885 mdd.${MDT_DEV}.lfsck_layout |
1886 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1888 error "(2) unexpected status"
1891 local repaired=$($SHOW_LAYOUT |
1892 awk '/^repaired_unmatched_pair/ { print $2 }')
1893 [ $repaired -eq 4 ] ||
1894 error "(3) Fail to repair unmatched pair: $repaired"
1896 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c (registered by the run_test line at the end of this block):
# the layout LFSCK racing with a delayed directory migration must not
# treat the shared layout as a multiple-reference case.
# Visible steps: create a1 on MDT index 1 with one file, set fail_loc
# 0x1803 (fail_val=5) on mds2, start "lfs migrate -m 0" in the background,
# run the layout LFSCK on all targets, then expect
# repaired_unmatched_pair == 1 and repaired_multiple_referenced == 0.
# Runs only with >= 2 MDTs and an MDS version between 2.5.55 and 2.7.55
# (both bounds exclusive).
1899 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1900 (( $MDS1_VERSION < $(version_code 2.7.55) )) ||
1901 skip "MDS newer than 2.7.55, LU-6475"
1902 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1903 skip "MDS older than 2.5.55, LU-3591"
1906 echo "According to current metadata migration implementation,"
1907 echo "before the old MDT-object is removed, both the new MDT-object"
1908 echo "and old MDT-object will reference the same LOV layout. Then if"
1909 echo "the layout LFSCK finds the new MDT-object by race, it will"
1910 echo "regard related OST-object(s) as multiple referenced case, and"
1911 echo "will try to create new OST-object(s) for the new MDT-object."
1912 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1913 echo "MDT-object before confirm the multiple referenced case."
1916 check_mount_and_prep
1917 $LFS mkdir -i 1 $DIR/$tdir/a1
1918 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1919 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1920 cancel_lru_locks osc
1922 echo "Inject failure stub on MDT1 to delay the migration"
1924 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1925 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1926 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
# The migration runs in the background so that the LFSCK can race with it.
1927 $LFS migrate -m 0 $DIR/$tdir/a1 &
1930 echo "Trigger layout LFSCK to race with the migration"
1931 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1933 wait_all_targets_blocked layout completed 2
1935 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1936 local repaired=$($SHOW_LAYOUT |
1937 awk '/^repaired_unmatched_pair/ { print $2 }')
1938 [ $repaired -eq 1 ] ||
1939 error "(3) Fail to repair unmatched pair: $repaired"
1941 repaired=$($SHOW_LAYOUT |
1942 awk '/^repaired_multiple_referenced/ { print $2 }')
1943 [ $repaired -eq 0 ] ||
1944 error "(4) Unexpectedly repaird multiple references: $repaired"
1946 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 15d (registered by the run_test line at the end of this block):
# the namespace LFSCK must not crash after a failed directory migration.
# Visible steps: create a striped directory with 100 files and 100
# sub-directories, start "lfs migrate -m 1" in the background, toggle
# fail_loc 0x1709 on the MDS facets, run the namespace LFSCK on all
# targets, retry the migration, remove the entries and reformat.
1949 (( $MDSCOUNT > 1 )) || skip "needs >= 2 MDTs"
1951 check_mount_and_prep
1953 $LFS mkdir -c -1 $DIR/$tdir || error "create $tdir failed"
1954 $LFS setdirstripe -D -i -1 -c 1 $DIR/$tdir ||
1955 error "setdirstripe failed"
1957 createmany -o $DIR/$tdir/f 100 || error "create sub files failed"
1958 createmany -d $DIR/$tdir/s 100 || error "create sub dirs failed"
1960 echo "Migrate $DIR/$tdir to MDT1"
1961 $LFS migrate -m 1 $DIR/$tdir &
# NOTE(review): the loop below addresses facet mds$i starting at i = 0,
# while other lines in this file use mds1/mds2 -- confirm that 'mds0' is
# the intended facet name.
1965 # fail sub transactions on random MDTs, which may cause some file
1967 #define OBD_FAIL_OUT_EIO 0x1709
1968 for ((i = 0; i < $MDSCOUNT; i++)); do
1969 do_facet mds$i $LCTL set_param fail_loc=0x1709
1971 do_facet mds$i $LCTL set_param fail_loc=0
1976 # LFSCK can't fully fix migrating directories, and may leave some
1977 # files inaccessible, but it shouldn't cause crash
1978 $START_NAMESPACE -A -r ||
1979 error "Fail to start LFSCK for namespace"
1981 wait_all_targets_blocked namespace completed 1
1983 # resume migration may fail because some file may be inaccessible, but
1984 # it shouldn't cause crash
1985 $LFS migrate -m 1 $DIR/$tdir
1987 # rm $tdir to avoid cleanup failure in the end
1989 $LFS rm_entry $DIR/$tdir/*
# Rebuild the filesystem from scratch after this test.
1991 REFORMAT="yes" cleanup_and_setup_lustre
1993 run_test 15d "LFSCK don't crash upon dir migration failure"
# Test 16 (registered by the run_test line at the end of this block):
# the layout LFSCK must fix an OST-object whose owner differs from the
# owner stored in the MDT-object.
# Visible steps: write f0, create 100 files under d1 as $RUNAS, chown f0 to
# 1.1 while fail_loc 0x1613 is set on the MDS, clear fail_loc, run the
# layout LFSCK to completion and expect repaired_inconsistent_owner == 1.
# NOTE(review): the creation of $DIR/$tdir/d1 is not among the lines shown
# in this listing.
1996 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1997 skip "MDS older than 2.5.55, LU-3594"
2000 echo "If the OST-object's owner information does not match the owner"
2001 echo "information stored in the MDT-object, then the LFSCK trust the"
2002 echo "MDT-object and update the OST-object's owner information."
2005 check_mount_and_prep
2006 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2007 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
2008 cancel_lru_locks osc
2010 # created but no setattr or write to the file.
2012 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
2013 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
2015 echo "Inject failure stub to skip OST-object owner changing"
2016 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
2017 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
2018 chown 1.1 $DIR/$tdir/f0
2019 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2021 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
2024 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
2026 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2027 mdd.${MDT_DEV}.lfsck_layout |
2028 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2030 error "(2) unexpected status"
# Only f0 had its owner changed under the injected failure, so exactly one
# repair is expected.
2033 local repaired=$($SHOW_LAYOUT |
2034 awk '/^repaired_inconsistent_owner/ { print $2 }')
2035 [ $repaired -eq 1 ] ||
2036 error "(3) Fail to repair inconsistent owner: $repaired"
2038 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17 (registered by the run_test line at the end of this block):
# the layout LFSCK must give separate OST-objects to MDT-objects that
# reference the same OST-object.
# Visible steps: with fail_loc 0x1614 set on the MDS, write a 1 MiB guard
# file and create f0 and a PFL file f1; check that f0 and f1 both show the
# guard's size (1048576); run the layout LFSCK to completion and expect
# repaired_multiple_referenced == 2; then write 2 MiB to f0 and to f1 and
# check that the guard's size is still 1048576.
2041 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2042 skip "MDS older than 2.5.55, LU-3594"
2045 echo "If more than one MDT-objects reference the same OST-object,"
2046 echo "and the OST-object only recognizes one MDT-object, then the"
2047 echo "LFSCK should create new OST-objects for such non-recognized"
2051 check_mount_and_prep
2052 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2054 echo "Inject failure stub to make two MDT-objects to refernce"
2055 echo "the OST-object"
2057 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
2058 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
2059 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
2060 cancel_lru_locks mdc
2061 cancel_lru_locks osc
2063 createmany -o $DIR/$tdir/f 1
2064 cancel_lru_locks mdc
2065 cancel_lru_locks osc
2067 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
2069 error "(0) Fail to create PFL $DIR/$tdir/f1"
2070 cancel_lru_locks mdc
2071 cancel_lru_locks osc
2072 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Before the repair f0 and f1 are expected to report the guard's size.
2074 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
2075 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
2076 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
2077 [ $size -eq 1048576 ] ||
2078 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
2080 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
2081 [ $size -eq 1048576 ] ||
2082 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
2084 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
2087 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
2089 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2090 mdd.${MDT_DEV}.lfsck_layout |
2091 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2093 error "(3) unexpected status"
2096 local repaired=$($SHOW_LAYOUT |
2097 awk '/^repaired_multiple_referenced/ { print $2 }')
2098 [ $repaired -eq 2 ] ||
2099 error "(4) Fail to repair multiple references: $repaired"
# After the repair, writing 2 MiB to f0 or f1 must leave the guard's size
# unchanged.
2101 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
2102 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
2103 error "(5) Fail to write f0."
2104 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2105 [ $size -eq 1048576 ] ||
2106 error "(6) guard size should be 1048576, but got $size"
2108 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
2109 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
2110 error "(7) Fail to write f1."
2111 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2112 [ $size -eq 1048576 ] ||
2113 error "(8) guard size should be 1048576, but got $size"
2115 run_test 17 "LFSCK can repair multiple references"
# Set the "cache" debug flag through $LCTL before the test 18 series runs;
# the command's stdout is discarded.
# NOTE(review): the leading "+" presumably adds the flag to the existing
# debug mask rather than replacing it -- confirm against the lctl docs.
2117 $LCTL set_param debug=+cache > /dev/null
# Test 18a: the MDT-objects still exist but their layout EA is (partly) lost;
# layout LFSCK is expected to regenerate the layout from orphan OST-objects.
# Skipped unless MDS1 is newer than 2.5.55 (LU-3336).
2120 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2121 skip "MDS older than 2.5.55, LU-3336"
2124 echo "The target MDT-object is there, but related stripe information"
2125 echo "is lost or partly lost. The LFSCK should regenerate the missing"
2126 echo "layout EA entries."
# Create the victims and remember their sizes. "ls -il" prints the inode
# number first, so the size is awk field 6 (not 5 as with plain "ls -l").
2129 check_mount_and_prep
2130 $LFS mkdir -i 0 $DIR/$tdir/a1
2131 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2132 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2134 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2136 $LFS path2fid $DIR/$tdir/a1/f1
2137 $LFS getstripe $DIR/$tdir/a1/f1
# Second victim (a2/f2) only exists when at least two MDTs are configured.
2139 if [ $MDSCOUNT -ge 2 ]; then
2140 $LFS mkdir -i 1 $DIR/$tdir/a2
2141 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2142 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2143 $LFS path2fid $DIR/$tdir/a2/f2
2144 $LFS getstripe $DIR/$tdir/a2/f2
# Third victim: a two-component PFL file, also filled with 2 MiB.
2147 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2148 error "(0) Fail to create PFL $DIR/$tdir/f3"
2150 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2152 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2154 $LFS path2fid $DIR/$tdir/f3
2155 $LFS getstripe $DIR/$tdir/f3
2157 cancel_lru_locks osc
# With fail_loc 0x1615 armed on the owning MDS, a chown on each victim is the
# operation used to trigger the layout EA loss.
2159 echo "Inject failure, to make the MDT-object lost its layout EA"
2160 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2161 do_facet mds1 $LCTL set_param fail_loc=0x1615
2162 chown 1.1 $DIR/$tdir/a1/f1
2164 if [ $MDSCOUNT -ge 2 ]; then
2165 do_facet mds2 $LCTL set_param fail_loc=0x1615
2166 chown 1.1 $DIR/$tdir/a2/f2
2169 chown 1.1 $DIR/$tdir/f3
# Disarm the failure injection and drop cached client locks.
2174 do_facet mds1 $LCTL set_param fail_loc=0
2175 if [ $MDSCOUNT -ge 2 ]; then
2176 do_facet mds2 $LCTL set_param fail_loc=0
2179 cancel_lru_locks mdc
2180 cancel_lru_locks osc
# Before the repair every victim must report a size different from the one
# saved above. f2 is compared against saved_size1 as well (f1 and f2 were both
# written with bs=1M count=2).
2182 echo "The file size should be incorrect since layout EA is lost"
2183 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2184 [ "$cur_size" != "$saved_size1" ] ||
2185 error "(1) Expect incorrect file1 size"
2187 if [ $MDSCOUNT -ge 2 ]; then
2188 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2189 [ "$cur_size" != "$saved_size1" ] ||
2190 error "(2) Expect incorrect file2 size"
2193 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2194 [ "$cur_size" != "$saved_size2" ] ||
2195 error "(1.2) Expect incorrect file3 size"
# Restart layout LFSCK with -r -o and wait for every MDT to reach "completed".
2197 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2198 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2200 for k in $(seq $MDSCOUNT); do
2201 # The LFSCK status query interval is 30 seconds. In case some
2202 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough
2203 # to guarantee that the status has been synced up.
2204 wait_update_facet mds${k} "$LCTL get_param -n \
2205 mdd.$(facet_svc mds${k}).lfsck_layout |
2206 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2207 error "(4) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be "completed".
2210 for k in $(seq $OSTCOUNT); do
2211 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2212 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2213 awk '/^status/ { print $2 }')
2214 [ "$cur_status" == "completed" ] ||
2215 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counters: 3 on mds1 and, if present, 2 on mds2.
2218 local repaired=$(do_facet mds1 $LCTL get_param -n \
2219 mdd.$(facet_svc mds1).lfsck_layout |
2220 awk '/^repaired_orphan/ { print $2 }')
2221 [ $repaired -eq 3 ] ||
2222 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2224 if [ $MDSCOUNT -ge 2 ]; then
2225 repaired=$(do_facet mds2 $LCTL get_param -n \
2226 mdd.$(facet_svc mds2).lfsck_layout |
2227 awk '/^repaired_orphan/ { print $2 }')
2228 [ $repaired -eq 2 ] ||
2229 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print FID and layout of each victim after the repair (log output only).
2232 $LFS path2fid $DIR/$tdir/a1/f1
2233 $LFS getstripe $DIR/$tdir/a1/f1
2235 if [ $MDSCOUNT -ge 2 ]; then
2236 $LFS path2fid $DIR/$tdir/a2/f2
2237 $LFS getstripe $DIR/$tdir/a2/f2
2240 $LFS path2fid $DIR/$tdir/f3
2241 $LFS getstripe $DIR/$tdir/f3
# After the repair the sizes must match the saved values again.
2243 echo "The file size should be correct after layout LFSCK scanning"
2244 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2245 [ "$cur_size" == "$saved_size1" ] ||
2246 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2248 if [ $MDSCOUNT -ge 2 ]; then
2249 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2250 [ "$cur_size" == "$saved_size1" ] ||
2251 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is on f3, so the failure message names file3.
2254 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2255 [ "$cur_size" == "$saved_size2" ] ||
2256 error "(9) Expect file3 size $saved_size2, but got $cur_size"
# Register the test body above with the framework as test 18a.
2258 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test 18b: the MDT-objects themselves are lost; layout LFSCK is expected to
# re-create them under .lustre/lost+found/MDTxxxx as <fid>-R-0, from where the
# test moves them back into the namespace. Skipped when FILESET is set or
# when MDS1 is not newer than 2.5.55 (LU-3336).
2261 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2262 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2263 skip "MDS older than 2.5.55, LU-3336"
2266 echo "The target MDT-object is lost. The LFSCK should re-create the"
2267 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2268 echo "can move it back to normal namespace manually."
# Create the victims; remember size (awk field 6 of "ls -il") and FID of each,
# because the FID is part of the expected lost+found entry name.
2271 check_mount_and_prep
2272 $LFS mkdir -i 0 $DIR/$tdir/a1
2273 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2274 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2275 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2276 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2278 $LFS getstripe $DIR/$tdir/a1/f1
2280 if [ $MDSCOUNT -ge 2 ]; then
2281 $LFS mkdir -i 1 $DIR/$tdir/a2
2282 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2283 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2284 local fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2286 $LFS getstripe $DIR/$tdir/a2/f2
2289 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2290 error "(0) Fail to create PFL $DIR/$tdir/f3"
2292 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2294 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2295 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2297 $LFS getstripe $DIR/$tdir/f3
2299 cancel_lru_locks osc
# With fail_loc 0x1616 armed on the owning MDS, removing a file is the
# operation used to simulate the lost MDT-object.
2301 echo "Inject failure, to simulate the case of missing the MDT-object"
2302 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2303 do_facet mds1 $LCTL set_param fail_loc=0x1616
2304 rm -f $DIR/$tdir/a1/f1
2306 if [ $MDSCOUNT -ge 2 ]; then
2307 do_facet mds2 $LCTL set_param fail_loc=0x1616
2308 rm -f $DIR/$tdir/a2/f2
2316 do_facet mds1 $LCTL set_param fail_loc=0
2317 if [ $MDSCOUNT -ge 2 ]; then
2318 do_facet mds2 $LCTL set_param fail_loc=0
2321 cancel_lru_locks mdc
2322 cancel_lru_locks osc
2324 # dryrun mode only checks for orphans; it does not repair them
2325 echo "Trigger layout LFSCK --dryrun to find out orphan OST-object"
2326 $START_LAYOUT --dryrun -o -r ||
2327 error "Fail to start layout LFSCK in dryrun mode"
2328 wait_all_targets_blocked layout completed 2
# The dry run must report its own parameters and 3 inconsistent orphans on
# mds1, while every directory under lost+found stays empty.
2330 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
2331 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
2332 error "Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
2334 local orphans=$(do_facet mds1 $LCTL get_param -n \
2335 mdd.$(facet_svc mds1).lfsck_layout |
2336 awk '/^inconsistent_orphan/ { print $2 }')
2337 [ $orphans -eq 3 ] ||
2338 error "Expect 3 found on mds1, but got: $orphans"
2340 # orphan parents should not be created
2342 for subdir in $MOUNT/.lustre/lost+found/*; do
2343 [ ! "$(ls -A $subdir)" ] || error "$subdir not empty"
# Now run the real (repairing) layout LFSCK and wait for all MDTs.
2346 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2347 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2349 for k in $(seq $MDSCOUNT); do
2350 # The LFSCK status query interval is 30 seconds. In case some
2351 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough
2352 # to guarantee that the status has been synced up.
2353 wait_update_facet mds${k} "$LCTL get_param -n \
2354 mdd.$(facet_svc mds${k}).lfsck_layout |
2355 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2356 error "(2) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be "completed".
2359 for k in $(seq $OSTCOUNT); do
2360 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2361 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2362 awk '/^status/ { print $2 }')
2363 [ "$cur_status" == "completed" ] ||
2364 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counters: 3 on mds1 and, if present, 2 on mds2.
2367 local repaired=$(do_facet mds1 $LCTL get_param -n \
2368 mdd.$(facet_svc mds1).lfsck_layout |
2369 awk '/^repaired_orphan/ { print $2 }')
2370 [ $repaired -eq 3 ] ||
2371 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2373 if [ $MDSCOUNT -ge 2 ]; then
2374 repaired=$(do_facet mds2 $LCTL get_param -n \
2375 mdd.$(facet_svc mds2).lfsck_layout |
2376 awk '/^repaired_orphan/ { print $2 }')
2377 [ $repaired -eq 2 ] ||
2378 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# Move each recovered <fid>-R-0 entry back to the victim's original path.
2381 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2382 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2383 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2385 if [ $MDSCOUNT -ge 2 ]; then
2386 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2387 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# The f3 move carries its own failure tag so it cannot be confused with the
# f1 move above.
2390 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2391 error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2393 $LFS path2fid $DIR/$tdir/a1/f1
2394 $LFS getstripe $DIR/$tdir/a1/f1
2396 if [ $MDSCOUNT -ge 2 ]; then
2397 $LFS path2fid $DIR/$tdir/a2/f2
2398 $LFS getstripe $DIR/$tdir/a2/f2
2401 $LFS path2fid $DIR/$tdir/f3
2402 $LFS getstripe $DIR/$tdir/f3
# The restored files must report the sizes saved before the damage.
2404 echo "The file size should be correct after layout LFSCK scanning"
2405 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2406 [ "$cur_size" == "$saved_size1" ] ||
2407 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2409 if [ $MDSCOUNT -ge 2 ]; then
2410 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2411 [ "$cur_size" == "$saved_size1" ] ||
2412 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is on f3, so the failure message names file3.
2415 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2416 [ "$cur_size" == "$saved_size2" ] ||
2417 error "(9) Expect file3 size $saved_size2, but got $cur_size"
# Register the test body above with the framework as test 18b.
2419 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test 18c: the MDT-objects are lost and the OST-objects carry no parent FID;
# layout LFSCK is expected to re-create the MDT-objects with new FIDs under
# .lustre/lost+found/MDT0000 (entries matching *-N-*). Skipped when FILESET
# is set or when MDS1 is not newer than 2.5.55 (LU-3336).
2422 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2423 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2424 skip "MDS older than 2.5.55, LU-3336"
2427 echo "The target MDT-object is lost, and the OST-object FID is missing."
2428 echo "The LFSCK should re-create the MDT-object with new FID under the "
2429 echo "directory .lustre/lost+found/MDTxxxx."
2432 check_mount_and_prep
2433 $LFS mkdir -i 0 $DIR/$tdir/a1
2434 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
# fail_loc 0x1617 stays armed on all OST nodes while the victims are written,
# so that (per the message below) the parent FID is missing.
2436 echo "Inject failure, to simulate the case of missing parent FID"
2437 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2438 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2440 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2441 $LFS getstripe $DIR/$tdir/a1/f1
2443 if [ $MDSCOUNT -ge 2 ]; then
2444 $LFS mkdir -i 1 $DIR/$tdir/a2
2445 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2446 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2447 $LFS getstripe $DIR/$tdir/a2/f2
2450 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2451 error "(0) Fail to create PFL $DIR/$tdir/f3"
2453 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2454 $LFS getstripe $DIR/$tdir/f3
2456 cancel_lru_locks osc
2457 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Second injection: with fail_loc 0x1616 armed on the owning MDS, removing a
# file is the operation used to simulate the lost MDT-object.
2459 echo "Inject failure, to simulate the case of missing the MDT-object"
2460 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2461 do_facet mds1 $LCTL set_param fail_loc=0x1616
2462 rm -f $DIR/$tdir/a1/f1
2464 if [ $MDSCOUNT -ge 2 ]; then
2465 do_facet mds2 $LCTL set_param fail_loc=0x1616
2466 rm -f $DIR/$tdir/a2/f2
2474 do_facet mds1 $LCTL set_param fail_loc=0
2475 if [ $MDSCOUNT -ge 2 ]; then
2476 do_facet mds2 $LCTL set_param fail_loc=0
2479 cancel_lru_locks mdc
2480 cancel_lru_locks osc
# Restart layout LFSCK with -r -o and wait for every MDT to reach "completed".
2482 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2483 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2485 for k in $(seq $MDSCOUNT); do
2486 # The LFSCK status query interval is 30 seconds. In case some
2487 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough
2488 # to guarantee that the status has been synced up.
2489 wait_update_facet mds${k} "$LCTL get_param -n \
2490 mdd.$(facet_svc mds${k}).lfsck_layout |
2491 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2492 error "(2) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be "completed".
2495 for k in $(seq $OSTCOUNT); do
2496 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2497 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2498 awk '/^status/ { print $2 }')
2499 [ "$cur_status" == "completed" ] ||
2500 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# $expected is chosen depending on MDSCOUNT; mds2 must report no repairs.
2503 if [ $MDSCOUNT -ge 2 ]; then
2509 local repaired=$(do_facet mds1 $LCTL get_param -n \
2510 mdd.$(facet_svc mds1).lfsck_layout |
2511 awk '/^repaired_orphan/ { print $2 }')
2512 [ $repaired -eq $expected ] ||
2513 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2515 if [ $MDSCOUNT -ge 2 ]; then
2516 repaired=$(do_facet mds2 $LCTL get_param -n \
2517 mdd.$(facet_svc mds2).lfsck_layout |
2518 awk '/^repaired_orphan/ { print $2 }')
2519 [ $repaired -eq 0 ] ||
2520 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2523 ls -ail $MOUNT/.lustre/lost+found/
# The -name pattern is quoted so that find receives it literally; unquoted,
# the shell could expand it against files in the current directory first.
# cname is kept local so it does not leak out of the test function.
2525 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2526 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2527 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
2529 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2532 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2533 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2534 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2536 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
2537 [ ! -z "$cname" ] ||
2538 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
# Register the test body above with the framework as test 18c.
2540 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d: the layout EA of f2/f4 is corrupted so that it points at an
# OST-object that no longer exists, while the original OST-object survives as
# an orphan. Layout LFSCK is expected to re-attach the orphan (2 orphan
# repairs, 0 dangling repairs). Skipped unless MDS1 is newer than 2.5.55.
2543 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2544 skip "MDS older than 2.5.55, LU-3336"
2547 echo "The target MDT-object layout EA is corrupted, but the right"
2548 echo "OST-object is still alive as orphan. The layout LFSCK will"
2549 echo "not create new OST-object to occupy such slot."
# f1/f3 hold "guard", f2/f4 hold "foo"; f4 is a two-component PFL file.
# Sizes of f2 and f4 are saved (awk field 6 of "ls -il") for the final check.
2552 check_mount_and_prep
2554 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2555 echo "guard" > $DIR/$tdir/a1/f1
2556 echo "foo" > $DIR/$tdir/a1/f2
2558 echo "guard" > $DIR/$tdir/a1/f3
2559 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2560 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2561 echo "foo" > $DIR/$tdir/a1/f4
2563 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2564 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2565 $LFS path2fid $DIR/$tdir/a1/f1
2566 $LFS getstripe $DIR/$tdir/a1/f1
2567 $LFS path2fid $DIR/$tdir/a1/f2
2568 $LFS getstripe $DIR/$tdir/a1/f2
2569 $LFS path2fid $DIR/$tdir/a1/f3
2570 $LFS getstripe $DIR/$tdir/a1/f3
2571 $LFS path2fid $DIR/$tdir/a1/f4
2572 $LFS getstripe $DIR/$tdir/a1/f4
2573 cancel_lru_locks osc
2575 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2576 echo "to reference the same OST-object (which is f1's OST-obejct)."
2577 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2578 echo "dangling reference case, but f2's old OST-object is there."
2580 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2581 echo "to reference the same OST-object (which is f3's OST-obejct)."
2582 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2583 echo "dangling reference case, but f4's old OST-object is there."
# With fail_loc 0x1618 armed, chown f2/f4 and then remove f1/f3; the messages
# above describe the resulting dangling references.
2586 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2587 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2588 chown 1.1 $DIR/$tdir/a1/f2
2589 chown 1.1 $DIR/$tdir/a1/f4
2590 rm -f $DIR/$tdir/a1/f1
2591 rm -f $DIR/$tdir/a1/f3
2594 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2596 echo "stopall to cleanup object cache"
2599 setupall > /dev/null
# Restart layout LFSCK with -r -o -c -d and wait for every MDT to complete.
2601 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2602 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
2604 for k in $(seq $MDSCOUNT); do
2605 # The LFSCK status query interval is 30 seconds. In case some
2606 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough
2607 # to guarantee that the status has been synced up.
2608 wait_update_facet mds${k} "$LCTL get_param -n \
2609 mdd.$(facet_svc mds${k}).lfsck_layout |
2610 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2611 error "(3) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be "completed".
2614 for k in $(seq $OSTCOUNT); do
2615 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2616 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2617 awk '/^status/ { print $2 }')
2618 [ "$cur_status" == "completed" ] ||
2619 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Two orphans must have been repaired and no dangling reference repaired.
2622 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2623 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2624 awk '/^repaired_orphan/ { print $2 }')
2625 [ $repaired -eq 2 ] ||
2626 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
2628 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2629 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2630 awk '/^repaired_dangling/ { print $2 }')
2631 [ $repaired -eq 0 ] ||
2632 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
# f2 and f4 must report their saved sizes again.
2634 echo "The file size should be correct after layout LFSCK scanning"
2635 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2636 [ "$cur_size" == "$saved_size1" ] ||
2637 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2639 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2640 [ "$cur_size" == "$saved_size2" ] ||
2641 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# Dump content, FID and layout of f2/f4 to the log; nothing is asserted here.
2643 echo "The LFSCK should find back the original data."
2644 cat $DIR/$tdir/a1/f2
2645 $LFS path2fid $DIR/$tdir/a1/f2
2646 $LFS getstripe $DIR/$tdir/a1/f2
2647 cat $DIR/$tdir/a1/f4
2648 $LFS path2fid $DIR/$tdir/a1/f4
2649 $LFS getstripe $DIR/$tdir/a1/f4
# Register the test body above with the framework as test 18d.
2651 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e: same damage as 18d, but f2/f4 are written to while LFSCK is held
# in scanning-phase2, so the layout slot is occupied by a modified new
# OST-object. LFSCK is expected to keep the new data and expose each old
# orphan as a *-C-* stub under .lustre/lost+found/MDT0000. Skipped when
# FILESET is set or when MDS1 is not newer than 2.5.55 (LU-3336).
2654 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2655 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2656 skip "MDS older than 2.5.55, LU-3336"
2659 echo "The target MDT-object layout EA slot is occpuied by some new"
2660 echo "created OST-object when repair dangling reference case. Such"
2661 echo "conflict OST-object has been modified by others. To keep the"
2662 echo "new data, the LFSCK will create a new file to refernece this"
2663 echo "old orphan OST-object."
# f1/f3 hold "guard", f2/f4 hold "foo"; f4 is a two-component PFL file.
# Sizes of f2 and f4 are saved (awk field 6 of "ls -il") for the stub checks.
2666 check_mount_and_prep
2668 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2669 echo "guard" > $DIR/$tdir/a1/f1
2670 echo "foo" > $DIR/$tdir/a1/f2
2672 echo "guard" > $DIR/$tdir/a1/f3
2673 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2674 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2675 echo "foo" > $DIR/$tdir/a1/f4
2677 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2678 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2680 $LFS path2fid $DIR/$tdir/a1/f1
2681 $LFS getstripe $DIR/$tdir/a1/f1
2682 $LFS path2fid $DIR/$tdir/a1/f2
2683 $LFS getstripe $DIR/$tdir/a1/f2
2684 $LFS path2fid $DIR/$tdir/a1/f3
2685 $LFS getstripe $DIR/$tdir/a1/f3
2686 $LFS path2fid $DIR/$tdir/a1/f4
2687 $LFS getstripe $DIR/$tdir/a1/f4
2688 cancel_lru_locks osc
2690 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2691 echo "to reference the same OST-object (which is f1's OST-obejct)."
2692 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2693 echo "dangling reference case, but f2's old OST-object is there."
2695 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2696 echo "to reference the same OST-object (which is f3's OST-obejct)."
2697 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2698 echo "dangling reference case, but f4's old OST-object is there."
# With fail_loc 0x1618 armed, chown f2/f4 and then remove f1/f3; the messages
# above describe the resulting dangling references.
2701 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2702 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2703 chown 1.1 $DIR/$tdir/a1/f2
2704 chown 1.1 $DIR/$tdir/a1/f4
2705 rm -f $DIR/$tdir/a1/f1
2706 rm -f $DIR/$tdir/a1/f3
2709 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2711 echo "stopall to cleanup object cache"
2714 setupall > /dev/null
# Arm fail_loc 0x1602 with fail_val=10 before starting LFSCK, then wait for
# mds1 to report "scanning-phase2" so the writes below land mid-scan.
2716 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2717 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2719 start_full_debug_logging
2721 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2722 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2724 wait_update_facet mds1 "$LCTL get_param -n \
2725 mdd.$(facet_svc mds1).lfsck_layout |
2726 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2727 error "(3) MDS1 is not the expected 'scanning-phase2'"
2729 # to guarantee all updates are synced.
2733 echo "Write new data to f2/f4 to modify the new created OST-object."
2734 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2735 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
# Clear the delay injection and let LFSCK run to completion on all MDTs.
2737 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2739 for k in $(seq $MDSCOUNT); do
2740 # The LFSCK status query interval is 30 seconds. In case some
2741 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough
2742 # to guarantee that the status has been synced up.
2743 wait_update_facet mds${k} "$LCTL get_param -n \
2744 mdd.$(facet_svc mds${k}).lfsck_layout |
2745 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2746 error "(4) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be "completed".
2749 for k in $(seq $OSTCOUNT); do
2750 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2751 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2752 awk '/^status/ { print $2 }')
2753 [ "$cur_status" == "completed" ] ||
2754 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2757 stop_full_debug_logging
2759 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2760 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2761 awk '/^repaired_orphan/ { print $2 }')
2762 [ $repaired -eq 2 ] ||
2763 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
# Exactly two *-C-* stubs are expected under lost+found/MDT0000.
2765 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2766 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2767 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2769 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2770 if [ $count -ne 2 ]; then
2771 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2772 error "(8) Expect 2 stubs under lost+found, but got $count"
# The -name pattern is quoted so that find receives it literally; unquoted,
# the shell could expand it against files in the current directory first.
# cname is kept local so it does not leak out of the test function.
# The first and last stub found must each have one of the two saved sizes.
2775 echo "The stub file should keep the original f2 or f4 data"
2776 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | head -n 1)
2777 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2778 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2779 error "(9) Got unexpected $cur_size"
2782 $LFS path2fid $cname
2783 $LFS getstripe $cname
2785 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | tail -n 1)
2786 cur_size=$(ls -il $cname | awk '{ print $6 }')
2787 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2788 error "(10) Got unexpected $cur_size"
2791 $LFS path2fid $cname
2792 $LFS getstripe $cname
# Dump content, FID and layout of f2/f4 to the log; nothing is asserted here.
2794 echo "The f2/f4 should contains new data."
2795 cat $DIR/$tdir/a1/f2
2796 $LFS path2fid $DIR/$tdir/a1/f2
2797 $LFS getstripe $DIR/$tdir/a1/f2
2798 cat $DIR/$tdir/a1/f4
2799 $LFS path2fid $DIR/$tdir/a1/f4
2800 $LFS getstripe $DIR/$tdir/a1/f4
# Register the test body above with the framework as test 18e.
2802 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f: MDT-objects are lost and, in the first LFSCK run, fail_loc 0x161c
# is armed on mds1 so that the run ends as "partial" on the MDTs and on ost1
# while ost2 completes. Only part of the orphans is repaired in that run; a
# second, undisturbed run must repair the rest. Requires at least 2 OSTs.
2805 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2808 echo "The target MDT-object is lost. The LFSCK should re-create the"
2809 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2810 echo "to verify some OST-object(s) during the first stage-scanning,"
2811 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2812 echo "should not be affected."
# Victims on the first MDT: a1/f1 (stripe count 1) and a2/f2 (stripe count 2);
# a1/guard is written but never removed.
2815 check_mount_and_prep
2816 $LFS mkdir -i 0 $DIR/$tdir/a1
2817 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2818 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2819 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2820 $LFS mkdir -i 0 $DIR/$tdir/a2
2821 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2822 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2823 $LFS getstripe $DIR/$tdir/a1/f1
2824 $LFS getstripe $DIR/$tdir/a2/f2
# The same pair of victims (a3/f3, a4/f4) is created via "mkdir -i 1" when at
# least two MDTs are configured.
2826 if [ $MDSCOUNT -ge 2 ]; then
2827 $LFS mkdir -i 1 $DIR/$tdir/a3
2828 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2829 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2830 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2831 $LFS mkdir -i 1 $DIR/$tdir/a4
2832 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2833 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2834 $LFS getstripe $DIR/$tdir/a3/f3
2835 $LFS getstripe $DIR/$tdir/a4/f4
2838 cancel_lru_locks osc
# With fail_loc 0x1616 armed on the owning MDS, removing a file is the
# operation used to simulate the lost MDT-object.
2840 echo "Inject failure, to simulate the case of missing the MDT-object"
2841 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2842 do_facet mds1 $LCTL set_param fail_loc=0x1616
2843 rm -f $DIR/$tdir/a1/f1
2844 rm -f $DIR/$tdir/a2/f2
2846 if [ $MDSCOUNT -ge 2 ]; then
2847 do_facet mds2 $LCTL set_param fail_loc=0x1616
2848 rm -f $DIR/$tdir/a3/f3
2849 rm -f $DIR/$tdir/a4/f4
2855 do_facet mds1 $LCTL set_param fail_loc=0
2856 if [ $MDSCOUNT -ge 2 ]; then
2857 do_facet mds2 $LCTL set_param fail_loc=0
2860 cancel_lru_locks mdc
2861 cancel_lru_locks osc
2863 echo "Inject failure, to simulate the OST0 fail to handle"
2864 echo "MDT0 LFSCK request during the first-stage scanning."
2865 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2866 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
# First run: every MDT and ost1 must end as "partial", ost2 as "completed".
2868 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2869 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2871 for k in $(seq $MDSCOUNT); do
2872 # The LFSCK status query interval is 30 seconds. In case some
2873 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough
2874 # to guarantee that the status has been synced up.
2875 wait_update_facet mds${k} "$LCTL get_param -n \
2876 mdd.$(facet_svc mds${k}).lfsck_layout |
2877 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2878 error "(2) MDS${k} is not the expected 'partial'"
2881 wait_update_facet ost1 "$LCTL get_param -n \
2882 obdfilter.$(facet_svc ost1).lfsck_layout |
2883 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2884 error "(3) OST1 is not the expected 'partial'"
2887 wait_update_facet ost2 "$LCTL get_param -n \
2888 obdfilter.$(facet_svc ost2).lfsck_layout |
2889 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2890 error "(4) OST2 is not the expected 'completed'"
# Disarm the injection; only one orphan per MDT is expected to be repaired
# by the partial run.
2893 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2895 local repaired=$(do_facet mds1 $LCTL get_param -n \
2896 mdd.$(facet_svc mds1).lfsck_layout |
2897 awk '/^repaired_orphan/ { print $2 }')
2898 [ $repaired -eq 1 ] ||
2899 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2901 if [ $MDSCOUNT -ge 2 ]; then
2902 repaired=$(do_facet mds2 $LCTL get_param -n \
2903 mdd.$(facet_svc mds2).lfsck_layout |
2904 awk '/^repaired_orphan/ { print $2 }')
2905 [ $repaired -eq 1 ] ||
2906 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second run without failure injection: everything must complete.
2909 echo "Trigger layout LFSCK on all devices again to cleanup"
2910 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2912 for k in $(seq $MDSCOUNT); do
2913 # The LFSCK status query interval is 30 seconds. In case some
2914 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough
2915 # to guarantee that the status has been synced up.
2916 wait_update_facet mds${k} "$LCTL get_param -n \
2917 mdd.$(facet_svc mds${k}).lfsck_layout |
2918 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2919 error "(8) MDS${k} is not the expected 'completed'"
# cur_status is declared local, as in the sibling tests, so it does not leak
# into the global scope of the sourced test framework.
2922 for k in $(seq $OSTCOUNT); do
2923 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2924 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2925 awk '/^status/ { print $2 }')
2926 [ "$cur_status" == "completed" ] ||
2927 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# The second run is expected to report 2 repaired orphans per MDT.
2931 local repaired=$(do_facet mds1 $LCTL get_param -n \
2932 mdd.$(facet_svc mds1).lfsck_layout |
2933 awk '/^repaired_orphan/ { print $2 }')
2934 [ $repaired -eq 2 ] ||
2935 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2937 if [ $MDSCOUNT -ge 2 ]; then
2938 repaired=$(do_facet mds2 $LCTL get_param -n \
2939 mdd.$(facet_svc mds2).lfsck_layout |
2940 awk '/^repaired_orphan/ { print $2 }')
2941 [ $repaired -eq 2 ] ||
2942 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
# Register the test body above with the framework as test 18f.
2945 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Test 18g: the MDT-object is lost while its OI mapping is kept (fail_loc
# 0x162e); layout LFSCK must still re-create the object as <fid>-R-0 under
# .lustre/lost+found/MDT0000. Skipped when FILESET is set.
2948 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2951 echo "The target MDT-object is lost, but related OI mapping is there"
2952 echo "The LFSCK should recreate the lost MDT-object without affected"
2953 echo "by the stale OI mapping."
# One victim file, created with "-c -1" and filled with OSTCOUNT MiB; its FID
# is saved because it is part of the expected lost+found entry name.
2956 check_mount_and_prep
2957 $LFS mkdir -i 0 $DIR/$tdir/a1
2958 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2959 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
2960 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2962 $LFS getstripe $DIR/$tdir/a1/f1
2963 cancel_lru_locks osc
# With fail_loc 0x162e armed on mds1, removing the file is the operation used
# to simulate the loss.
2965 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2966 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2967 do_facet mds1 $LCTL set_param fail_loc=0x162e
2968 rm -f $DIR/$tdir/a1/f1
2970 do_facet mds1 $LCTL set_param fail_loc=0
2971 cancel_lru_locks mdc
2972 cancel_lru_locks osc
# Restart layout LFSCK with -r -o and wait for every MDT to reach "completed".
2974 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2975 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2977 for k in $(seq $MDSCOUNT); do
2978 # The LFSCK status query interval is 30 seconds. In case some
2979 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough
2980 # to guarantee that the status has been synced up.
2981 wait_update_facet mds${k} "$LCTL get_param -n \
2982 mdd.$(facet_svc mds${k}).lfsck_layout |
2983 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2984 error "(2) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be "completed".
2987 for k in $(seq $OSTCOUNT); do
2988 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2989 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2990 awk '/^status/ { print $2 }')
2991 [ "$cur_status" == "completed" ] ||
2992 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report OSTCOUNT repaired orphans.
2995 local repaired=$(do_facet mds1 $LCTL get_param -n \
2996 mdd.$(facet_svc mds1).lfsck_layout |
2997 awk '/^repaired_orphan/ { print $2 }')
2998 [ $repaired -eq $OSTCOUNT ] ||
2999 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
# Move the recovered entry back to the original path and log its FID/layout.
3001 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
3002 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
3003 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3005 $LFS path2fid $DIR/$tdir/a1/f1
3006 $LFS getstripe $DIR/$tdir/a1/f1
# Register the test body above with the framework as test 18g.
3008 run_test 18g "Find out orphan OST-object and repair it (7)"
# test_18h: layout LFSCK must repair a PFL file whose component extent range
# was corrupted.
#
# A two-component PFL file is written on both sides of the 2M boundary and
# copied to a guard file.  fail_loc 0x162f (OBD_FAIL_LFSCK_BAD_PFL_RANGE) is
# set on $SINGLEMDS around a chown of the file; a following write is expected
# to fail.  After a layout LFSCK run the test expects 2 repaired orphans,
# unchanged data compared with the guard copy, and a working write.
#
# NOTE(review): the listing this block was recovered from carried line-number
# residue and lacked the function header, closing brace and loop terminators;
# they are restored here.  The name assumes run_test maps "18h" to
# test_18h -- confirm against test-framework.sh.
test_18h() {
	echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
	echo "the layout LFSCK will keep the bad PFL file(s) there without"
	echo "scanning its OST-object(s). Then in the second stage scanning,"
	echo "the OST will return related OST-object(s) to the MDT as orphan."
	echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
	echo "the 'orphan(s)' stripe information."

	check_mount_and_prep

	$LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
		error "(0) Fail to create PFL $DIR/$tdir/f0"

	cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
		error "(1.1) Fail to write $DIR/$tdir/f0"

	# seek=2 with bs=1M starts this write at the second component.
	dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
		error "(1.2) Fail to write $DIR/$tdir/f0"

	cp $DIR/$tdir/f0 $DIR/$tdir/guard

	echo "Inject failure stub to simulate bad PFL extent range"
	#define OBD_FAIL_LFSCK_BAD_PFL_RANGE	0x162f
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f

	chown 1.1 $DIR/$tdir/f0

	cancel_lru_locks mdc
	cancel_lru_locks osc
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
		error "(2) Write to bad PFL file should fail"

	echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
	$START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"

	local k
	local cur_status
	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(4.1) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
			     obdfilter.$(facet_svc ost${k}).lfsck_layout |
			     awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$($SHOW_LAYOUT |
			 awk '/^repaired_orphan/ { print $2 }')
	[ $repaired -eq 2 ] ||
		error "(5) Fail to repair crashed PFL range: $repaired"

	echo "Data in $DIR/$tdir/f0 should not be broken"
	diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
		error "(6) Data in $DIR/$tdir/f0 is broken"

	echo "Write should succeed after LFSCK repairing the bad PFL range"
	dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
		error "(7) Write should succeed after LFSCK"
}
run_test 18h "LFSCK can repair crashed PFL extent range"
# Drop the "cache" flag from the local debug mask before the following tests;
# the command's own output is discarded.
$LCTL set_param debug=-cache > /dev/null
# test_19a: with lfsck_verify_pfid enabled on OST0000, a read that carries a
# wrong parent FID must be refused.
#
# Files are created while lfsck_verify_pfid is 0, then verification is
# switched to 1 and fail_loc 0x1619 (OBD_FAIL_LFSCK_INVALID_PFID) is set on
# the local node; reading either the plain file or the PFL file is then
# expected to fail.
#
# NOTE(review): the listing this block was recovered from carried line-number
# residue and lacked the function header and closing brace; they are restored
# here.  The name assumes run_test maps "19a" to test_19a -- confirm against
# test-framework.sh.
test_19a() {
	(( $MDS1_VERSION > $(version_code 2.5.55) )) ||
		skip "MDS older than 2.5.55, LU-3951"

	check_mount_and_prep
	$LFS setstripe -c 1 -i 0 $DIR/$tdir

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0

	echo "foo1" > $DIR/$tdir/a0
	$LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
		error "(0) Fail to create PFL $DIR/$tdir/a1"
	echo "foo2" > $DIR/$tdir/a1
	echo "guard" > $DIR/$tdir/a2
	cancel_lru_locks osc

	echo "Inject failure, then client will offer wrong parent FID when read"
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1

	#define OBD_FAIL_LFSCK_INVALID_PFID	0x1619
	$LCTL set_param fail_loc=0x1619

	echo "Read RPC with wrong parent FID should be denied"
	cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
	cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
	$LCTL set_param fail_loc=0
}
run_test 19a "OST-object inconsistency self detect"
# test_19b: with lfsck_verify_pfid enabled on OST0000, a read that carries
# the right parent FID must be accepted and the OST-side parent FID repaired.
#
# Two files (one plain, one PFL) are created while fail_loc 0x1611
# (OBD_FAIL_LFSCK_UNMATCHED_PAIR1) is set on the OST nodes.  The "repaired"
# counter read from lfsck_verify_pfid is expected to be 0 before
# verification is enabled and 2 after both files have been read.
#
# NOTE(review): the listing this block was recovered from carried line-number
# residue and lacked the function header and closing brace; they are restored
# here.  The name assumes run_test maps "19b" to test_19b -- confirm against
# test-framework.sh.
test_19b() {
	(( $MDS1_VERSION > $(version_code 2.5.55) )) ||
		skip "MDS older than 2.5.55, LU-3951"

	check_mount_and_prep
	$LFS setstripe -c 1 -i 0 $DIR/$tdir

	echo "Inject failure stub to make the OST-object to back point to"
	echo "non-exist MDT-object"

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0

	#define OBD_FAIL_LFSCK_UNMATCHED_PAIR1	0x1611
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
	echo "foo1" > $DIR/$tdir/f0
	$LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
		error "(0) Fail to create PFL $DIR/$tdir/f1"
	echo "foo2" > $DIR/$tdir/f1
	cancel_lru_locks osc
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0

	do_facet ost1 $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
	echo "Nothing should be fixed since self detect and repair is disabled"
	local repaired=$(do_facet ost1 $LCTL get_param -n \
			obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
			awk '/^repaired/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(1) Expected 0 repaired, but got $repaired"

	echo "Read RPC with right parent FID should be accepted,"
	echo "and cause parent FID on OST to be fixed"

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1

	cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
	cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"

	repaired=$(do_facet ost1 $LCTL get_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
		awk '/^repaired/ { print $2 }')
	# Both f0 and f1 were read, so two repairs are expected; the message
	# now states the same number the check uses.
	[ $repaired -eq 2 ] ||
		error "(3) Expected 2 repaired, but got $repaired"
}
run_test 19b "OST-object inconsistency self repair"
# Values compared against the output of "lfs getstripe -L" by the 20a/20b
# tests: the first for a layout flagged with a LOV EA hole, the second for a
# layout without one.
PATTERN_WITH_HOLE="40000001"
PATTERN_WITHOUT_HOLE="raid0"
# test_20a: layout LFSCK must handle orphans whose rebuilt layout has a dummy
# (hole) LOV EA slot, and the client must cope with such files.
#
# Files f0..f2 (plus f3 when there are more than 2 OSTs) are written and then
# removed under fault injection on mds1: 0x1616 (OBD_FAIL_LFSCK_LOST_MDTOBJ)
# for f0, 0x161a (OBD_FAIL_LFSCK_LOST_SPEOBJ) with fail_val 0/1/2 for f1/f2/f3.
# After the layout LFSCK run each <FID>-R-0 file under
# .lustre/lost+found/MDT0000 is checked for pattern, stripe count, size and
# read/write behaviour.
#
# NOTE(review): the listing this block was recovered from carried line-number
# residue and lacked the function header, closing brace and the
# done/else/fi terminators; they are restored here.  The bcount assignments
# were also absent and are reconstructed from the block-layout comment and
# the size checks below (bs=4096, 1M stripes => 256 blocks per stripe) --
# confirm.  The name assumes run_test maps "20a" to test_20a.
test_20a() {
	[ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"
	(( $MDS1_VERSION > $(version_code 2.5.55) )) ||
		skip "MDS older than 2.5.55, LU-4887"

	echo "The target MDT-object and some of its OST-object are lost."
	echo "The LFSCK should find out the left OST-objects and re-create"
	echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
	echo "with the partial OST-objects (LOV EA hole)."

	echo "New client can access the file with LOV EA hole via normal"
	echo "system tools or commands without crash the system."

	echo "For old client, even though it cannot access the file with"
	echo "LOV EA hole, it should not cause the system crash."

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/a1

	local bcount
	if [ $OSTCOUNT -gt 2 ]; then
		$LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
		bcount=513
	else
		$LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
		bcount=257
	fi

	# 256 blocks on the stripe0.
	# 1 block on the stripe1 for 2 OSTs case.
	# 256 blocks on the stripe1 for other cases.
	# 1 block on the stripe2 if OSTs > 2
	dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount

	local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
	local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
	local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
	local fid3

	$LFS getstripe $DIR/$tdir/a1/f0
	$LFS getstripe $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a1/f2

	if [ $OSTCOUNT -gt 2 ]; then
		dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
		fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
		$LFS getstripe $DIR/$tdir/a1/f3
	fi

	cancel_lru_locks osc

	echo "Inject failure..."
	echo "To simulate f0 lost MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/a1/f0

	echo "To simulate f1 lost MDT-object and OST-object0"
	#define OBD_FAIL_LFSCK_LOST_SPEOBJ	0x161a
	do_facet mds1 $LCTL set_param fail_loc=0x161a
	rm -f $DIR/$tdir/a1/f1

	echo "To simulate f2 lost MDT-object and OST-object1"
	do_facet mds1 $LCTL set_param fail_val=1
	rm -f $DIR/$tdir/a1/f2

	if [ $OSTCOUNT -gt 2 ]; then
		echo "To simulate f3 lost MDT-object and OST-object2"
		do_facet mds1 $LCTL set_param fail_val=2
		rm -f $DIR/$tdir/a1/f3
	fi

	umount_client $MOUNT

	do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"

	local k
	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(2) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(3) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	if [ $OSTCOUNT -gt 2 ]; then
		[ $repaired -eq 9 ] ||
		error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
	else
		[ $repaired -eq 4 ] ||
		error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
	fi

	mount_client $MOUNT || error "(5.0) Fail to start client!"

	LOV_PATTERN_F_HOLE=0x40000000

	#
	# ${fid0}-R-0 is the old f0
	#
	local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
	echo "Check $name, which is the old f0"

	$LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"

	local pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(5.2) NOT expect pattern flag hole, but got $pattern"

	local stripes=$($LFS getstripe -c $name)
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(5.3.1) expect the stripe count is 3, but got $stripes"
	else
		[ $stripes -eq 2 ] ||
		error "(5.3.2) expect the stripe count is 2, but got $stripes"
	fi

	local size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(5.4) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null || error "(5.5) cannot read $name"

	echo "dummy" >> $name || error "(5.6) cannot write $name"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"

	touch $name || error "(5.8) cannot touch $name"

	rm -f $name || error "(5.9) cannot unlink $name"

	#
	# ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
	if [ $OSTCOUNT -gt 2 ]; then
		echo "Check $name, it contains the old f1's stripe1 and stripe2"
	else
		echo "Check $name, it contains the old f1's stripe1"
	fi

	$LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(6.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(6.3.1) expect the stripe count is 3, but got $stripes"
	else
		[ $stripes -eq 2 ] ||
		error "(6.3.2) expect the stripe count is 2, but got $stripes"
	fi

	size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(6.4) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null && error "(6.5) normal read $name should fail"

	# conv=noerror keeps dd going past failed blocks; each failed block
	# yields one "Input/output error" line that is counted here.
	local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			 bs=4096 2>&1 | grep "Input/output error" | wc -l)

	[ $failures -eq 256 ] ||
		error "(6.6) expect 256 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(6.7) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
		error "(6.8) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
		error "(6.9) write to normal stripe should NOT fail"

	echo "foo" >> $name && error "(6.10) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"

	touch $name || error "(6.12) cannot touch $name"

	rm -f $name || error "(6.13) cannot unlink $name"

	#
	# ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
	if [ $OSTCOUNT -gt 2 ]; then
		echo "Check $name, it contains the old f2's stripe0 and stripe2"
	else
		echo "Check $name, it contains the old f2's stripe0"
	fi

	$LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(7.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	size=$(stat $name | awk '/Size:/ { print $2 }')
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(7.3.1) expect the stripe count is 3, but got $stripes"

		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.4.1) expect size $((4096 * $bcount)), but got $size"

		cat $name > /dev/null &&
			error "(7.5.1) normal read $name should fail"

		failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			   bs=4096 2>&1 | grep "Input/output error" | wc -l)

		[ $failures -eq 256 ] ||
		error "(7.6) expect 256 IO failures, but get $failures"

		size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.7) expect the size $((4096 * $bcount)), but got $size"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=300 && error "(7.8.0) write to the LOV EA hole should fail"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
		error "(7.8.1) write to normal stripe should NOT fail"

		echo "foo" >> $name &&
			error "(7.8.3) append write $name should fail"

		chown $RUNAS_ID:$RUNAS_GID $name ||
			error "(7.9.1) cannot chown on $name"

		touch $name || error "(7.10.1) cannot touch $name"
	else
		[ $stripes -eq 2 ] ||
		error "(7.3.2) expect the stripe count is 2, but got $stripes"

		[ $size -eq $((4096 * (256 + 0))) ] ||
		error "(7.4.2) expect the size $((4096 * 256)), but got $size"

		cat $name > /dev/null &&
			error "(7.5.2) normal read $name should fail"

		failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			   bs=4096 2>&1 | grep "Input/output error" | wc -l)
		[ $failures -eq 256 ] ||
		error "(7.6.2) expect 256 IO failures, but get $failures"

		size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.7.2) expect the size $((4096 * $bcount)), got $size"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=256 && error "(7.8.2) write to the LOV EA hole should fail"

		chown $RUNAS_ID:$RUNAS_GID $name ||
			error "(7.9.2) cannot chown on $name"

		touch $name || error "(7.10.2) cannot touch $name"
	fi

	rm -f $name || error "(7.11) cannot unlink $name"

	# f3 only exists when there are more than 2 OSTs.
	[ $OSTCOUNT -le 2 ] && return

	#
	# ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
	echo "Check $name, which contains the old f3's stripe0 and stripe1"

	$LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	[ $stripes -eq 3 ] ||
		error "(8.3) expect the stripe count is 3, but got $stripes"

	size=$(stat $name | awk '/Size:/ { print $2 }')

	[ $size -eq $((4096 * (256 + 256 + 0))) ] ||
		error "(8.4) expect the size $((4096 * 512)), but got $size"

	cat $name > /dev/null &&
		error "(8.5) normal read $name should fail"

	failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
		   bs=4096 2>&1 | grep "Input/output error" | wc -l)

	[ $failures -eq 256 ] ||
		error "(8.6) expect 256 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(8.7) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=512 && error "(8.8) write to the LOV EA hole should fail"

	chown $RUNAS_ID:$RUNAS_GID $name ||
		error "(8.9) cannot chown on $name"

	touch $name || error "(8.10) cannot touch $name"

	rm -f $name || error "(8.11) cannot unlink $name"
}
run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# test_20b: PFL variant of test_20a -- layout LFSCK must handle orphans whose
# rebuilt components have a dummy (hole) LOV EA slot.
#
# Three PFL files with two 2-stripe components ([0,2M) and [2M,EOF)) are
# written with 769 blocks of 4096 bytes and then lost under fault injection
# on $SINGLEMDS: 0x1616 (OBD_FAIL_LFSCK_LOST_MDTOBJ) for f0, 0x161a
# (OBD_FAIL_LFSCK_LOST_SPEOBJ) with fail_val 0 and 1 for f1 and f2.  After
# the layout LFSCK run 8 repaired orphans are expected and each <FID>-R-0
# file under .lustre/lost+found/MDT0000 is checked per component for
# pattern, stripe count, extent range, size and read/write behaviour.
#
# NOTE(review): the listing this block was recovered from carried line-number
# residue and lacked the function header, closing brace and loop terminators;
# they are restored here.  The three "rm -f" steps after each injection were
# also absent and are reconstructed from the echoed descriptions and the
# parallel steps in the non-PFL variant -- confirm.  The name assumes
# run_test maps "20b" to test_20b.
test_20b() {
	[ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"
	(( $MDS1_VERSION > $(version_code 2.5.55) )) ||
		skip "MDS older than 2.5.55, LU-4887"

	echo "The target MDT-object and some of its OST-object are lost."
	echo "The LFSCK should find out the left OST-objects and re-create"
	echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
	echo "with the partial OST-objects (LOV EA hole)."

	echo "New client can access the file with LOV EA hole via normal"
	echo "system tools or commands without crash the system - PFL case."

	check_mount_and_prep

	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
		error "(0) Fail to create PFL file $DIR/$tdir/f0"
	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
		error "(1) Fail to create PFL file $DIR/$tdir/f1"
	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
		error "(2) Fail to create PFL file $DIR/$tdir/f2"

	local bcount=$((256 * 3 + 1))

	dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount

	local fid0=$($LFS path2fid $DIR/$tdir/f0)
	local fid1=$($LFS path2fid $DIR/$tdir/f1)
	local fid2=$($LFS path2fid $DIR/$tdir/f2)

	$LFS getstripe $DIR/$tdir/f0
	$LFS getstripe $DIR/$tdir/f1
	$LFS getstripe $DIR/$tdir/f2

	cancel_lru_locks mdc
	cancel_lru_locks osc

	echo "Inject failure..."
	echo "To simulate f0 lost MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/f0

	echo "To simulate the case of f1 lost MDT-object and "
	echo "the first OST-object in each PFL component"
	#define OBD_FAIL_LFSCK_LOST_SPEOBJ	0x161a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
	rm -f $DIR/$tdir/f1

	echo "To simulate the case of f2 lost MDT-object and "
	echo "the second OST-object in each PFL component"
	do_facet $SINGLEMDS $LCTL set_param fail_val=1
	rm -f $DIR/$tdir/f2

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"

	local k
	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(4) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(5) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	[ $repaired -eq 8 ] ||
		error "(6) Expect 8 fixed on mds1, but got: $repaired"

	#
	# ${fid0}-R-0 is the old f0
	#
	local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
	echo "Check $name, which is the old f0"

	$LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"

	local pattern=$($LFS getstripe -L -I1 $name)
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(7.2.1) NOT expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 $name)
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(7.2.2) NOT expect pattern flag hole, but got $pattern"

	local stripes=$($LFS getstripe -c -I1 $name)
	[ $stripes -eq 2 ] ||
		error "(7.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 $name)
	[ $stripes -eq 2 ] ||
		error "(7.3.2) expect 2 stripes, but got $stripes"

	local e_start=$($LFS getstripe -I1 $name |
			awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 0 ] ||
		error "(7.4.1) expect the COMP1 start at 0, got $e_start"

	local e_end=$($LFS getstripe -I1 $name |
		      awk '/lcme_extent.e_end:/ { print $2 }')
	[ $e_end -eq 2097152 ] ||
		error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 2097152 ] ||
		error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"

	local size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.6) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null || error "(7.7) cannot read $name"

	echo "dummy" >> $name || error "(7.8) cannot write $name"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"

	touch $name || error "(7.10) cannot touch $name"

	rm -f $name || error "(7.11) cannot unlink $name"

	#
	# ${fid1}-R-0 contains the old f1's second stripe in each COMP
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
	echo "Check $name, it contains f1's second OST-object in each COMP"

	$LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L -I1 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2.1) expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c -I1 $name)
	[ $stripes -eq 2 ] ||
		error "(8.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 $name)
	[ $stripes -eq 2 ] ||
		error "(8.3.2) expect 2 stripes, but got $stripes"

	e_start=$($LFS getstripe -I1 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 0 ] ||
		error "(8.4.1) expect the COMP1 start at 0, got $e_start"

	e_end=$($LFS getstripe -I1 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ $e_end -eq 2097152 ] ||
		error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 2097152 ] ||
		error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"

	size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(8.6) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null && error "(8.7) normal read $name should fail"

	# conv=noerror keeps dd going past failed blocks; each failed block
	# yields one "Input/output error" line that is counted here.
	local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			 bs=4096 2>&1 | grep "Input/output error" | wc -l)

	# The first stripe in each COMP was lost
	[ $failures -eq 512 ] ||
		error "(8.8) expect 512 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(8.9) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
		error "(8.10) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
		error "(8.11) write to normal stripe should NOT fail"

	echo "foo" >> $name && error "(8.12) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"

	touch $name || error "(8.14) cannot touch $name"

	rm -f $name || error "(8.15) cannot unlink $name"

	#
	# ${fid2}-R-0 contains the old f2's first stripe in each COMP
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
	echo "Check $name, it contains f2's first stripe in each COMP"

	$LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L -I1 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(9.2.1) expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(9.2.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c -I1 $name)
	[ $stripes -eq 2 ] ||
		error "(9.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 $name)
	[ $stripes -eq 2 ] ||
		error "(9.3.2) expect 2 stripes, but got $stripes"

	e_start=$($LFS getstripe -I1 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 0 ] ||
		error "(9.4.1) expect the COMP1 start at 0, got $e_start"

	e_end=$($LFS getstripe -I1 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ $e_end -eq 2097152 ] ||
		error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 2097152 ] ||
		error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"

	size=$(stat $name | awk '/Size:/ { print $2 }')
	# The second stripe in COMP was lost, so we do not know there
	# have ever been some data before. 'stat' will regard it as
	# no data on the lost stripe.
	# NOTE(review): the check below still expects the full
	# 4096 * bcount size -- confirm the comment above is current.
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(9.6) expect size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null &&
		error "(9.7) normal read $name should fail"

	failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
		   bs=4096 2>&1 | grep "Input/output error" | wc -l)
	[ $failures -eq 512 ] ||
		error "(9.8) expect 512 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	# The second stripe in COMP was lost, so we do not know there
	# have ever been some data before. Since 'dd' skip failure,
	# it will regard the lost stripe contains data.
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(9.9) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=300 && error "(9.10) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
		error "(9.11) write to normal stripe should NOT fail"

	echo "foo" >> $name &&
		error "(9.12) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name ||
		error "(9.13) cannot chown on $name"

	touch $name || error "(9.14) cannot touch $name"

	rm -f $name || error "(9.15) cannot unlink $name"
}
run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# test_21: "lctl lfsck_start" without a -t option must start every LFSCK
# component.
#
# After creating 100 files, LFSCK is started on ${FSNAME}-MDT0000 with
# "-s 1 -r"; both the namespace and the layout component are then expected
# to report "scanning-phase1", and LFSCK is stopped again.
#
# NOTE(review): the listing this block was recovered from carried line-number
# residue and lacked the function header and closing brace; they are restored
# here.  The name assumes run_test maps "21" to test_21 -- confirm against
# test-framework.sh.
test_21() {
	(( $MDS1_VERSION > $(version_code 2.5.59) )) ||
		skip "MDS older than 2.5.59, LU-4887"

	check_mount_and_prep
	createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"

	echo "Start all LFSCK components by default (-s 1)"
	do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
		error "Fail to start LFSCK"

	echo "namespace LFSCK should be in 'scanning-phase1' status"
	local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
	[ "$STATUS" == "scanning-phase1" ] ||
		error "Expect namespace 'scanning-phase1', but got '$STATUS'"

	echo "layout LFSCK should be in 'scanning-phase1' status"
	STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
	[ "$STATUS" == "scanning-phase1" ] ||
		error "Expect layout 'scanning-phase1', but got '$STATUS'"

	echo "Stop all LFSCK components by default"
	do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
		error "Fail to stop LFSCK"
}
run_test 21 "run all LFSCK components by default"
# test_22a: namespace LFSCK must repair a directory whose ".." entry points
# at a parent that no longer exists.
#
# "dummy" is created on MDT0 under foo (on MDT1) while fail_loc 0x161e
# (OBD_FAIL_LFSCK_BAD_PARENT) is set on $SINGLEMDS, then the "guard"
# directory is removed.  After "lfsck_start -t namespace -A -r" one repaired
# unmatched pair is expected and listing the directory must work.
#
# NOTE(review): the listing this block was recovered from carried line-number
# residue and lacked the function header and closing brace; they are restored
# here.  The name assumes run_test maps "22a" to test_22a -- confirm against
# test-framework.sh.
test_22a() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5511"

	# The inner quotes around .. are escaped so they reach the output;
	# unescaped they only closed and reopened the string.
	echo "The parent_A references the child directory via some name entry,"
	echo "but the child directory back references another parent_B via its"
	echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
	echo "LFSCK will repair the child directory's \"..\" name entry."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "The dummy's dotdot name entry references the guard."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "'ls' should success after namespace LFSCK repairing"
	ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
		error "(8) ls should success."
}
run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b: namespace LFSCK must repair a directory whose ".." entry and
# linkEA point at an existing parent that has no matching name entry.
#
# "dummy" is created on MDT0 under foo (on MDT1) while fail_loc 0x161e
# (OBD_FAIL_LFSCK_BAD_PARENT) is set on $SINGLEMDS.  Before the repair
# "lfs fid2path" must not resolve dummy's FID to its real path; after
# "lfsck_start -t namespace -A -r" one repaired unmatched pair is expected
# and fid2path must return the real path.
#
# NOTE(review): the listing this block was recovered from carried line-number
# residue and lacked the function header and closing brace; they are restored
# here.  The name assumes run_test maps "22b" to test_22b -- confirm against
# test-framework.sh.
test_22b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5511"

	# The inner quotes around .. are escaped so they reach the output;
	# unescaped they only closed and reopened the string.
	echo "The parent_A references the child directory via the name entry_B,"
	echo "but the child directory back references another parent_C via its"
	echo "\"..\" name entry. The parent_C exists, but there is no the name"
	echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
	echo "the child directory's \"..\" name entry and its linkEA."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "and bad linkEA. The dummy's dotdot name entry references the"
	echo "guard. The dummy's linkEA references n non-exist name entry."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
	echo "fid2path should NOT work on the dummy's FID $dummyfid"
	local dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
		error "(4) fid2path works unexpectedly."

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
	dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
		error "(8) fid2path does not work"
}
run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a (needs >= 2 MDTs): d0/d1 is removed while fail_loc 0x1620 is set
# on mds2, after which 'ls' of d0/d1 is expected to fail. The first
# namespace LFSCK run must count one dangling entry while 'ls' still fails;
# the second run adds -C and 'ls' must succeed afterwards.
3944 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3945 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3946 skip "MDS older than 2.6.50, LU-5512"
3949 echo "The name entry is there, but the MDT-object for such name "
3950 echo "entry does not exist. The namespace LFSCK should find out "
3951 echo "and repair the inconsistency as required."
3954 check_mount_and_prep
3956 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3957 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3959 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3960 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
3961 do_facet mds2 $LCTL set_param fail_loc=0x1620
3962 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3963 do_facet mds2 $LCTL set_param fail_loc=0
3965 echo "'ls' should fail because of dangling name entry"
3966 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3968 echo "Trigger namespace LFSCK to find out dangling name entry"
3969 $START_NAMESPACE -A -r ||
3970 error "(5) Fail to start LFSCK for namespace"
3972 wait_all_targets_blocked namespace completed 6
# First pass: the dangling entry is counted as repaired, yet the object is
# not re-created (see the echo below), so 'ls' must still fail.
3974 local repaired=$($SHOW_NAMESPACE |
3975 awk '/^dangling_repaired/ { print $2 }')
3976 [ $repaired -eq 1 ] ||
3977 error "(7) Fail to repair dangling name entry: $repaired"
3979 echo "'ls' should fail because not re-create MDT-object by default"
3980 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
3982 echo "Trigger namespace LFSCK again to repair dangling name entry"
# NOTE(review): -C presumably asks LFSCK to re-create the missing
# MDT-object - confirm against the lfsck_start documentation.
3983 $START_NAMESPACE -A -r -C ||
3984 error "(9) Fail to start LFSCK for namespace"
3986 wait_all_targets_blocked namespace completed 10
3988 repaired=$($SHOW_NAMESPACE |
3989 awk '/^dangling_repaired/ { print $2 }')
3990 [ $repaired -eq 1 ] ||
3991 error "(11) Fail to repair dangling name entry: $repaired"
3993 echo "'ls' should success after namespace LFSCK repairing"
3994 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3996 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b: hard-link f0 as "foo" while fail_loc 0x1621 is set with fail_val
# equal to f1's OID, then delete f1; 'ls' of foo is expected to fail.
# After namespace LFSCK (-r -C) completes, the test expects one
# dangling repair, one multiple-linked repair, and foo to contain f0's data
# ("dummy").
3999 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4000 skip "MDS older than 2.6.50, LU-5512"
4003 echo "The objectA has multiple hard links, one of them corresponding"
4004 echo "to the name entry_B. But there is something wrong for the name"
4005 echo "entry_B and cause entry_B to references non-exist object_C."
4006 echo "In the first-stage scanning, the LFSCK will think the entry_B"
4007 echo "as dangling, and re-create the lost object_C. When the LFSCK"
4008 echo "comes to the second-stage scanning, it will find that the"
4009 echo "former re-creating object_C is not proper, and will try to"
4010 echo "replace the object_C with the real object_A."
4013 check_mount_and_prep
4015 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
4016 $LFS path2fid $DIR/$tdir/d0
# Extra files t0..t9; they are unlinked again before f1 is removed (see the
# LU-7429 comment further down).
4018 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
4020 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
4021 $LFS path2fid $DIR/$tdir/d0/f0
4023 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
4024 $LFS path2fid $DIR/$tdir/d0/f1
# First ':'-separated field of each FID, i.e. the sequence part.
4026 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
4027 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
4029 if [ "$SEQ0" != "$SEQ1" ]; then
4030 # To guarantee that the f0 and f1 are in the same FID seq
4031 rm -f $DIR/$tdir/d0/f0 ||
4032 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
4033 echo "dummy" > $DIR/$tdir/d0/f0 ||
4034 error "(3.2) Fail to touch on MDT0"
4035 $LFS path2fid $DIR/$tdir/d0/f0
# Second FID field of f1 (the OID), normalised to decimal by printf %d
# before it is passed as fail_val.
4038 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
4039 OID=$(printf %d $OID)
4041 echo "Inject failure stub on MDT0 to simulate dangling name entry"
4042 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
4043 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
4044 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4045 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4047 # If there is creation after the dangling injection, it may re-use
4048 # the just released local object (inode) that is referenced by the
4049 # dangling name entry. It will fail the dangling injection.
4050 # So before deleting the target object for the dangling name entry,
4051 # remove some other objects to avoid the target object being reused
4052 # by some potential creations. LU-7429
4053 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4055 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
4057 echo "'ls' should fail because of dangling name entry"
4058 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4059 error "(6) ls should fail."
4061 echo "Trigger namespace LFSCK to find out dangling name entry"
4062 $START_NAMESPACE -r -C ||
4063 error "(7) Fail to start LFSCK for namespace"
# Poll the namespace LFSCK status on the MDS until it reads "completed"
# (32 is the limit handed to wait_update_facet).
4065 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4066 mdd.${MDT_DEV}.lfsck_namespace |
4067 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4069 error "(8) unexpected status"
4072 local repaired=$($SHOW_NAMESPACE |
4073 awk '/^dangling_repaired/ { print $2 }')
4074 [ $repaired -eq 1 ] ||
4075 error "(9) Fail to repair dangling name entry: $repaired"
4077 repaired=$($SHOW_NAMESPACE |
4078 awk '/^multiple_linked_repaired/ { print $2 }')
4079 [ $repaired -eq 1 ] ||
4080 error "(10) Fail to drop the former created object: $repaired"
# foo must again read back f0's content.
4082 local data=$(cat $DIR/$tdir/d0/foo)
4083 [ "$data" == "dummy" ] ||
4084 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
4086 run_test 23b "LFSCK can repair dangling name entry (2)"
# NOTE(review): presumably the body of a cleanup helper for test 23c (its
# header is not visible here) - confirm. It clears fail_val/fail_loc on the
# MDS, waits for the namespace LFSCK status to read "completed", and then
# stops full debug logging.
4089 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4090 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4091 mdd.${MDT_DEV}.lfsck_namespace |
4092 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4094 error "(10) unexpected status"
4097 stop_full_debug_logging
# Test 23c: same dangling-entry injection as test 23b, but LFSCK is slowed
# with fail_loc 0x1602 and the test appends data to foo while the scan is
# in progress. The test then expects one dangling repair and foo NOT to
# have been replaced by f0's content ("dummy").
4101 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4102 skip "MDS older than 2.6.50, LU-5512"
4105 echo "The objectA has multiple hard links, one of them corresponding"
4106 echo "to the name entry_B. But there is something wrong for the name"
4107 echo "entry_B and cause entry_B to references non-exist object_C."
4108 echo "In the first-stage scanning, the LFSCK will think the entry_B"
4109 echo "as dangling, and re-create the lost object_C. And then others"
4110 echo "modified the re-created object_C. When the LFSCK comes to the"
4111 echo "second-stage scanning, it will find that the former re-creating"
4112 echo "object_C maybe wrong and try to replace the object_C with the"
4113 echo "real object_A. But because object_C has been modified, so the"
4114 echo "LFSCK cannot replace it."
# Full debug logging is switched off again via stack_trap.
4117 start_full_debug_logging
4118 stack_trap stop_full_debug_logging
4120 check_mount_and_prep
4122 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
4123 parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
4124 echo "parent_fid=$parent_fid"
4126 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
4128 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
4129 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4130 echo "f0_fid=$f0_fid"
4132 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
4133 f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
4134 echo "f1_fid=$f1_fid"
# Compare the sequence parts (everything before the first ':') of the two
# FIDs. The previous check expanded the unset names fid_f0/fid_f1 with a
# regex-looking glob, so both sides were always empty and f0 was never
# re-created.
4136 if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
4137 # To guarantee that the f0 and f1 are in the same FID seq
4138 rm -f $DIR/$tdir/d0/f0 ||
4139 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
4140 echo "dummy" > $DIR/$tdir/d0/f0 ||
4141 error "(3.2) Fail to touch on MDT0"
4142 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4143 echo "f0_fid=$f0_fid (replaced)"
# Second ':'-separated field of f1's FID, passed below as fail_val.
4146 local oid=$(awk -F':' '{ printf $2 }' <<< $f1_fid)
4148 echo "Inject failure stub on MDT0 to simulate dangling name entry"
4149 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
4150 do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
4151 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4152 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4154 # If there is creation after the dangling injection, it may re-use
4155 # the just released local object (inode) that is referenced by the
4156 # dangling name entry. It will fail the dangling injection.
4157 # So before deleting the target object for the dangling name entry,
4158 # remove some other objects to avoid the target object being reused
4159 # by some potential creations. LU-7429
4160 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4162 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
4164 echo "'ls' should fail because of dangling name entry"
4165 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4166 error "(6) ls should fail."
4168 #define OBD_FAIL_LFSCK_DELAY3 0x1602
4169 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
4171 echo "Trigger namespace LFSCK to find out dangling name entry"
4172 $START_NAMESPACE -r -C ||
4173 error "(7) Fail to start LFSCK for namespace"
# Wait until foo is visible with size 0.
4175 wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
4176 # While unexpected by the test, it is valid for LFSCK to repair
4177 # the link to the original object before any data is written.
4178 local size=$(stat -c %s $DIR/$tdir/d0/foo)
4180 if [ "$size" = "6" -a "$(<$DIR/$tdir/d0/foo)" = "dummy" ]; then
4181 log "LFSCK repaired file prematurely"
4186 stat $DIR/$tdir/d0/foo
4188 error "(8) unexpected size"
# Append to foo; the final check expects its content to differ from "dummy".
4191 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
4192 cancel_lru_locks osc
4196 local repaired=$($SHOW_NAMESPACE |
4197 awk '/^dangling_repaired/ { print $2 }')
4198 [ $repaired -eq 1 ] ||
4199 error "(11) Fail to repair dangling name entry: $repaired"
4201 local data=$(cat $DIR/$tdir/d0/foo)
4202 [ "$data" != "dummy" ] ||
4203 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
4205 run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 23d (needs >= 2 MDTs, ldiskfs MDT): a file created under mdt0dir is
# moved to mdt1dir, then its entry under /REMOTE_PARENT_DIR is removed from
# the mds1 device with debugfs so that reading the file fails. Namespace
# LFSCK on MDT0001 (-C) must make the file readable again, and a following
# layout LFSCK must leave the file content unchanged.
4208 (( MDSCOUNT >= 2 )) || skip "needs >= 2 MDTs"
4209 [[ $mds1_FSTYPE == ldiskfs ]] ||
4210 skip "ldiskfs-only test due to a low-level mds fs access"
4213 $LFS mkdir -i 0 $DIR/$tdir/mdt0dir
4214 $LFS mkdir -i 1 $DIR/$tdir/mdt1dir
4216 echo "b-a-r" > $DIR/$tdir/mdt0dir/foo
# Strip the first and last character from the path2fid output.
4217 local foofid=$($LFS path2fid $DIR/$tdir/mdt0dir/foo | sed -E 's/^.(.*).$/\1/')
4219 mv $DIR/$tdir/mdt0dir/foo $DIR/$tdir/mdt1dir/
4223 local devname=$(mdsdevname 1)
4224 local cmd="$DEBUGFS -w -R \\\"rm /REMOTE_PARENT_DIR/${foofid}\\\" $devname"
4225 do_facet mds1 "$cmd"
4227 start mds1 $devname $MDS_MOUNT_OPTS || error "start mds1 failed"
4229 cat $DIR/$tdir/mdt1dir/foo && error "file read should fail"
4231 do_facet mds2 $LCTL lfsck_start -M ${FSNAME}-MDT0001 -t namespace -C ||
4232 error "lfsck namespace failed to start"
4233 wait_update_facet mds2 "$LCTL get_param -n \
4234 mdd.${FSNAME}-MDT0001.lfsck_namespace |
4235 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
4236 error " unexpected lfsck status"
4238 cat $DIR/$tdir/mdt1dir/foo || error "file read should succeed"
# This starts the layout LFSCK, so report it as such on failure.
4240 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -o ||
4241 error "lfsck layout failed to start"
4243 # lfsck -t layout -o broadcasts all MDTs to perform lfsck layout,
# so the number of completed MDTs reported by lfsck_query must equal
# MDSCOUNT.
4245 local count=$(do_facet mds1 $LCTL lfsck_query -t layout -w |
4246 awk '/layout_mdts_completed:/ { print $2 }')
# Written as "== ... || error" so that an empty $count (an arithmetic
# syntax error, hence non-zero status) is reported instead of skipped.
4247 (( count == MDSCOUNT )) ||
4248 error "Only $count/$MDSCOUNT lfsck completed"
4250 cmp $DIR/$tdir/mdt1dir/foo <(echo "b-a-r") || error "file body has changed"
4252 run_test 23d "LFSCK can repair a dangling name entry to a remote object"
# Test 24 (needs >= 2 MDTs, not run with FILESET): d0/dummy/foo is created
# and removed while fail_loc 0x1622 (OBD_FAIL_LFSCK_MUL_REF) is set.
# Namespace LFSCK must report one multiple-referenced repair and an entry
# named "<cfid>-<pfid>-D-0" must exist under .lustre/lost+found/MDT0000/.
4255 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4256 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4257 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4258 skip "MDS older than 2.6.50, LU-5513"
4261 echo "Two MDT-objects back reference the same name entry via their"
4262 echo "each own linkEA entry, but the name entry only references one"
4263 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
4264 echo "for the MDT-object that is not recognized. If such MDT-object"
4265 echo "has no other linkEA entry after the removing, then the LFSCK"
4266 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
4269 check_mount_and_prep
4271 mkdir_on_mdt -i1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4273 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
4274 $LFS path2fid $DIR/$tdir/d0/guard
4276 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
4277 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid becomes the middle component of the expected orphan name ($cname
# below); which directory's FID is used depends on the MDT backend type.
4280 if [ $mds1_FSTYPE != ldiskfs ]; then
4281 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
4283 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
4286 touch $DIR/$tdir/d0/guard/foo ||
4287 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
4289 echo "Inject failure stub on MDT0 to simulate the case that"
4290 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
4291 echo "that references $DIR/$tdir/d0/guard/foo."
4292 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
4293 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
4294 echo "there with the same linkEA entry as another MDT-object"
4295 echo "$DIR/$tdir/d0/guard/foo has"
4297 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
4298 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
4299 mkdir_on_mdt -i0 $DIR/$tdir/d0/dummy/foo ||
4300 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
4301 $LFS path2fid $DIR/$tdir/d0/dummy/foo
# Remember the FID before the name entry is removed; it is the first
# component of the expected orphan name.
4302 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
4303 rmdir $DIR/$tdir/d0/dummy/foo ||
4304 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
4305 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4307 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4308 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4309 error "(6) stat successfully unexpectedly"
4311 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4312 $START_NAMESPACE -A -r ||
4313 error "(7) Fail to start LFSCK for namespace"
4315 wait_all_targets_blocked namespace completed 8
4317 local repaired=$($SHOW_NAMESPACE |
4318 awk '/^multiple_referenced_repaired/ { print $2 }')
4319 [ $repaired -eq 1 ] ||
4320 error "(9) Fail to repair multiple referenced name entry: $repaired"
4322 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4323 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4324 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4326 local cname="$cfid-$pfid-D-0"
4327 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4328 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4330 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25 (ldiskfs MDT only): d0/foo is created while fail_loc 0x1623
# (OBD_FAIL_LFSCK_BAD_TYPE) is set. Namespace LFSCK must complete, report
# one bad-file-type repair, and 'ls' of d0 must work afterwards.
4333 [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs fixes dirent type"
4334 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4335 skip "MDS older than 2.6.50, LU-5515"
4338 echo "The file type in the name entry does not match the file type"
4339 echo "claimed by the referenced object. Then the LFSCK will update"
4340 echo "the file type in the name entry."
4343 check_mount_and_prep
4345 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4347 echo "Inject failure stub on MDT0 to simulate the case that"
4348 echo "the file type stored in the name entry is wrong."
4350 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
4351 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4352 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4353 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4355 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4356 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Poll the namespace LFSCK status on the MDS until it reads "completed".
4358 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4359 mdd.${MDT_DEV}.lfsck_namespace |
4360 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4362 error "(4) unexpected status"
4365 local repaired=$($SHOW_NAMESPACE |
4366 awk '/^bad_file_type_repaired/ { print $2 }')
4367 [ $repaired -eq 1 ] ||
4368 error "(5) Fail to repair bad file type in name entry: $repaired"
4370 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4372 run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a: d0/foo (which also has the hard link d0/dummy) is unlinked
# while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set, after which
# 'ls' of foo is expected to fail. Namespace LFSCK must report one
# lost-dirent repair and foo must come back with its original FID.
4375 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4376 skip "MDS older than 2.6.50, LU-5516"
4379 echo "The local name entry back referenced by the MDT-object is lost."
4380 echo "The namespace LFSCK will add the missing local name entry back"
4381 echo "to the normal namespace."
4384 check_mount_and_prep
4386 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4387 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# Remember foo's FID so it can be compared after the repair.
4388 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4390 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4391 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4393 echo "Inject failure stub on MDT0 to simulate the case that"
4394 echo "foo's name entry will be removed, but the foo's object"
4395 echo "and its linkEA are kept in the system."
4397 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4398 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4399 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4400 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4402 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4403 error "(5) 'ls' should fail"
# This test covers the local name entry (see the description above), so
# the log line says "local" rather than "remote".
4405 echo "Trigger namespace LFSCK to repair the missing local name entry"
4406 $START_NAMESPACE -r -A ||
4407 error "(6) Fail to start LFSCK for namespace"
4409 wait_all_targets_blocked namespace completed 7
4411 local repaired=$($SHOW_NAMESPACE |
4412 awk '/^lost_dirent_repaired/ { print $2 }')
4413 [ $repaired -eq 1 ] ||
4414 error "(8) Fail to repair lost dirent: $repaired"
4416 ls -ail $DIR/$tdir/d0/foo ||
4417 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
4419 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4420 [ "$foofid" == "$foofid2" ] ||
4421 error "(10) foo's FID changed: $foofid, $foofid2"
4423 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b (needs >= 2 MDTs): directory d0 is created with -i 1 and d0/foo
# with -i 0; foo is removed while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set, after which 'ls' of foo is expected
# to fail. Namespace LFSCK must report one lost-dirent repair and foo must
# come back with its original FID.
4426 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4427 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4428 skip "MDS older than 2.6.50, LU-5516"
4431 echo "The remote name entry back referenced by the MDT-object is lost."
4432 echo "The namespace LFSCK will add the missing remote name entry back"
4433 echo "to the normal namespace."
4436 check_mount_and_prep
4438 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4439 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Remember foo's FID so it can be compared after the repair.
4440 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4442 echo "Inject failure stub on MDT0 to simulate the case that"
4443 echo "foo's name entry will be removed, but the foo's object"
4444 echo "and its linkEA are kept in the system."
4446 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4447 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4448 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4449 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4451 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4452 error "(4) 'ls' should fail"
4454 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4455 $START_NAMESPACE -r -A ||
4456 error "(5) Fail to start LFSCK for namespace"
4458 wait_all_targets_blocked namespace completed 6
4460 local repaired=$($SHOW_NAMESPACE |
4461 awk '/^lost_dirent_repaired/ { print $2 }')
4462 [ $repaired -eq 1 ] ||
4463 error "(7) Fail to repair lost dirent: $repaired"
4465 ls -ail $DIR/$tdir/d0/foo ||
4466 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
4468 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4469 [ "$foofid" == "$foofid2" ] ||
4470 error "(9) foo's FID changed: $foofid, $foofid2"
4472 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a (not run with FILESET): foo and its hard link dummy are unlinked
# while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set, then the
# parent d0 is removed. Namespace LFSCK must report one lost-dirent repair
# and leave an entry matching *-P-* under .lustre/lost+found/MDT0000/.
4475 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4476 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4477 skip "MDS older than 2.6.50, LU-5516"
4480 echo "The local parent referenced by the MDT-object linkEA is lost."
4481 echo "The namespace LFSCK will re-create the lost parent as orphan."
4484 check_mount_and_prep
4486 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4487 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4488 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4489 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4491 echo "Inject failure stub on MDT0 to simulate the case that"
4492 echo "foo's name entry will be removed, but the foo's object"
4493 echo "and its linkEA are kept in the system. And then remove"
4494 echo "another hard link and the parent directory."
4496 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4497 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4498 rm -f $DIR/$tdir/d0/foo ||
4499 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4500 rm -f $DIR/$tdir/d0/dummy ||
4501 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4502 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4504 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
4505 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
4507 echo "Trigger namespace LFSCK to repair the lost parent"
4508 $START_NAMESPACE -r -A ||
4509 error "(6) Fail to start LFSCK for namespace"
4511 wait_all_targets_blocked namespace completed 7
4513 local repaired=$($SHOW_NAMESPACE |
4514 awk '/^lost_dirent_repaired/ { print $2 }')
4515 [ $repaired -eq 1 ] ||
4516 error "(8) Fail to repair lost dirent: $repaired"
4518 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4519 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4520 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4522 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell could expand it against files in the current directory first.
4524 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
4525 [ ! -z "$cname" ] ||
4526 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4528 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b (needs >= 2 MDTs, not run with FILESET): d0 is created with
# -i 1 and d0/foo with -i 0; foo is removed while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set, then d0 itself is removed.
# Namespace LFSCK must report one lost-dirent repair and leave an entry
# matching *-P-* under .lustre/lost+found/MDT0001/.
4531 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4532 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4533 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4534 skip "MDS older than 2.6.50, LU-5516"
4537 echo "The remote parent referenced by the MDT-object linkEA is lost."
4538 echo "The namespace LFSCK will re-create the lost parent as orphan."
4541 check_mount_and_prep
4543 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4544 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
4546 $LFS path2fid $DIR/$tdir/d0
4548 echo "Inject failure stub on MDT0 to simulate the case that"
4549 echo "foo's name entry will be removed, but the foo's object"
4550 echo "and its linkEA are kept in the system. And then remove"
4551 echo "the parent directory."
4553 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4554 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4555 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4556 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4558 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4559 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
4561 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4562 $START_NAMESPACE -r -A ||
4563 error "(6) Fail to start LFSCK for namespace"
4565 wait_all_targets_blocked namespace completed 7
4567 local repaired=$($SHOW_NAMESPACE |
4568 awk '/^lost_dirent_repaired/ { print $2 }')
4569 [ $repaired -eq 1 ] ||
4570 error "(8) Fail to repair lost dirent: $repaired"
4572 ls -ail $MOUNT/.lustre/lost+found/
4574 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4575 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4576 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4578 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell could expand it against files in the current directory first.
4580 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
4581 [ ! -z "$cname" ] ||
4582 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4584 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28 (needs >= 2 MDTs): d1/a1 and d2/a2 are removed while fail_loc
# 0x1624 is set on both MDSes. A first namespace LFSCK run is started with
# fail_loc 0x161c set on mds2 and is expected to end "partial" on mds1
# (0 lost dirents repaired) and "completed" on mds2 (1 repaired). A second
# run without the fail_loc expects the opposite counts (mds1: 1, mds2: 0).
4587 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4588 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4589 skip "MDS older than 2.6.50, LU-5506"
4592 echo "The target name entry is lost. The LFSCK should insert the"
4593 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4594 echo "the MDT (on which the orphan MDT-object resides) has ever"
4595 echo "failed to respond some name entry verification during the"
4596 echo "first stage-scanning, then the LFSCK should skip to handle"
4597 echo "orphan MDT-object on this MDT. But other MDTs should not"
4601 check_mount_and_prep
# d1 is created with -i 0 and its children with -i 1; d2 is the mirror
# image (-i 1 with -i 0 children).
4602 $LFS mkdir -i 0 $DIR/$tdir/d1
4603 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4604 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
4606 $LFS mkdir -i 1 $DIR/$tdir/d2
4607 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4608 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4610 echo "Inject failure stub on MDT0 to simulate the case that"
4611 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4612 echo "and its linkEA are kept in the system. And the case that"
4613 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4614 echo "and its linkEA are kept in the system."
4616 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4617 do_facet mds1 $LCTL set_param fail_loc=0x1624
4618 do_facet mds2 $LCTL set_param fail_loc=0x1624
4619 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4620 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4621 do_facet mds1 $LCTL set_param fail_loc=0
4622 do_facet mds2 $LCTL set_param fail_loc=0
4624 cancel_lru_locks mdc
4625 cancel_lru_locks osc
4627 echo "Inject failure, to simulate the MDT0 fail to handle"
4628 echo "MDT1 LFSCK request during the first-stage scanning."
4629 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
4630 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4632 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4633 $START_NAMESPACE -r -A ||
4634 error "(3) Fail to start LFSCK for namespace"
# With the fail_loc active, mds1 is expected to finish as "partial" and
# mds2 as "completed".
4636 wait_update_facet mds1 "$LCTL get_param -n \
4637 mdd.$(facet_svc mds1).lfsck_namespace |
4638 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4639 error "(4) mds1 is not the expected 'partial'"
4642 wait_update_facet mds2 "$LCTL get_param -n \
4643 mdd.$(facet_svc mds2).lfsck_namespace |
4644 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4645 error "(5) mds2 is not the expected 'completed'"
4648 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
4650 local repaired=$(do_facet mds1 $LCTL get_param -n \
4651 mdd.$(facet_svc mds1).lfsck_namespace |
4652 awk '/^lost_dirent_repaired/ { print $2 }')
4653 [ $repaired -eq 0 ] ||
4654 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4656 repaired=$(do_facet mds2 $LCTL get_param -n \
4657 mdd.$(facet_svc mds2).lfsck_namespace |
4658 awk '/^lost_dirent_repaired/ { print $2 }')
4659 [ $repaired -eq 1 ] ||
4660 error "(7) Expect 1 fixed on mds2, but got: $repaired"
4662 echo "Trigger namespace LFSCK on all devices again to cleanup"
4663 $START_NAMESPACE -r -A ||
4664 error "(8) Fail to start LFSCK for namespace"
4666 wait_all_targets_blocked namespace completed 9
# Second pass: the remaining lost entry is expected to be repaired on
# mds1, with nothing left to repair on mds2.
4668 local repaired=$(do_facet mds1 $LCTL get_param -n \
4669 mdd.$(facet_svc mds1).lfsck_namespace |
4670 awk '/^lost_dirent_repaired/ { print $2 }')
4671 [ $repaired -eq 1 ] ||
4672 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4674 repaired=$(do_facet mds2 $LCTL get_param -n \
4675 mdd.$(facet_svc mds2).lfsck_namespace |
4676 awk '/^lost_dirent_repaired/ { print $2 }')
4677 [ $repaired -eq 0 ] ||
4678 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4680 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a (disabled: its run_test line at the end is commented out):
# a hard link to d0/foo is created while fail_loc 0x1625
# (OBD_FAIL_LFSCK_MORE_NLINK) is set and stat is expected to report
# 3 links. Namespace LFSCK must report one nlink repair and stat must then
# report 2.
4683 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4684 skip "MDS older than 2.6.50, LU-5517"
4687 echo "The object's nlink attribute is larger than the object's known"
4688 echo "name entries count. The LFSCK will repair the object's nlink"
4689 echo "attribute to match the known name entries count"
4692 check_mount_and_prep
4694 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4695 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4697 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4698 echo "nlink attribute is larger than its name entries count."
4700 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
4701 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4702 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4703 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4704 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): the mdc locks are presumably cancelled so that stat
# fetches fresh attributes from the MDS - confirm.
4706 cancel_lru_locks mdc
4707 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4708 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4710 echo "Trigger namespace LFSCK to repair the nlink count"
4711 $START_NAMESPACE -r -A ||
4712 error "(5) Fail to start LFSCK for namespace"
4714 wait_all_targets_blocked namespace completed 6
4716 local repaired=$($SHOW_NAMESPACE |
4717 awk '/^nlinks_repaired/ { print $2 }')
4718 [ $repaired -eq 1 ] ||
4719 error "(7) Fail to repair nlink count: $repaired"
4721 cancel_lru_locks mdc
4722 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4723 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4725 # Disable 29a, we only allow nlink to be updated if the known linkEA
4726 # entries is larger than nlink count.
4728 #run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b: a hard link to d0/foo is created while fail_loc 0x1626
# (OBD_FAIL_LFSCK_LESS_NLINK) is set and stat is expected to report 1 link.
# Namespace LFSCK must report one nlink repair and stat must then report 2.
4731 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4732 skip "MDS older than 2.6.50, LU-5517"
4735 echo "The object's nlink attribute is smaller than the object's known"
4736 echo "name entries count. The LFSCK will repair the object's nlink"
4737 echo "attribute to match the known name entries count"
4740 check_mount_and_prep
4742 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4743 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4745 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4746 echo "nlink attribute is smaller than its name entries count."
4748 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
4749 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4750 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4751 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4752 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): the mdc locks are presumably cancelled so that stat
# fetches fresh attributes from the MDS - confirm.
4754 cancel_lru_locks mdc
4755 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4756 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4758 echo "Trigger namespace LFSCK to repair the nlink count"
4759 $START_NAMESPACE -r -A ||
4760 error "(5) Fail to start LFSCK for namespace"
4762 wait_all_targets_blocked namespace completed 6
4764 local repaired=$($SHOW_NAMESPACE |
4765 awk '/^nlinks_repaired/ { print $2 }')
4766 [ $repaired -eq 1 ] ||
4767 error "(7) Fail to repair nlink count: $repaired"
4769 cancel_lru_locks mdc
4770 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4771 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4773 run_test 29b "LFSCK can repair bad nlink count (2)"
# test_29c: verify the linkEA size limitation handling.
#
# 150 hard links to guard/f0 are created under foo/ (foo lives on MDT index
# MDSCOUNT - 1), 100 of them are removed again, and the namespace LFSCK is
# then expected to report linkea_overflow_cleared == 1 and
# nlinks_repaired == 0. When MDSCOUNT >= 2 the test additionally checks that
# migrating guard/ fails (FID of f0 unchanged) before the LFSCK run and
# succeeds (FID changed) after it.
#
# NOTE(review): the function header, closing brace and the "fi" lines are
# absent from this listing; the name follows the "run_test 29c" registration
# below. The "sleep 3" statement is restored from the comment that announces
# it - confirm against the full file.
test_29c() {
	local oldfid
	local newfid
	local repaired

	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5517"

	echo "The namespace LFSCK will create many hard links to the target"
	echo "file as to exceed the linkEA size limitation. Under such case"
	echo "the linkEA will be marked as overflow that will prevent the"
	echo "target file to be migrated. Then remove some hard links to"
	echo "make the left hard links to be held within the linkEA size"
	echo "limitation. But before the namespace LFSCK adding all the"
	echo "missed linkEA entries back, the overflow mark (timestamp)"
	echo "will not be cleared."

	check_mount_and_prep

	mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
	$LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
		error "(0.2) Fail to mkdir"
	touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
	oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)

	# define MAX_LINKEA_SIZE	4096
	# sizeof(link_ea_header) = 24
	# sizeof(link_ea_entry) = 18
	# nlink_min = (MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
	#	      (sizeof(link_ea_entry) + name_length)
	# If the average name length is 12 bytes, then 150 hard links
	# is totally enough to overflow the linkEA
	echo "Create 150 hard links should succeed although the linkEA overflow"
	createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
		error "(2) Fail to hard link"

	cancel_lru_locks mdc
	if [ $MDSCOUNT -ge 2 ]; then
		$LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
			error "(3.1) Migrate should fail"

		echo "The object with linkEA overflow should NOT be migrated"
		newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
		[ "$newfid" == "$oldfid" ] ||
			error "(3.2) Migrate should fail: $newfid != $oldfid"
	fi

	# Remove 100 hard links, then the linkEA should have space
	# to hold the missed linkEA entries.
	echo "Remove 100 hard links to save space for the missed linkEA entries"
	unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
			error "(5.1) Migrate should fail"

		# The overflow timestamp is still there, so migration will fail.
		newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
		[ "$newfid" == "$oldfid" ] ||
			error "(5.2) Migrate should fail: $newfid != $oldfid"
	fi

	# sleep 3 seconds to guarantee that the overflow is recognized
	sleep 3

	echo "Trigger namespace LFSCK to clear the overflow timestamp"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	repaired=$($SHOW_NAMESPACE |
		   awk '/^linkea_overflow_cleared/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to clear linkea overflow: $repaired"

	repaired=$($SHOW_NAMESPACE |
		   awk '/^nlinks_repaired/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(9) Unexpected nlink repaired: $repaired"

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
			error "(10.1) Migrate failure"

		# Migration should succeed after clear the overflow timestamp.
		newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
		[ "$newfid" != "$oldfid" ] ||
			error "(10.2) Migrate should succeed"

		ls -l $DIR/$tdir/foo > /dev/null ||
			error "(11) 'ls' failed after migration"
	fi

	rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
	rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
}
4871 run_test 29c "verify linkEA size limitation"
# test_30: the namespace LFSCK must recover orphans from the backend
# /lost+found directory.
#
# Directory d0 is created under fail_loc 0x161d, and then d0, its children
# d1/f1 and the sibling f0 are removed under fail_loc 0x1624. After e2fsck
# and a namespace LFSCK run the test expects local_lost_found_moved >= 4,
# foo/f0 to be visible again, and d0 (with d1 and f1 inside) to exist as
# .lustre/lost+found/MDT0000/<cfid>-<pfid>-D-0.
#
# Only runs on ldiskfs MDTs, without FILESET, on MDS newer than 2.6.50.
#
# NOTE(review): the function header and closing brace are absent from this
# listing; the name follows the "run_test 30" registration below.
test_30() {
	local pfid
	local cfid
	local dev
	local repaired
	local cname

	[[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs has lost+found"
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5518"

	echo "The namespace LFSCK will move the orphans from backend"
	echo "/lost+found directory to normal client visible namespace"
	echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
	# The file created here is f0; the message used to say f1.
	touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "directory d0 has no linkEA entry, then the LFSCK will"
	echo "move it into .lustre/lost+found/MDTxxxx/ later."

	#define OBD_FAIL_LFSCK_NO_LINKEA	0x161d
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
	mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Remember both FIDs now: they form the orphan's name checked below.
	pfid=$($LFS path2fid $DIR/$tdir/foo)
	cfid=$($LFS path2fid $DIR/$tdir/foo/d0)

	touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
	mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "object's name entry will be removed, but not destroy the"
	echo "object. Then backend e2fsck will handle it as orphan and"
	echo "add them into the backend /lost+found directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
	rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
	rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
	rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	umount_client $MOUNT || error "(10) Fail to stop client!"

	stop $SINGLEMDS || error "(11) Fail to stop $SINGLEMDS"

	dev=$(facet_device $SINGLEMDS)

	echo "run e2fsck on $SINGLEMDS"
	run_e2fsck $(facet_active_host $SINGLEMDS) $dev "-y" ||
		error "(12) Fail to run e2fsck"

	start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 13

	echo "Trigger namespace LFSCK to recover backend orphans"
	$START_NAMESPACE -r -A ||
		error "(14) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 15

	repaired=$($SHOW_NAMESPACE |
		   awk '/^local_lost_found_moved/ { print $2 }')
	[ $repaired -ge 4 ] ||
		error "(16) Fail to recover backend orphans: $repaired"

	mount_client $MOUNT || error "(17) Fail to start client!"

	stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
	# cname is a literal path and therefore never empty, so a non-empty
	# test could not fail; check that the orphan directory really exists.
	[ -d "$cname" ] || error "(20) d0 is not recovered"

	stat ${cname}/d1 || error "(21) d1 is not recovered"
	stat ${cname}/f1 || error "(22) f1 is not recovered"
}
4959 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test_31a: the namespace LFSCK must find and repair name entries whose name
# hash does not match the shard they were inserted into (first shard case).
#
# $MDSCOUNT sub-directories are created under a striped directory while the
# client has fail_loc 0x1628 / fail_val 0 set. The LFSCK run must report
# name_hash_repaired >= 1, and after a client remount every d$i must be
# stat-able and removable.
#
# NOTE(review): the function header, closing brace and "done" are absent
# from this listing; the name follows the "run_test 31a" registration below.
test_31a() {
	local i
	local repaired
	local rc

	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5519"

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-first"
	echo "shard, but inserted into the first shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=0
	createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 4

	repaired=$($SHOW_NAMESPACE |
		   awk '/^name_hash_repaired/ { print $2 }')
	[ $repaired -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	rc=$($LFS find -H badtype $DIR/$tdir/striped_dir | wc -l)
	# NOTE(review): no guard for this error() is visible in the listing,
	# which would make it fire unconditionally. "at least one match" is
	# assumed from the message text and mirrors the -ge 1 check above -
	# confirm the exact expected count against the full file.
	[ $rc -ge 1 ] ||
		error "Fail to find flag bad type: $rc"

	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	for ((i = 0; i < $MDSCOUNT; i++)); do
		stat $DIR/$tdir/striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $DIR/$tdir/striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $DIR/$tdir/striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
5015 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test_31b: the namespace LFSCK must find and repair name entries whose name
# hash does not match the shard they were inserted into (second shard case).
#
# MDSCOUNT * 5 sub-directories are created under a striped directory while
# the client has fail_loc 0x1628 / fail_val 1 set. The name_hash_repaired
# counter is read from mds2 and must be >= 1; after a client remount every
# d$i must be stat-able and removable.
#
# NOTE(review): the function header, closing brace and "done" are absent
# from this listing; the name follows the "run_test 31b" registration below.
test_31b() {
	local i
	local ndirs
	local repaired

	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5519"

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-second"
	echo "shard, but inserted into the secod shard by wrong"

	# Number of entries created and later verified; computed once so the
	# create and the verify loop cannot drift apart.
	ndirs=$((MDSCOUNT * 5))

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=1
	createmany -d $DIR/$tdir/striped_dir/d $ndirs ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 4

	# The counter is read from mds2 rather than from $SINGLEMDS.
	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^name_hash_repaired/ { print $2 }')
	echo "repaired $repaired name entries with bad hash"
	[ $repaired -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	for ((i = 0; i < ndirs; i++)); do
		stat $DIR/$tdir/striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $DIR/$tdir/striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $DIR/$tdir/striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
5069 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test_31c: the namespace LFSCK must re-generate a lost master LMV EA when
# nothing was created under the striped directory after the loss.
#
# The striped directory is created while fail_loc 0x1629 is set on
# $SINGLEMDS. The LFSCK run must report striped_dirs_repaired == 1,
# "lfs find -H lostlmv" must list exactly one entry, and after a client
# remount the directory must be empty and removable.
#
# NOTE(review): the function header and closing brace are absent from this
# listing; the name follows the "run_test 31c" registration below.
test_31c() {
	local sdir
	local nfixed
	local nflagged
	local listing

	if [ $MDSCOUNT -lt 2 ]; then
		skip "needs >= 2 MDTs"
	fi
	if ! (( $MDS1_VERSION > $(version_code 2.6.50) )); then
		skip "MDS older than 2.6.50, LU-5519"
	fi

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If nobody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should re-generate the master LMV EA."

	check_mount_and_prep
	sdir=$DIR/$tdir/striped_dir

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	if ! $LFS setdirstripe -i 0 -c $MDSCOUNT $sdir; then
		error "(1) Fail to create striped directory"
	fi
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to re-generate master LMV EA"
	if ! $START_NAMESPACE -r -A; then
		error "(2) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 3

	nfixed=$($SHOW_NAMESPACE | awk '/^striped_dirs_repaired/ { print $2 }')
	[ $nfixed -eq 1 ] ||
		error "(4) Fail to re-generate master LMV EA: $nfixed"

	nflagged=$($LFS find -H lostlmv $sdir | wc -l)
	[ $nflagged -eq 1 ] || error "Fail to find flag lost LMV: $nflagged"

	if ! umount_client $MOUNT; then
		error "(5) umount failed"
	fi
	if ! mount_client $MOUNT; then
		error "(6) mount failed"
	fi

	# Any entry listed here means the directory is not seen as empty.
	listing=$(ls $sdir/)
	if [ -n "$listing" ]; then
		error "(7) The master LMV EA is not repaired: $listing"
	fi

	rmdir $sdir ||
		error "(8) Fail to remove the striped directory after LFSCK"
}
5117 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Body of test 31d (registered by the "run_test 31d" line that follows).
#
# Scenario: a striped directory is created while fail_loc 0x1629 is set on
# $SINGLEMDS, then a file ("dummy") is created in it. The first namespace
# LFSCK run is expected to report striped_dirs_repaired == 0, "dummy" must
# stay stat-able, and creating a new file must fail until "chattr -i" has
# been applied. After "dummy" is removed a second LFSCK run is started and
# the directory must finally be removable.
#
# NOTE(review): the function header, the closing brace and some short lines
# are not visible in this listing, so the statements are kept untouched.
# Requires at least two MDTs and an MDS newer than 2.6.50.
5120 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5121 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5122 skip "MDS older than 2.6.50, LU-5519"
5125 echo "For some reason, the master MDT-object of the striped directory"
5126 echo "may lost its master LMV EA. If somebody created files under the"
5127 echo "master directly after the master LMV EA lost, then the LFSCK"
5128 echo "should NOT re-generate the master LMV EA, instead, it should"
5129 echo "change the broken striped dirctory as read-only to prevent"
5130 echo "further damage"
5133 check_mount_and_prep
5135 echo "Inject failure stub on MDT0 to simulate the case that the"
5136 echo "master MDT-object of the striped directory lost the LMV EA."
# Create the striped directory with the fault armed, then disarm it.
5138 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
5139 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
5140 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5141 error "(1) Fail to create striped directory"
5142 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# NOTE(review): mds1 is started here although no statement stopping it is
# visible between the umount and this start - confirm the restart sequence
# against the full file.
5144 umount_client $MOUNT || error "(2) umount failed"
5146 start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS
5147 mount_client $MOUNT || error "(3) mount failed"
# Modify the broken directory before the first LFSCK run.
5149 touch $DIR/$tdir/striped_dir/dummy ||
5150 error "(4) Fail to touch under broken striped directory"
5152 echo "Trigger namespace LFSCK to find out the inconsistency"
5153 $START_NAMESPACE -r -A ||
5154 error "(5) Fail to start LFSCK for namespace"
5156 wait_all_targets_blocked namespace completed 6
# The first run must not have re-generated the master LMV EA.
5158 local repaired=$($SHOW_NAMESPACE |
5159 awk '/^striped_dirs_repaired/ { print $2 }')
5160 [ $repaired -eq 0 ] ||
5161 error "(7) Re-generate master LMV EA unexpected: $repaired"
5163 stat $DIR/$tdir/striped_dir/dummy ||
5164 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# Creating a new file is expected to FAIL here; success is the error case.
5166 touch $DIR/$tdir/striped_dir/foo &&
5167 error "(9) The broken striped directory should be read-only"
# Clear the immutable attribute so that the stray file can be removed.
5169 chattr -i $DIR/$tdir/striped_dir ||
5170 error "(10) Fail to chattr on the broken striped directory"
5172 rm -f $DIR/$tdir/striped_dir/dummy || error "(11) Fail to remove dummy"
5174 # LFSCK again to regenerate master LMV
5175 echo "Trigger namespace LFSCK to find out the inconsistency"
5176 $START_NAMESPACE -r -A ||
5177 error "(12) Fail to start LFSCK for namespace"
5179 wait_all_targets_blocked namespace completed 6
# NOTE(review): as above, no stop of mds1 is visible before this start.
5181 # reload striped_dir to parse newly generated LMV
5183 start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS
5185 rmdir $DIR/$tdir/striped_dir ||
5186 error "(13) Fail to remove the striped directory after LFSCK"
5188 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test_31e: the namespace LFSCK must re-generate a lost slave LMV EA (1).
#
# The striped directory is created while fail_loc 0x162a / fail_val 0 is set
# on $SINGLEMDS. The LFSCK run must report striped_shards_repaired == 1 and
# the directory must be removable afterwards.
#
# NOTE(review): the function header and closing brace are absent from this
# listing; the name follows the "run_test 31e" registration below.
test_31e() {
	local sdir
	local nfixed

	if [ $MDSCOUNT -lt 2 ]; then
		skip "needs >= 2 MDTs"
	fi
	if ! (( $MDS1_VERSION > $(version_code 2.6.50) )); then
		skip "MDS older than 2.6.50, LU-5519"
	fi

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep
	sdir=$DIR/$tdir/striped_dir

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on the same MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
	if ! $LFS setdirstripe -i 0 -c $MDSCOUNT $sdir; then
		error "(1) Fail to create striped directory"
	fi
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	if ! $START_NAMESPACE -r -A; then
		error "(2) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 3

	nfixed=$($SHOW_NAMESPACE |
		 awk '/^striped_shards_repaired/ { print $2 }')
	[ $nfixed -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $nfixed"

	rmdir $sdir ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
5227 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test_31f: the namespace LFSCK must re-generate a lost slave LMV EA (2).
#
# Same flow as the fail_val=0 variant, but fail_loc 0x162a is armed with
# fail_val=1 and the striped_shards_repaired counter is read from mds2; it
# must equal 1, and the directory must be removable afterwards.
#
# NOTE(review): the function header and closing brace are absent from this
# listing; the name follows the "run_test 31f" registration below.
test_31f() {
	local repaired

	# No "&& return" after skip: the version check right below already
	# relies on skip ending the test, as the sibling tests do.
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5519"

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on different MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	# The counter is read from mds2 rather than from $SINGLEMDS.
	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^striped_shards_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir $DIR/$tdir/striped_dir ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
5267 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test_31g: the namespace LFSCK must repair a corrupted slave LMV EA.
#
# The striped directory is created while fail_loc 0x162b / fail_val 0 is set
# on $SINGLEMDS. The LFSCK run must report striped_shards_repaired == 1;
# after a client remount a file must be creatable and removable inside the
# directory, and the directory itself must be removable.
#
# NOTE(review): the function header and closing brace are absent from this
# listing; the name follows the "run_test 31g" registration below.
test_31g() {
	local repaired

	# No "&& return" after skip: the version check right below already
	# relies on skip ending the test, as the sibling tests do.
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5519"

	echo "For some reason, the stripe index in the slave LMV EA is"
	echo "corrupted. The LFSCK should repair the slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave LMV EA on the first shard of the striped directory"
	echo "claims the same index as the second shard claims"

	#define OBD_FAIL_LFSCK_BAD_SLAVE_LMV	0x162b
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_shards_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(4) Fail to repair slave LMV EA: $repaired"

	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch $DIR/$tdir/striped_dir/foo ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f $DIR/$tdir/striped_dir/foo ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir $DIR/$tdir/striped_dir ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
5314 run_test 31g "Repair the corrupted slave LMV EA"
# test_31h: the namespace LFSCK must repair a corrupted shard name entry.
#
# The striped directory is created while fail_loc 0x162c / fail_val 0 is set
# on $SINGLEMDS. The LFSCK run must report dirent_repaired == 1; after a
# client remount a file must be creatable and removable inside the
# directory, and the directory itself must be removable.
#
# NOTE(review): the function header and closing brace are absent from this
# listing; the name follows the "run_test 31h" registration below.
test_31h() {
	local repaired

	# No "&& return" after skip: the version check right below already
	# relies on skip ending the test, as the sibling tests do.
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5519"

	echo "For some reason, the shard's name entry in the striped"
	echo "directory may be corrupted. The LFSCK should repair the"
	echo "bad shard's name entry."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "first shard's name entry in the striped directory claims"
	echo "the same index as the second shard's name entry claims."

	#define OBD_FAIL_LFSCK_BAD_SLAVE_NAME	0x162c
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the shard's name entry"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	repaired=$($SHOW_NAMESPACE |
		   awk '/^dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(4) Fail to repair shard's name entry: $repaired"

	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch $DIR/$tdir/striped_dir/foo ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f $DIR/$tdir/striped_dir/foo ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir $DIR/$tdir/striped_dir ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
5362 run_test 31h "Repair the corrupted shard's name entry"
# test_32a: a layout LFSCK must be stoppable while an OST is down.
#
# With fail_loc 0x162d / fail_val 3 set on $SINGLEMDS the layout LFSCK is
# started and must report status "scanning-phase1"; ost1 is then stopped,
# the fault is cleared, the LFSCK is stopped and ost1 is started again.
#
# NOTE(review): the function header, closing brace and possibly some short
# statements are absent from this listing; the name follows the
# "run_test 32a" registration below.
test_32a() {
	local phase

	umount_client $MOUNT

	#define OBD_FAIL_LFSCK_ENGINE_DELAY	0x162d
	do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
	if ! $START_LAYOUT -r; then
		error "(1) Fail to start LFSCK for layout!"
	fi

	phase=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
	if [ "$phase" != "scanning-phase1" ]; then
		error "(2) Expect 'scanning-phase1', but got '$phase'"
	fi

	if ! stop ost1 > /dev/null; then
		error "(3) Fail to stop OST1!"
	fi

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0

	if ! $STOP_LFSCK; then
		error "(4) Fail to stop LFSCK!"
	fi

	start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
		error "(5) Fail to start ost1"
}
5389 run_test 32a "stop LFSCK when some OST failed"
# test_32b: a namespace LFSCK must be stoppable while an MDT is down.
#
# A directory on MDT index 1 with two striped sub-directories is created and
# the client is unmounted. With fail_loc 0x162d / fail_val 3 set on
# $SINGLEMDS the namespace LFSCK is started and must reach
# "scanning-phase1" (waited for up to 32 seconds); mds2 is then stopped, the
# fault is cleared, the LFSCK is stopped and mds2 is started again.
#
# NOTE(review): the function header, closing braces and possibly some short
# statements are absent from this listing; the name follows the
# "run_test 32b" registration below.
test_32b() {
	# No "&& return" after skip, consistent with the sibling tests that
	# rely on skip ending the test.
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"

	$LFS mkdir -i 1 $DIR/$tdir/dp ||
		error "(1) Fail to create $DIR/$tdir/dp"
	$LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
		error "(2) Fail to create $DIR/$tdir/dp/dc1"
	$LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
		error "(3) Fail to create $DIR/$tdir/dp/dc2"
	umount_client $MOUNT

	#define OBD_FAIL_LFSCK_ENGINE_DELAY	0x162d
	do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
	$START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"

	# The awk field reference is escaped so that "$2" survives until the
	# command string is finally evaluated by wait_update_facet.
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
		error "(5) unexpected status"
	}

	stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0

	$STOP_LFSCK || error "(7) Fail to stop LFSCK!"

	start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
		error "(8) Fail to start MDT2"
}
5427 run_test 32b "stop LFSCK when some MDT failed"
# test_33: check the parameters reported by LFSCK.
#
# A layout LFSCK started with "--dryrun -o -r" must report the param string
# "dryrun,all_targets,orphan"; a namespace LFSCK started with
# "-e abort -A -r" must report "failout,all_targets".
#
# NOTE(review): the function header, closing brace and possibly some short
# statements are absent from this listing; the name follows the
# "run_test 33" registration below.
test_33() {
	local reported

	if ! $START_LAYOUT --dryrun -o -r; then
		error "(1) Fail to start layout LFSCK"
	fi
	wait_all_targets_blocked layout completed 2

	reported=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
	if [ "$reported" != "dryrun,all_targets,orphan" ]; then
		error "(3) Expect 'dryrun,all_targets,orphan', got '$reported'"
	fi

	if ! $START_NAMESPACE -e abort -A -r; then
		error "(4) Fail to start namespace LFSCK"
	fi
	wait_all_targets_blocked namespace completed 5

	reported=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
	[ "$reported" == "failout,all_targets" ] ||
		error "(6) Expect 'failout,all_targets', got '$reported'"
}
5449 run_test 33 "check LFSCK paramters"
# test_34: the namespace LFSCK must rebuild a lost agent object.
#
# A directory on MDT index 1 is created while fail_loc 0x1630 is set on
# $SINGLEMDS. The first LFSCK run must complete with dirent_repaired == 1,
# and a second run must complete with dirent_repaired == 0. Runs only with
# at least two MDTs and a ZFS backend on mds1.
#
# NOTE(review): the function header, closing braces and possibly some short
# statements are absent from this listing; the name follows the
# "run_test 34" registration below.
test_34() {
	local status_cmd
	local nfixed

	if [ $MDSCOUNT -lt 2 ]; then
		skip "needs >= 2 MDTs"
	fi
	if [ "$mds1_FSTYPE" != zfs ]; then
		skip "Only valid for ZFS backend"
	fi

	#define OBD_FAIL_LFSCK_NO_AGENTOBJ	0x1630
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
	if ! $LFS mkdir -i 1 $DIR/$tdir/dummy; then
		error "(1) Fail to create $DIR/$tdir/dummy"
	fi

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Status query handed to wait_update_facet for both LFSCK runs.
	status_cmd="$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'"

	if ! $START_NAMESPACE -r; then
		error "(2) Fail to start LFSCK for namespace!"
	fi
	if ! wait_update_facet $SINGLEMDS "$status_cmd" "completed" 32; then
		error "(3) unexpected status"
	fi

	nfixed=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
	[ $nfixed -eq 1 ] ||
		error "(4) Fail to repair the lost agent object: $nfixed"

	# Second pass: nothing may be left to repair.
	if ! $START_NAMESPACE -r; then
		error "(5) Fail to start LFSCK for namespace!"
	fi
	if ! wait_update_facet $SINGLEMDS "$status_cmd" "completed" 32; then
		error "(6) unexpected status"
	fi

	nfixed=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
	[ $nfixed -eq 0 ] ||
		error "(7) Unexpected repairing: $nfixed"
}
5489 run_test 34 "LFSCK can rebuild the lost agent object"
# test_35: the namespace LFSCK must rebuild a lost agent entry.
#
# A directory on MDT index 1 is created while fail_loc 0x1631 is set on
# mds2. The first LFSCK run must complete on mds2 with
# agent_entries_repaired == 1; after the servers are set up again, a second
# run must complete with agent_entries_repaired == 0.
#
# NOTE(review): the function header, closing brace and some short statements
# are absent from this listing; the name follows the "run_test 35"
# registration below. The "stopall" call is restored from the echo that
# announces it - confirm against the full file.
test_35() {
	local repaired

	# No "&& return" after skip, consistent with the sibling tests that
	# rely on skip ending the test.
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"

	#define OBD_FAIL_LFSCK_NO_AGENTENT	0x1631
	do_facet mds2 $LCTL set_param fail_loc=0x1631
	$LFS mkdir -i 1 $DIR/$tdir/dummy ||
		error "(1) Fail to create $DIR/$tdir/dummy"

	do_facet mds2 $LCTL set_param fail_loc=0
	$START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
	# The status is polled on mds2 only, so the message names MDS2; it
	# used to expand ${k}, which is never set in this test.
	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
		error "(3) MDS2 is not the expected 'completed'"

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^agent_entries_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(4) Fail to repair the lost agent entry: $repaired"

	echo "stopall to cleanup object cache"
	stopall > /dev/null
	setupall > /dev/null

	$START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
		error "(6) MDS2 is not the expected 'completed'"

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^agent_entries_repaired/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(7) Unexpected repairing: $repaired"
}
5533 run_test 35 "LFSCK can rebuild the lost agent entry"
# test_36a: the layout LFSCK must rebuild the LOV EA of mirrored files (1).
#
# Three files f0..f2 are created with the same three-mirror composite
# layout, written and resynced. With fail_loc 0x1616 set on mds1, mirror
# N+1 is split off and destroyed from file fN, leaving two mirrors each. A
# layout LFSCK run with orphan handling (-o) must then complete on every
# MDT and OST, report repaired_orphan == 9 on mds1, and leave every file
# with three mirrors again, including the one that was split off.
#
# The three files go through identical steps, so each phase is a loop over
# the file index. Error numbers and messages are computed so that they
# match the former per-file statements, and the phases run in the same
# order as before.
#
# NOTE(review): the function header, closing braces, "done" lines and
# possibly some short statements are absent from this listing; the name
# follows the "run_test 36a" registration below.
test_36a() {
	local -a ordinal=(1st 2nd 3rd)
	local idx
	local f
	local k
	local mirrors
	local repaired
	local cur_status

	# No "&& return" after skip, consistent with the sibling tests that
	# rely on skip ending the test.
	[ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs"

	echo "The target MDT-object's LOV EA corrupted as to lose one of the "
	echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
	echo "with the PFID EA of related OST-object(s) belong to the mirror."

	check_mount_and_prep

	lctl get_param osc.*.*grant*
	stack_trap "lfs df $DIR; lfs df -i $DIR; lctl get_param osc.*.*grant*"

	for idx in 0 1 2; do
		f=$DIR/$tdir/f$idx
		$LFS setstripe -N -E 2M -S1M -o 0,1 -E -1 -o 2 \
			       -N -E 2M -S1M -o 1,2 -E -1 -o 0 \
			       -N -E 3M -o 2,0 -E -1 -o 1 $f ||
			error "($idx) Fail to create mirror file $f"
	done

	for idx in 0 1 2; do
		f=$DIR/$tdir/f$idx
		dd if=/dev/zero of=$f bs=1M count=4 ||
			error "($((idx + 3))) Fail to write $f"
	done

	for idx in 0 1 2; do
		f=$DIR/$tdir/f$idx
		$LFS mirror resync $f ||
			error "($((idx + 6))) Fail to resync $f"
	done

	cancel_lru_locks mdc
	cancel_lru_locks osc

	for idx in 0 1 2; do
		f=$DIR/$tdir/f$idx
		$LFS getstripe $f ||
			error "($((idx + 9))) Fail to getstripe for $f"
	done

	echo "Inject failure, to simulate the case of missing one mirror in LOV"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616

	# File fN loses mirror id N+1.
	for idx in 0 1 2; do
		f=$DIR/$tdir/f$idx
		$LFS mirror split --mirror-id $((idx + 1)) -d $f ||
			error "($((idx + 12))) Fail to split ${ordinal[idx]} mirror from $f"
	done

	do_facet mds1 $LCTL set_param fail_loc=0

	# Finding the split-off mirror id in the layout is the error case.
	for idx in 0 1 2; do
		$LFS getstripe $DIR/$tdir/f$idx |
			grep "lcme_mirror_id:.*$((idx + 1))" &&
			error "($((idx + 15))) The ${ordinal[idx]} of mirror is not destroyed"
	done

	for idx in 0 1 2; do
		f=$DIR/$tdir/f$idx
		mirrors=$($LFS getstripe -N $f)
		[ $mirrors -eq 2 ] ||
			error "($((idx + 18))) $f has $mirrors mirrors"
	done

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(22) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
			     obdfilter.$(facet_svc ost${k}).lfsck_layout |
			     awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(23) OST${k} Expect 'completed', but got '$cur_status'"
	done

	repaired=$(do_facet mds1 $LCTL get_param -n \
		   mdd.$(facet_svc mds1).lfsck_layout |
		   awk '/^repaired_orphan/ { print $2 }')
	[ $repaired -eq 9 ] ||
		error "(24) Expect 9 fixed on mds1, but got: $repaired"

	for idx in 0 1 2; do
		f=$DIR/$tdir/f$idx
		mirrors=$($LFS getstripe -N $f)
		[ $mirrors -eq 3 ] ||
			error "($((idx + 25))) $f has $mirrors mirrors"
	done

	# The mirror id that was split off must be present again; dump the
	# layout before reporting the failure.
	for idx in 0 1 2; do
		f=$DIR/$tdir/f$idx
		$LFS getstripe $f | grep "lcme_mirror_id:.*$((idx + 1))" || {
			$LFS getstripe $f
			error "($((idx + 28))) The ${ordinal[idx]} of mirror is not recovered"
		}
	done
}
5668 run_test 36a "rebuild LOV EA for mirrored file (1)"
# Test 36b (registered by the run_test line closing this block): a file with
# three mirrors loses its MDT-object while its OST-objects remain, and the
# layout LFSCK is expected to rebuild its LOV EA under lost+found.
# NOTE(review): the function header and the closing done/brace lines are not
# visible in this listing; every statement below is kept exactly as found.
#
# Skip when FILESET is set or when fewer than 3 OSTs are configured.
5671 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5672 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5675 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5676 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5677 echo "with the PFID EA of related OST-object(s) belong to the file. "
5680 check_mount_and_prep
# Create f0 with three mirrors (-N) of two components (-E) each. The -o lists
# add up to nine OST indices, three per mirror, which matches the
# repaired_orphan count checked further down.
5682 $LFS setstripe -N -E 2M -S1M -o 0,1 -E -1 -o 2 \
5683 -N -E 2M -S1M -o 1,2 -E -1 -o 0 \
5684 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5685 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# Remember the FID now; it is used later to build the lost+found name.
5687 local fid=$($LFS path2fid $DIR/$tdir/f0)
# Write 4 MiB of zeros, then resync the mirrors.
5689 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5690 error "(1) Fail to write $DIR/$tdir/f0"
5691 $LFS mirror resync $DIR/$tdir/f0 ||
5692 error "(2) Fail to resync $DIR/$tdir/f0"
# NOTE(review): presumably drops cached MDC and OSC locks on the client.
5694 cancel_lru_locks mdc
5695 cancel_lru_locks osc
5697 $LFS getstripe $DIR/$tdir/f0 ||
5698 error "(3) Fail to getstripe for $DIR/$tdir/f0"
5700 echo "Inject failure, to simulate the case of missing the MDT-object"
5701 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
# fail_loc 0x1616 is set on mds1 before the rm and reset to 0 afterwards.
5702 do_facet mds1 $LCTL set_param fail_loc=0x1616
5703 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
5707 do_facet mds1 $LCTL set_param fail_loc=0
5709 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
# START_LAYOUT (defined near the top of the file) runs lctl lfsck_start
# -t layout on $SINGLEMDS; -r and -o are passed through to it.
5710 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
# Wait (limit argument 32) for every MDS to report status completed.
5712 for k in $(seq $MDSCOUNT); do
5713 # The LFSCK status query interval is 30 seconds. In case some
5714 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
5715 # guarantee that the status is synced up.
5716 wait_update_facet mds${k} "$LCTL get_param -n \
5717 mdd.$(facet_svc mds${k}).lfsck_layout |
5718 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5719 error "(6) MDS${k} is not the expected 'completed'"
# Each OST is queried once, without waiting, and must already be completed.
5722 for k in $(seq $OSTCOUNT); do
5723 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5724 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5725 awk '/^status/ { print $2 }')
5726 [ "$cur_status" == "completed" ] ||
5727 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
# Read the repaired_orphan counter from the lfsck_layout parameter on mds1.
5730 local count=$(do_facet mds1 $LCTL get_param -n \
5731 mdd.$(facet_svc mds1).lfsck_layout |
5732 awk '/^repaired_orphan/ { print $2 }')
5733 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# The rebuilt file is looked up as <fid>-R-0 in the MDT0000 lost+found
# directory; count is reused below for the mirror and component counts.
5735 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5736 count=$($LFS getstripe --mirror-count $name)
5737 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5739 count=$($LFS getstripe --component-count $name)
5740 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
# Each mirror id 1..3 must appear in the getstripe output of the rebuilt
# file; on a miss the full layout is printed before failing.
5742 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5743 $LFS getstripe $name
5744 error "(11) The 1st of mirror is not recovered"
5747 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5748 $LFS getstripe $name
5749 error "(12) The 2nd of mirror is not recovered"
5752 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5753 $LFS getstripe $name
5754 error "(13) The 3rd of mirror is not recovered"
5757 run_test 36b "rebuild LOV EA for mirrored file (2)"
# Test 36c (registered by the run_test line closing this block): a two-mirror
# file is modified without a following resync, then loses its MDT-object.
# The layout LFSCK is expected to rebuild the LOV EA under lost+found with
# the same lcme_flags values that were recorded before the loss.
# NOTE(review): the function header and the closing done/brace lines are not
# visible in this listing; every statement below is kept exactly as found.
#
# Skip when FILESET is set or when fewer than 3 OSTs are configured.
5760 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5761 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5764 echo "The mirrored file has been modified, not resynced yet, then "
5765 echo "lost its MDT-object, but relatd OST-objects are still there. "
5766 echo "The layout LFSCK should rebuild the LOV EA and relatd status "
5767 echo "with the PFID EA of related OST-object(s) belong to the file. "
5770 check_mount_and_prep
# Create f0 with two mirrors (-N) of two components (-E) each. The -o lists
# add up to six OST indices, matching the repaired_orphan count checked
# further down.
5772 $LFS setstripe -N -E 2M -S1M -o 0,1 -E -1 -o 2 \
5773 -N -E 2M -S1M -o 1,2 -E -1 -o 0 $DIR/$tdir/f0 ||
5774 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# Remember the FID now; it is used later to build the lost+found name.
5776 local fid=$($LFS path2fid $DIR/$tdir/f0)
5778 # The 1st dd and resync ensure that all related OST-objects have been written
5779 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5780 error "(1.1) Fail to write $DIR/$tdir/f0"
5781 $LFS mirror resync $DIR/$tdir/f0 ||
5782 error "(1.2) Fail to resync $DIR/$tdir/f0"
5783 # The 2nd dd (with no resync after it) makes one mirror stale
5784 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5785 error "(1.3) Fail to write $DIR/$tdir/f0"
# NOTE(review): presumably drops cached MDC and OSC locks on the client.
5787 cancel_lru_locks mdc
5788 cancel_lru_locks osc
5790 $LFS getstripe $DIR/$tdir/f0 ||
5791 error "(2) Fail to getstripe for $DIR/$tdir/f0"
# Record the lcme_flags values found in the first 10 and in the last 10
# lines of the getstripe output, for comparison after LFSCK has run.
5793 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5794 awk '/lcme_flags/ { print $2 }')
5795 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5796 awk '/lcme_flags/ { print $2 }')
5798 echo "Inject failure, to simulate the case of missing the MDT-object"
5799 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
# fail_loc 0x1616 is set on mds1 before the rm and reset to 0 afterwards.
5800 do_facet mds1 $LCTL set_param fail_loc=0x1616
5801 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
5805 do_facet mds1 $LCTL set_param fail_loc=0
5807 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
# START_LAYOUT (defined near the top of the file) runs lctl lfsck_start
# -t layout on $SINGLEMDS; -r and -o are passed through to it.
5808 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
# Wait (limit argument 32) for every MDS to report status completed.
5810 for k in $(seq $MDSCOUNT); do
5811 # The LFSCK status query interval is 30 seconds. In case some
5812 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
5813 # guarantee that the status is synced up.
5814 wait_update_facet mds${k} "$LCTL get_param -n \
5815 mdd.$(facet_svc mds${k}).lfsck_layout |
5816 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5817 error "(5) MDS${k} is not the expected 'completed'"
# Each OST is queried once, without waiting, and must already be completed.
5820 for k in $(seq $OSTCOUNT); do
5821 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5822 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5823 awk '/^status/ { print $2 }')
5824 [ "$cur_status" == "completed" ] ||
5825 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
# Read the repaired_orphan counter from the lfsck_layout parameter on mds1.
5828 local count=$(do_facet mds1 $LCTL get_param -n \
5829 mdd.$(facet_svc mds1).lfsck_layout |
5830 awk '/^repaired_orphan/ { print $2 }')
5831 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
# The rebuilt file is looked up as <fid>-R-0 in the MDT0000 lost+found
# directory; count is reused below for the mirror and component counts.
5833 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5834 count=$($LFS getstripe --mirror-count $name)
5835 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5837 count=$($LFS getstripe --component-count $name)
5838 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
# The flags extracted the same way from the rebuilt file must equal the
# values saved before the MDT-object was lost.
5840 local flags=$($LFS getstripe $name | head -n 10 |
5841 awk '/lcme_flags/ { print $2 }')
5842 [ "$flags" == "$saved_flags1" ] || {
5843 $LFS getstripe $name
5844 error "(10) expect flags $saved_flags1, got $flags"
5847 flags=$($LFS getstripe $name | tail -n 10 |
5848 awk '/lcme_flags/ { print $2 }')
5849 [ "$flags" == "$saved_flags2" ] || {
5850 $LFS getstripe $name
5851 error "(11) expect flags $saved_flags2, got $flags"
5854 run_test 36c "rebuild LOV EA for mirrored file (3)"
# Test 37 (registered by the run_test line closing this block): creates a
# directory on MDT0, hands it to multiop_bg_pause, then runs a namespace
# LFSCK and waits for it to report completed.
# NOTE(review): the function header and several short lines are not visible
# in this listing. In particular $PID is used below but is not assigned in
# any visible line - presumably it is set from the background multiop;
# confirm against the full file.
5860 local t_dir="$DIR/$tdir/d0"
5861 check_mount_and_prep
5863 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
5864 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
# START_NAMESPACE (defined near the top of the file) runs lctl lfsck_start
# -t namespace on $SINGLEMDS; on failure the error is raised first and the
# USR1 signal to $PID is only reached if error returns.
5868 $START_NAMESPACE -r -A || {
5869 error "(3) Fail to start LFSCK for namespace!"; kill -USR1 $PID; }
5871 wait_all_targets_blocked namespace completed 4
5876 run_test 37 "LFSCK must skip a ORPHAN"
# Test 38 (registered by the run_test line closing this block): creates a
# file with a foreign LOV EA, verifies the EA fields, runs namespace and
# layout LFSCK expecting zero repairs, then verifies the same fields again
# and finally removes the file.
# NOTE(review): the function header and closing brace are not visible in
# this listing; every statement below is kept exactly as found.
#
# Skip when the MDS version is 2.12.51 or older.
5880 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
5881 skip "Need MDS version newer than 2.12.51"
5883 # skip basic ops on file with foreign LOV tests on 5.12-6.2 kernels
5884 # until the filemap_read() issue is fixed by v6.2-rc4-61-g5956592ce337
5885 (( $LINUX_VERSION_CODE < $(version_code 5.12.0) ||
5886 $LINUX_VERSION_CODE >= $(version_code 6.2.0) )) ||
5887 skip "Need kernel < 5.12.0 or >= 6.2.0 for filemap_read() fix"
# Two kernel-generated UUIDs joined by @ form the foreign EA value.
# NOTE(review): the expected lfm_length of 73 presumably is 36 + 1 + 36
# characters of that value - confirm.
5889 test_mkdir $DIR/$tdir
5890 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5891 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5893 # create foreign file
5894 $LFS setstripe --foreign=none --flags 0xda05 \
5895 -x "${uuid1}@${uuid2}" $DIR/$tdir/$tfile ||
5896 error "$DIR/$tdir/$tfile: create failed"
5898 $LFS getstripe -v $DIR/$tdir/$tfile |
5899 grep "lfm_magic:.*0x0BD70BD0" ||
5900 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5901 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5902 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5903 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5904 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
5905 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5906 $LFS getstripe -v $DIR/$tdir/$tfile |
5907 grep "lfm_flags:.*0x0000DA05" ||
5908 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5909 $LFS getstripe $DIR/$tdir/$tfile |
5910 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5911 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5913 # modifying the striping should fail
5914 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5915 error "$DIR/$tdir/$tfile: setstripe should fail"
# Run the namespace LFSCK first, then the layout LFSCK; after each one the
# matching *_repaired line of lctl lfsck_query on MDT0000 must be 0.
5917 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5919 wait_all_targets_blocked namespace completed 1
5921 # check that "global" namespace_repaired == 0 !!!
5922 local repaired=$(do_facet mds1 \
5923 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5924 awk '/^namespace_repaired/ { print \\\$2 }'")
5925 [ $repaired -eq 0 ] ||
5926 error "(2) Expect no namespace repair, but got: $repaired"
5928 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5930 wait_all_targets_blocked layout completed 2
5932 # check that "global" layout_repaired == 0 !!!
5933 local repaired=$(do_facet mds1 \
5934 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5935 awk '/^layout_repaired/ { print \\\$2 }'")
5936 [ $repaired -eq 0 ] ||
5937 error "(2) Expect no layout repair, but got: $repaired"
5939 echo "post-lfsck checks of foreign file"
5941 $LFS getstripe -v $DIR/$tdir/$tfile |
5942 grep "lfm_magic:.*0x0BD70BD0" ||
5943 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5944 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5945 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5946 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5947 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
5948 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5949 $LFS getstripe -v $DIR/$tdir/$tfile |
5950 grep "lfm_flags:.*0x0000DA05" ||
5951 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5952 $LFS getstripe $DIR/$tdir/$tfile |
5953 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5954 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5956 # modifying the striping should fail
5957 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5958 error "$DIR/$tdir/$tfile: setstripe should fail"
# Both reading from and writing to the foreign file are expected to fail.
5961 cat $DIR/$tdir/$tfile && error "$DIR/$tdir/$tfile: read should fail"
5962 cat /etc/passwd > $DIR/$tdir/$tfile &&
5963 error "$DIR/$tdir/$tfile: write should fail"
5965 # remove foreign file
5966 rm $DIR/$tdir/$tfile ||
5967 error "$DIR/$tdir/$tfile: remove of foreign file has failed"
5969 run_test 38 "LFSCK does not break foreign file and reverse is also true"
# Test 39 (registered by the run_test line closing this block): creates a
# directory with a foreign LMV EA, verifies the EA fields, checks that file
# creation inside it fails while chmod and chown succeed, runs namespace and
# layout LFSCK expecting zero repairs, then repeats the same checks and
# finally removes the directory.
# NOTE(review): the function header and closing brace are not visible in
# this listing; apart from the two repaired error calls marked below, every
# statement is kept exactly as found.
#
# Skip when the MDS version is 2.12.51 or older.
5973 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
5974 skip "Need MDS version newer than 2.12.51"
# Two kernel-generated UUIDs joined by @ form the foreign EA value.
# NOTE(review): the expected lfm_length of 73 presumably is 36 + 1 + 36
# characters of that value - confirm.
5976 test_mkdir $DIR/$tdir
5977 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5978 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5980 # create foreign dir
5981 $LFS mkdir --foreign=none --xattr="${uuid1}@${uuid2}" --flags=0xda05 \
5982 $DIR/$tdir/${tdir}2 ||
5983 error "$DIR/$tdir/${tdir}2: create failed"
5985 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5986 grep "lfm_magic:.*0x0CD50CD0" ||
5987 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
5988 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5989 # - sizeof(lfm_type) - sizeof(lfm_flags)
5990 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
5991 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
5992 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
5993 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
5994 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5995 grep "lfm_flags:.*0x0000DA05" ||
5996 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
5997 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
5998 grep "lfm_value.*${uuid1}@${uuid2}" ||
5999 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
6001 # file create in dir should fail
# The message must be passed to error; a bare string after && would be
# executed as a command name and the unexpected success would go unreported.
6002 touch $DIR/$tdir/${tdir}2/$tfile &&
6003 error "$DIR/${tdir}2: file create should fail"
# chmod and chown on the foreign directory are expected to succeed.
6006 chmod 777 $DIR/$tdir/${tdir}2 ||
6007 error "$DIR/${tdir}2: chmod failed"
6010 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
6011 error "$DIR/${tdir}2: chown failed"
# Run the namespace LFSCK first, then the layout LFSCK; after each one the
# matching *_repaired line of lctl lfsck_query on MDT0000 must be 0.
6013 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
6015 wait_all_targets_blocked namespace completed 1
6017 # check that "global" namespace_repaired == 0 !!!
6018 local repaired=$(do_facet mds1 \
6019 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
6020 awk '/^namespace_repaired/ { print \\\$2 }'")
6021 [ $repaired -eq 0 ] ||
6022 error "(2) Expect nothing to be repaired, but got: $repaired"
6024 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
6026 wait_all_targets_blocked layout completed 2
6028 # check that "global" layout_repaired == 0 !!!
6029 local repaired=$(do_facet mds1 \
6030 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
6031 awk '/^layout_repaired/ { print \\\$2 }'")
6032 [ $repaired -eq 0 ] ||
6033 error "(2) Expect no layout repair, but got: $repaired"
6035 echo "post-lfsck checks of foreign dir"
6037 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
6038 grep "lfm_magic:.*0x0CD50CD0" ||
6039 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
6040 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
6041 # - sizeof(lfm_type) - sizeof(lfm_flags)
6042 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
6043 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
6044 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
6045 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
6046 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
6047 grep "lfm_flags:.*0x0000DA05" ||
6048 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
6049 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
6050 grep "lfm_value.*${uuid1}@${uuid2}" ||
6051 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
6053 # file create in dir should fail
# Same repair as before the LFSCK runs: report through error, not a bare
# string.
6054 touch $DIR/$tdir/${tdir}2/$tfile &&
6055 error "$DIR/${tdir}2: file create should fail"
6058 chmod 777 $DIR/$tdir/${tdir}2 ||
6059 error "$DIR/${tdir}2: chmod failed"
6062 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
6063 error "$DIR/${tdir}2: chown failed"
# Finally the foreign directory must be removable with rmdir.
6066 rmdir $DIR/$tdir/${tdir}2 ||
6067 error "$DIR/$tdir/${tdir}2: remove of foreign dir has failed"
6069 run_test 39 "LFSCK does not break foreign dir and reverse is also true"
# Test 40a (registered by the run_test line closing this block): creates a
# file under a directory on MDT index 1 that has a three-component default
# layout, migrates the directory to MDT index 0, runs a layout LFSCK and
# checks that the layout reported for the file is unchanged.
# NOTE(review): the function header and closing brace lines are not visible
# in this listing; every statement below is kept exactly as found.
#
# Requires at least two MDTs.
6072 [[ $MDSCOUNT -ge 2 ]] || skip "needs >= 2 MDTs"
6074 check_mount_and_prep
6075 $LFS mkdir -i 1 $DIR/$tdir/dir1
6076 $LFS setstripe -E 1M -c1 -S 1M -E 128M -c2 -S 4M -E eof $DIR/$tdir/dir1
# Capture the layout of f1 before the migration for the final comparison.
6078 touch $DIR/$tdir/dir1/f1
6079 local layout1=$(get_layout_param $DIR/$tdir/dir1/f1)
6081 echo "Migrate $DIR/$tdir/dir1 from MDT1 to MDT0"
6082 $LFS migrate -m 0 $DIR/$tdir/dir1
6084 echo "trigger LFSCK for layout"
6085 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -t layout -r
# Wait (limit argument 32) for the lfsck_layout status to become completed.
6087 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6088 mdd.${MDT_DEV}.lfsck_layout |
6089 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6091 error "(2) unexpected status"
# The layout captured after LFSCK must be string-identical to the first one.
6094 local layout2=$(get_layout_param $DIR/$tdir/dir1/f1)
6096 [[ "$layout1" == "$layout2" ]] || error "layout lost after lfsck"
6098 run_test 40a "LFSCK correctly fixes lmm_oi in composite layout"
# Test 41 (registered by the run_test line closing this block): creates a
# two-component file using the -z option, runs LFSCK of all types with
# lfsck debugging enabled, and fails if the debug log afterwards contains
# any lfsck_layout_verify_header message.
# NOTE(review): the function header and closing brace lines are not visible
# in this listing; every statement below is kept exactly as found.
#
# Save the current debug setting so it can be restored at the end.
6102 local old_debug=$(do_facet $SINGLEMDS $LCTL get_param -n debug)
6104 do_facet $SINGLEMDS $LCTL set_param debug=+lfsck
6105 $LFS setstripe -E 1G -z 64M -E -1 -z 128M $DIR/$tfile
# Dump the debug log and discard the output.
# NOTE(review): presumably this empties the log before LFSCK runs - confirm.
6106 do_facet $SINGLEMDS $LCTL dk > /dev/null
6108 echo "trigger LFSCK for SEL layout"
6109 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -A -t all -r -n on
# Wait (limit argument 32) for the lfsck_layout status to become completed.
6110 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6111 mdd.${MDT_DEV}.lfsck_layout |
6112 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6114 error "(2) unexpected status"
# Any line mentioning lfsck_layout_verify_header is treated as a failure.
6117 local errors=$(do_facet $SINGLEMDS $LCTL dk |
6118 grep "lfsck_layout_verify_header")
6120 [[ "x$errors" == "x" ]] || {
6122 error "lfsck failed"
# Restore the debug setting saved at the start.
6125 do_facet $SINGLEMDS "$LCTL set_param debug='$old_debug'"
6127 run_test 41 "SEL support in LFSCK"
# Encryption key helper statements.
# NOTE(review): the function headers, closing braces and a few short lines
# that belong here are not visible in this listing, so it cannot be told
# from here how these statements are divided into functions; the gaps in
# the embedded line numbers suggest more than one helper - confirm against
# the full file.
#
# Build the key payload: 4 zero bytes (mode), then the 64 bytes 0x00..0x3f
# (raw), then the value 0x40 as a 4-byte integer whose byte order is chosen
# from the lscpu output. The payload is added as a logon key named
# fscrypt:4242424242424242 to the session keyring (@s).
6130 local mode='\x00\x00\x00\x00'
6131 local raw="$(printf ""\\\\x%02x"" {0..63})"
6135 [[ $(lscpu) =~ Byte\ Order.*Little ]] && size='\x40\x00\x00\x00' ||
6136 size='\x00\x00\x00\x40'
6137 key="${mode}${raw}${size}"
6138 echo -n -e "${key}" | keyctl padd logon fscrypt:4242424242424242 @s
# Flush dirty data and drop the kernel page, dentry and inode caches.
6143 sync ; echo 3 > /proc/sys/vm/drop_caches
# Clear the LDLM lock LRU of every namespace, drop caches again, then look
# up the id of any key whose description starts with fscrypt: and revoke it.
6150 $LCTL set_param -n ldlm.namespaces.*.lru_size=clear
6151 sync ; echo 3 > /proc/sys/vm/drop_caches
6152 dummy_key=$(keyctl show | awk '$7 ~ "^fscrypt:" {print $1}')
6153 if [ -n "$dummy_key" ]; then
6154 keyctl revoke $dummy_key
# Remount the client mount point(s) with the plain $MOUNT_OPTS, that is
# without the test_dummy_encryption option used by the encryption helpers.
# $MOUNT is unmounted first if mounted and then mounted again; $MOUNT2 is
# unmounted if mounted and mounted again only when $MOUNT_2 is non-empty.
# Any failing step is reported through error.
# NOTE(review): the closing fi/brace lines are not visible in this listing.
6159 remount_client_normally() {
6160 # remount client without dummy encryption key
6161 if is_mounted $MOUNT; then
6162 umount_client $MOUNT || error "umount $MOUNT failed"
6164 mount_client $MOUNT ${MOUNT_OPTS} ||
6165 error "remount failed"
6167 if is_mounted $MOUNT2; then
6168 umount_client $MOUNT2 || error "umount $MOUNT2 failed"
6170 if [ "$MOUNT_2" ]; then
6171 mount_client $MOUNT2 ${MOUNT_OPTS} ||
6172 error "remount failed"
# Remount $MOUNT with test_dummy_encryption appended to $MOUNT_OPTS.
# The mount point is unmounted first when it is currently mounted; a failed
# unmount or mount is reported through error.
# NOTE(review): a few short lines of this function (including the closing
# fi/brace) are not visible in this listing.
6178 remount_client_dummykey() {
6181 # remount client with dummy encryption key
6182 if is_mounted $MOUNT; then
6183 umount_client $MOUNT || error "umount $MOUNT failed"
6185 mount_client $MOUNT ${MOUNT_OPTS},test_dummy_encryption ||
6186 error "remount failed"
# Prepare the client for the encryption tests: remove leftover entries
# matching $DIR/[df][0-9]*, remount $MOUNT with test_dummy_encryption, and
# create $DIR/$tdir on MDT index 0 with a single-stripe default layout on
# OST index 0.
# NOTE(review): the closing fi/brace lines are not visible in this listing.
6189 setup_for_enc_tests() {
6190 rm -rf $DIR/[df][0-9]* || error "Fail to cleanup env"
6192 # remount client with test_dummy_encryption option
6193 if is_mounted $MOUNT; then
6194 umount_client $MOUNT || error "umount $MOUNT failed"
6196 mount_client $MOUNT ${MOUNT_OPTS},test_dummy_encryption ||
6197 error "mount with '-o test_dummy_encryption' failed"
6199 # this directory will be encrypted, because of dummy mode
6200 $LFS setdirstripe -c 1 -i 0 $DIR/$tdir
6201 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# Undo the encryption test setup: remove $DIR/$tdir plus any extra paths
# given as arguments, then remount the client without the dummy key.
# Arguments: optional additional paths to remove (expanded unquoted via $*).
# NOTE(review): the closing brace line is not visible in this listing.
6204 cleanup_for_enc_tests() {
6205 rm -rf $DIR/$tdir $*
6207 remount_client_normally
# Test 42 (registered by the run_test line closing this block): creates
# files both with and without fail_loc 0x1632 set on all nodes, then runs a
# layout LFSCK and expects exactly two repairs counted as repaired_others.
# NOTE(review): the function header, closing brace and a few short lines
# are not visible in this listing; every statement below is kept exactly as
# found.
#
# Skipped on a ZFS backend for ost1.
6211 [[ $(facet_fstype ost1) == zfs ]] && skip "skip ZFS backend"
# NOTE(review): the comparison is a strict greater-than while the message
# says at least 2.15.51 - confirm which of the two is intended.
6213 (( $MDS1_VERSION > $(version_code 2.15.51) )) ||
6214 skip "Need MDS version at least 2.15.51"
6217 echo "If the MDT-object has the encryption flag but the OST-object"
6218 echo "does not, add it to the OST-object."
6221 check_mount_and_prep
# Skip unless the client import reports client_encryption and mount.lustre
# advertises the test_dummy_encryption option in its help text.
6223 $LCTL get_param mdc.*.import | grep -q client_encryption ||
6224 skip "client encryption not supported"
6226 mount.lustre --help |& grep -q "test_dummy_encryption:" ||
6227 skip "need dummy encryption support"
# Register cleanup_for_enc_tests with stack_trap for EXIT.
6229 stack_trap cleanup_for_enc_tests EXIT
# Files _1 and _2 are created before the fail_loc is set.
6232 $LFS setstripe -c 1 -i 0 $DIR/$tdir
6233 touch $DIR/$tdir/${tfile}_1 || error "touch ${tfile}_1 failed"
6234 dd if=/dev/zero of=$DIR/$tdir/${tfile}_2 bs=1 count=1 conv=fsync ||
6235 error "dd ${tfile}_2 failed"
6237 #define OBD_FAIL_LFSCK_NO_ENCFLAG 0x1632
# Files _3 and _4 are created while fail_loc 0x1632 is set on all nodes.
# NOTE(review): presumably these are the two objects LFSCK must repair.
6238 do_nodes $(comma_list $(all_nodes)) "$LCTL set_param fail_loc=0x1632"
6239 touch $DIR/$tdir/${tfile}_3 || error "touch ${tfile}_3 failed"
6240 dd if=/dev/zero of=$DIR/$tdir/${tfile}_4 bs=1 count=1 conv=fsync ||
6241 error "dd ${tfile}_4 failed"
6242 do_nodes $(comma_list $(all_nodes)) "$LCTL set_param fail_loc=0x0"
6243 cancel_lru_locks osc
6245 echo "Trigger layout LFSCK to find out inconsistent OST-object enc flag"
6247 $START_LAYOUT -r || error "Fail to start LFSCK for layout!"
# Wait (limit argument 32) for the lfsck_layout status to become completed.
6249 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6250 mdd.${MDT_DEV}.lfsck_layout |
6251 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6253 error "unexpected lfsck status"
# SHOW_LAYOUT (defined near the top of the file) prints the lfsck_layout
# parameter of the MDT; its repaired_others value must be 2.
6256 local repaired=$($SHOW_LAYOUT |
6257 awk '/^repaired_others/ { print $2 }')
6258 [ $repaired -eq 2 ] ||
6259 error "Fail to repair inconsistent enc flag: $repaired"
6261 run_test 42 "LFSCK can repair inconsistent MDT-object/OST-object encryption flags"
# End-of-script teardown. MDSSIZE, OSTSIZE and OSTCOUNT were overridden near
# the top of the file after being copied to the SAVED_* variables.
6263 # restore the saved MDS/OST sizes and OST count
6264 MDSSIZE=${SAVED_MDSSIZE}
6265 OSTSIZE=${SAVED_OSTSIZE}
6266 OSTCOUNT=${SAVED_OSTCOUNT}
6268 # finally, clean up the system (reformatting with the restored values)
6269 REFORMAT="yes" cleanup_and_setup_lustre
6271 complete_test $SECONDS
6272 check_and_cleanup_lustre