3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
# Bootstrap: locate the Lustre tree, load the shared test framework, and
# decide up front whether this suite should run at all.
# LUSTRE defaults to the parent of this script's directory unless the caller
# has already set it; the framework is then sourced from that tree.
11 LUSTRE=${LUSTRE:-$(dirname $0)/..}
12 . $LUSTRE/tests/test-framework.sh
16 # bug number for skipped test:
17 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT "
18 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# When SLOW is "no", EXCEPT_SLOW is reset to the empty string.
20 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
# Leave quietly with status 0 when require_dsh_mds reports failure.
23 require_dsh_mds || exit 0
# Skip the suite when check_versions fails; per the skip message this is the
# interoperation (mixed-version) case.
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
# Skip unless MDS1_VERSION is at least the version code for 2.3.60.
32 (( $MDS1_VERSION >= $(version_code 2.3.60) )) ||
33 skip "Need MDS version at least 2.3.60"
# Remember the incoming MDSSIZE/OSTSIZE/OSTCOUNT before they are overridden
# below (presumably so they can be restored later -- the restore is not
# visible in this excerpt).
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # Use small MDS and OST sizes to speed up formatting.
41 # Do not make MDSSIZE/OSTSIZE too small, because that affects the default journal size.
# ZFS-backed targets get a fixed size of 300000 for the MDS and the OST.
43 [ "$mds1_FSTYPE" == zfs ] && MDSSIZE=300000
45 [ "$ost1_FSTYPE" == zfs ] && OSTSIZE=300000
47 # Many OSTs are not needed; cap the count to reduce the format/start/stop overhead.
49 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
51 # Build up a clean test environment.
# REFORMAT="yes" is set only in the environment of this one call.
52 REFORMAT="yes" check_and_setup_lustre
# Target labels and command prefixes shared by the tests.
# The command strings are expanded unquoted at their call sites
# (e.g. `$START_NAMESPACE -r`), so additional lfsck_start options can be
# appended after them.
# MDT_DEV is the device label of the $SINGLEMDS facet's device; OST_DEV is the
# fixed label of OST0000 in this filesystem.
54 MDT_DEV=$(devicelabel $SINGLEMDS $(facet_device $SINGLEMDS))
55 OST_DEV="${FSNAME}-OST0000"
# Start a namespace LFSCK on MDT_DEV, run from the $SINGLEMDS facet.
56 START_NAMESPACE="do_facet $SINGLEMDS \
57 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
# Start a layout LFSCK on MDT_DEV, run from the $SINGLEMDS facet.
58 START_LAYOUT="do_facet $SINGLEMDS \
59 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
# Start a layout LFSCK on OST_DEV, run from the ost1 facet.
60 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# Stop whatever LFSCK is running on MDT_DEV.
61 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Read the lfsck_namespace / lfsck_layout parameters (values only, via -n);
# the tests parse fields such as "status" and "flags" out of this output.
62 SHOW_NAMESPACE="do_facet $SINGLEMDS \
63 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
64 SHOW_LAYOUT="do_facet $SINGLEMDS \
65 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
66 SHOW_LAYOUT_ON_OST="do_facet ost1 \
67 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# MDS mount option variants: each appends user_xattr to MDS_MOUNT_OPTS; the
# NOSCRUB and SKIP_LFSCK variants additionally pass noscrub / skip_lfsck.
68 MOUNT_OPTS_SCRUB="$MDS_MOUNT_OPTS -o user_xattr"
69 MOUNT_OPTS_NOSCRUB="$MDS_MOUNT_OPTS -o user_xattr,noscrub"
70 MOUNT_OPTS_SKIP_LFSCK="$MDS_MOUNT_OPTS -o user_xattr,skip_lfsck"
79 echo "preparing... $nfiles * $ndirs files will be created $(date)."
80 if [ ! -z $igif ]; then
81 #define OBD_FAIL_FID_IGIF 0x1504
82 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
85 cp $LUSTRE/tests/*.sh $DIR/$tdir/
86 if [ $ndirs -gt 0 ]; then
87 createmany -d $DIR/$tdir/d $ndirs
88 createmany -m $DIR/$tdir/f $ndirs
89 if [ $nfiles -gt 0 ]; then
90 for ((i = 0; i < $ndirs; i++)); do
91 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
92 /dev/null || error "createmany $nfiles"
95 createmany -d $DIR/$tdir/e $ndirs
98 if [ ! -z $igif ]; then
99 touch $DIR/$tdir/dummy
100 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
103 echo "prepared $(date)."
110 local dev=$(facet_device $facet)
112 start $facet $dev $opts > /dev/null ||
113 error "($err) Fail to start $facet!"
# Run e2fsck against an MDS facet's backing device.
# Returns 0 immediately unless mds1_FSTYPE is ldiskfs. Otherwise the facet is
# stopped, e2fsck is invoked with "-n" on the facet's active host and device,
# and the facet is started again with the noscrub mount options.
# NOTE(review): the assignment of $mds, the rest of the pipeline begun on the
# first run_e2fsck line, and the condition guarding the error call are on
# lines not visible in this excerpt -- confirm against the full file.
# NOTE(review): the trailing "3" passed to start_facet is presumably the tag
# used in its failure message -- confirm.
116 run_e2fsck_on_mds_facet() {
117 [ $mds1_FSTYPE == ldiskfs ] || return 0
121 stop $mds > /dev/null || error "(0) Fail to the stop $mds"
122 local host=$(facet_active_host $mds)
123 local dev=$(facet_device $mds)
125 run_e2fsck $host $dev "-n" |
127 run_e2fsck $host $dev "-n"
128 error "(2) Detected inconsistency on $mds"
130 start_facet $mds "$MOUNT_OPTS_NOSCRUB" 3
# Check that every MDT reports the expected LFSCK status.
# Runs `lctl lfsck_query` with -w for component $com on mds1 against
# ${FSNAME}-MDT0000, takes the second field of the output line that starts
# with "<com>_mdts_<status>", and compares it with MDSCOUNT. On a mismatch the
# query is run again without the awk filter (so its full output is shown) and
# an error tagged with $err is raised.
# NOTE(review): $com, $status and $err are assigned on lines not visible in
# this excerpt; a caller in this file passes "namespace completed 4", so they
# are presumably the three positional arguments -- confirm.
133 wait_all_targets_blocked() {
138 local count=$(do_facet mds1 \
139 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
140 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
141 [[ $count -eq $MDSCOUNT ]] || {
142 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
143 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
152 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
153 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
154 "$MDSCOUNT" $LTIME || {
155 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
156 error "($err) some MDTs are not in ${status}"
163 #define OBD_FAIL_LFSCK_DELAY1 0x1600
164 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
165 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
167 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
169 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
170 [ "$STATUS" == "scanning-phase1" ] ||
171 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
173 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
175 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
176 [ "$STATUS" == "stopped" ] ||
177 error "(6) Expect 'stopped', but got '$STATUS'"
179 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
181 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
182 [ "$STATUS" == "scanning-phase1" ] ||
183 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
185 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
186 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
187 mdd.${MDT_DEV}.lfsck_namespace |
188 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
190 error "(9) unexpected status"
193 local repaired=$($SHOW_NAMESPACE |
194 awk '/^updated_phase1/ { print $2 }')
195 [ $repaired -eq 0 ] ||
196 error "(10) Expect nothing to be repaired, but got: $repaired"
198 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
199 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
200 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
201 mdd.${MDT_DEV}.lfsck_namespace |
202 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
204 error "(12) unexpected status"
207 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
208 [ $((scanned1 + 1)) -eq $scanned2 ] ||
209 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
211 echo "stopall, should NOT crash LU-3649"
212 stopall || error "(14) Fail to stopall"
214 run_test 0 "Control LFSCK manually"
219 #define OBD_FAIL_FID_INDIR 0x1501
220 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
221 touch $DIR/$tdir/dummy
223 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
225 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
226 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
227 mdd.${MDT_DEV}.lfsck_namespace |
228 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
230 error "(4) unexpected status"
233 local repaired=$($SHOW_NAMESPACE |
234 awk '/^dirent_repaired/ { print $2 }')
235 # for interop with old server
236 [ -z "$repaired" ] &&
237 repaired=$($SHOW_NAMESPACE |
238 awk '/^updated_phase1/ { print $2 }')
240 [ $repaired -eq 1 ] ||
241 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
243 run_e2fsck_on_mds_facet $SINGLEMDS
245 mount_client $MOUNT || error "(6) Fail to start client!"
247 #define OBD_FAIL_FID_LOOKUP 0x1505
248 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
249 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
251 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
253 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
257 [ "$mds1_FSTYPE" != ldiskfs ] &&
258 skip "OI Scrub not implemented for ZFS"
262 #define OBD_FAIL_FID_INLMA 0x1502
263 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
264 touch $DIR/$tdir/dummy
266 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
268 #define OBD_FAIL_FID_NOLMA 0x1506
269 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
270 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
271 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
272 mdd.${MDT_DEV}.lfsck_namespace |
273 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
275 error "(4) unexpected status"
278 local repaired=$($SHOW_NAMESPACE |
279 awk '/^dirent_repaired/ { print $2 }')
280 # for interop with old server
281 [ -z "$repaired" ] &&
282 repaired=$($SHOW_NAMESPACE |
283 awk '/^updated_phase1/ { print $2 }')
285 [ $repaired -eq 1 ] ||
286 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
288 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
289 run_e2fsck_on_mds_facet $SINGLEMDS
291 mount_client $MOUNT || error "(6) Fail to start client!"
293 #define OBD_FAIL_FID_LOOKUP 0x1505
294 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
295 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
297 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
299 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
304 #define OBD_FAIL_FID_IGIF 0x1504
305 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
306 touch $DIR/$tdir/dummy
308 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
310 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
311 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
312 mdd.${MDT_DEV}.lfsck_namespace |
313 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
315 error "(4) unexpected status"
318 local repaired=$($SHOW_NAMESPACE |
319 awk '/^dirent_repaired/ { print $2 }')
320 # for interop with old server
321 [ -z "$repaired" ] &&
322 repaired=$($SHOW_NAMESPACE |
323 awk '/^updated_phase1/ { print $2 }')
325 [ $repaired -eq 1 ] ||
326 error "(5) Fail to repair lost FID-in-dirent: $repaired"
328 run_e2fsck_on_mds_facet $SINGLEMDS
330 mount_client $MOUNT || error "(6) Fail to start client!"
332 #define OBD_FAIL_FID_LOOKUP 0x1505
333 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
334 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
336 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
338 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
343 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
344 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
345 touch $DIR/$tdir/dummy
347 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
349 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
350 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
351 mdd.${MDT_DEV}.lfsck_namespace |
352 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
354 error "(4) unexpected status"
357 local repaired=$($SHOW_NAMESPACE |
358 awk '/^linkea_repaired/ { print $2 }')
359 # for interop with old server
360 [ -z "$repaired" ] &&
361 repaired=$($SHOW_NAMESPACE |
362 awk '/^updated_phase2/ { print $2 }')
364 [ $repaired -eq 1 ] ||
365 error "(5) Fail to repair crashed linkEA: $repaired"
367 run_e2fsck_on_mds_facet $SINGLEMDS
369 mount_client $MOUNT || error "(6) Fail to start client!"
371 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
372 error "(7) Fail to stat $DIR/$tdir/dummy"
374 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
375 local dummyname=$($LFS fid2path $DIR $dummyfid)
376 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
377 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
379 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
385 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
386 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
387 touch $DIR/$tdir/dummy
389 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
391 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
392 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
393 mdd.${MDT_DEV}.lfsck_namespace |
394 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
396 error "(4) unexpected status"
399 local repaired=$($SHOW_NAMESPACE |
400 awk '/^updated_phase2/ { print $2 }')
401 [ $repaired -eq 1 ] ||
402 error "(5) Fail to repair crashed linkEA: $repaired"
404 run_e2fsck_on_mds_facet $SINGLEMDS
406 mount_client $MOUNT || error "(6) Fail to start client!"
408 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
409 error "(7) Fail to stat $DIR/$tdir/dummy"
411 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
412 local dummyname=$($LFS fid2path $DIR $dummyfid)
413 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
414 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
416 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
420 (( $MDS1_VERSION > $(version_code 2.4.90) )) ||
421 skip "MDS older than 2.4.90"
425 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
426 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
427 touch $DIR/$tdir/dummy
429 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
431 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
432 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
433 mdd.${MDT_DEV}.lfsck_namespace |
434 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
436 error "(4) unexpected status"
439 local repaired=$($SHOW_NAMESPACE |
440 awk '/^updated_phase2/ { print $2 }')
441 [ $repaired -eq 1 ] ||
442 error "(5) Fail to repair crashed linkEA: $repaired"
444 run_e2fsck_on_mds_facet $SINGLEMDS
446 mount_client $MOUNT || error "(6) Fail to start client!"
448 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
449 error "(7) Fail to stat $DIR/$tdir/dummy"
451 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
452 local dummyname=$($LFS fid2path $DIR $dummyfid)
453 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
454 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
456 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
460 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
461 skip "MDS older than 2.6.50, LU-4788"
465 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
466 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
467 touch $DIR/$tdir/dummy
469 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
471 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
472 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
473 mdd.${MDT_DEV}.lfsck_namespace |
474 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
476 error "(4) unexpected status"
479 local repaired=$($SHOW_NAMESPACE |
480 awk '/^linkea_repaired/ { print $2 }')
481 [ $repaired -eq 1 ] ||
482 error "(5) Fail to repair crashed linkEA: $repaired"
484 run_e2fsck_on_mds_facet $SINGLEMDS
486 mount_client $MOUNT || error "(6) Fail to start client!"
488 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
489 error "(7) Fail to stat $DIR/$tdir/dummy"
491 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
492 local dummyname=$($LFS fid2path $DIR $dummyfid)
493 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
494 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
496 run_test 2d "LFSCK can recover the missing linkEA entry"
500 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
501 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
502 skip "MDS older than 2.6.50, LU-5511"
506 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
508 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
509 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
510 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
511 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
513 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
515 wait_all_targets_blocked namespace completed 4
517 local repaired=$($SHOW_NAMESPACE |
518 awk '/^linkea_repaired/ { print $2 }')
519 [ $repaired -eq 1 ] ||
520 error "(5) Fail to repair crashed linkEA: $repaired"
522 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
523 local name=$($LFS fid2path $DIR $fid)
524 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
525 error "(6) Fail to repair linkEA: $fid $name"
527 run_test 2e "namespace LFSCK can verify remote object linkEA"
531 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
532 skip "MDS older than 2.6.50, LU-4788"
536 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
537 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
538 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
540 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
541 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
542 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
544 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
545 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
546 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
548 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
549 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
550 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
552 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
554 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
555 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
556 mdd.${MDT_DEV}.lfsck_namespace |
557 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
559 error "(10) unexpected status"
562 local checked=$($SHOW_NAMESPACE |
563 awk '/^checked_phase2/ { print $2 }')
564 [ $checked -ge 4 ] ||
565 error "(11) Fail to check multiple-linked object: $checked"
567 local repaired=$($SHOW_NAMESPACE |
568 awk '/^multiple_linked_repaired/ { print $2 }')
569 [ $repaired -ge 2 ] ||
570 error "(12) Fail to repair multiple-linked object: $repaired"
572 run_test 3 "LFSCK can verify multiple-linked objects"
576 [ "$mds1_FSTYPE" != ldiskfs ] &&
577 skip "OI Scrub not implemented for ZFS"
580 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
581 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop $SINGLEMDS!"
583 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
584 echo "start $SINGLEMDS with disabling OI scrub"
585 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 2
587 #define OBD_FAIL_LFSCK_DELAY2 0x1601
588 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
589 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
590 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
591 mdd.${MDT_DEV}.lfsck_namespace |
592 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
594 error "(5) unexpected status"
597 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
598 [ "$STATUS" == "scanning-phase1" ] ||
599 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
601 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
602 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
603 mdd.${MDT_DEV}.lfsck_namespace |
604 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
606 error "(7) unexpected status"
609 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
610 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
612 local repaired=$($SHOW_NAMESPACE |
613 awk '/^dirent_repaired/ { print $2 }')
614 # for interop with old server
615 [ -z "$repaired" ] &&
616 repaired=$($SHOW_NAMESPACE |
617 awk '/^updated_phase1/ { print $2 }')
619 [ $repaired -ge 9 ] ||
620 error "(9) Fail to re-generate FID-in-dirent: $repaired"
622 run_e2fsck_on_mds_facet $SINGLEMDS
624 mount_client $MOUNT || error "(10) Fail to start client!"
626 #define OBD_FAIL_FID_LOOKUP 0x1505
627 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
628 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
629 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
631 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
635 [ "$mds1_FSTYPE" != ldiskfs ] &&
636 skip "OI Scrub not implemented for ZFS"
639 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
640 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop $SINGLEMDS!"
642 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
643 echo "start $SINGLEMDS with disabling OI scrub"
644 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 2
646 #define OBD_FAIL_LFSCK_DELAY2 0x1601
647 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
648 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
649 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
650 mdd.${MDT_DEV}.lfsck_namespace |
651 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
653 error "(5) unexpected status"
656 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
657 [ "$STATUS" == "scanning-phase1" ] ||
658 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
660 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
661 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
662 mdd.${MDT_DEV}.lfsck_namespace |
663 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
665 error "(7) unexpected status"
668 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
669 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
671 local repaired=$($SHOW_NAMESPACE |
672 awk '/^dirent_repaired/ { print $2 }')
673 # for interop with old server
674 [ -z "$repaired" ] &&
675 repaired=$($SHOW_NAMESPACE |
676 awk '/^updated_phase1/ { print $2 }')
678 [ $repaired -ge 2 ] ||
679 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
681 run_e2fsck_on_mds_facet $SINGLEMDS
683 mount_client $MOUNT || error "(10) Fail to start client!"
685 #define OBD_FAIL_FID_LOOKUP 0x1505
686 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
687 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
689 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
691 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
692 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
693 local dummyname=$($LFS fid2path $DIR $dummyfid)
694 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
695 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
697 run_test 5 "LFSCK can handle IGIF object upgrading"
702 #define OBD_FAIL_LFSCK_DELAY1 0x1600
703 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
704 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
706 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
707 [ "$STATUS" == "scanning-phase1" ] ||
708 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
710 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
712 # Fail the LFSCK to guarantee there is at least one checkpoint
713 #define OBD_FAIL_LFSCK_FATAL1 0x1608
714 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
715 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
716 mdd.${MDT_DEV}.lfsck_namespace |
717 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
719 error "(4) unexpected status"
722 local POS0=$($SHOW_NAMESPACE |
723 awk '/^last_checkpoint_position/ { print $2 }' |
726 #define OBD_FAIL_LFSCK_DELAY1 0x1600
727 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
728 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
730 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
731 [ "$STATUS" == "scanning-phase1" ] ||
732 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
734 local POS1=$($SHOW_NAMESPACE |
735 awk '/^latest_start_position/ { print $2 }' |
737 [[ $POS0 -lt $POS1 ]] ||
738 error "(7) Expect larger than: $POS0, but got $POS1"
740 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
741 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
742 mdd.${MDT_DEV}.lfsck_namespace |
743 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
745 error "(8) unexpected status"
748 run_test 6a "LFSCK resumes from last checkpoint (1)"
753 #define OBD_FAIL_LFSCK_DELAY2 0x1601
754 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
755 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
757 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
758 [ "$STATUS" == "scanning-phase1" ] ||
759 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
761 # Sleep 5 sec to guarantee that we are in the directory scanning
763 # Fail the LFSCK to guarantee there is at least one checkpoint
764 #define OBD_FAIL_LFSCK_FATAL2 0x1609
765 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
766 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
767 mdd.${MDT_DEV}.lfsck_namespace |
768 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
770 error "(4) unexpected status"
773 local O_POS0=$($SHOW_NAMESPACE |
774 awk '/^last_checkpoint_position/ { print $2 }' |
777 local D_POS0=$($SHOW_NAMESPACE |
778 awk '/^last_checkpoint_position/ { print $4 }')
780 #define OBD_FAIL_LFSCK_DELAY2 0x1601
781 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
782 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
784 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
785 [ "$STATUS" == "scanning-phase1" ] ||
786 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
788 local O_POS1=$($SHOW_NAMESPACE |
789 awk '/^latest_start_position/ { print $2 }' |
791 local D_POS1=$($SHOW_NAMESPACE |
792 awk '/^latest_start_position/ { print $4 }')
794 echo "Additional debug for 6b"
796 if [ "$D_POS0" == "N/A" -o "$D_POS0" == "0x0" \
797 -o "$D_POS1" == "0x0" -o "$D_POS1" == "N/A" ]; then
798 [[ $O_POS0 -lt $O_POS1 ]] ||
799 error "(7.1) $O_POS1 is not larger than $O_POS0"
801 [[ $D_POS0 -lt $D_POS1 ]] ||
802 error "(7.2) $D_POS1 is not larger than $D_POS0"
805 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
806 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
807 mdd.${MDT_DEV}.lfsck_namespace |
808 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
810 error "(8) unexpected status"
813 run_test 6b "LFSCK resumes from last checkpoint (2)"
820 #define OBD_FAIL_LFSCK_DELAY2 0x1601
821 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
822 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
824 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
825 [ "$STATUS" == "scanning-phase1" ] ||
826 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
828 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
830 echo "stop $SINGLEMDS"
831 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop $SINGLEMDS!"
833 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
834 echo "start $SINGLEMDS"
835 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 5
837 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
838 mdd.${MDT_DEV}.lfsck_namespace |
839 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
841 error "(6) unexpected status"
844 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
850 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
851 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
852 for ((i = 0; i < 20; i++)); do
853 touch $DIR/$tdir/dummy${i}
856 #define OBD_FAIL_LFSCK_DELAY3 0x1602
857 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
858 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
859 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
860 mdd.${MDT_DEV}.lfsck_namespace |
861 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
863 error "(4) unexpected status"
867 echo "stop $SINGLEMDS"
868 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop $SINGLEMDS!"
870 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
871 echo "start $SINGLEMDS"
872 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 6
874 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
875 mdd.${MDT_DEV}.lfsck_namespace |
876 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
878 error "(7) unexpected status"
881 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
892 formatall > /dev/null
898 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
899 [ "$STATUS" == "init" ] ||
900 namespace_error "(2) Expect 'init', but got '$STATUS'"
902 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
903 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
904 mkdir $DIR/$tdir/crashed
906 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
907 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
908 for ((i = 0; i < 5; i++)); do
909 touch $DIR/$tdir/dummy${i}
912 umount_client $MOUNT || error "(3) Fail to stop client!"
914 #define OBD_FAIL_LFSCK_DELAY2 0x1601
915 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
917 namespace_error "(4) Fail to start LFSCK for namespace!"
919 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
920 [ "$STATUS" == "scanning-phase1" ] ||
921 namespace_error "(5) Expect 'scanning-phase1', but got '$STATUS'"
923 $STOP_LFSCK || namespace_error "(6) Fail to stop LFSCK!"
925 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
926 [ "$STATUS" == "stopped" ] ||
927 namespace_error "(7) Expect 'stopped', but got '$STATUS'"
930 namespace_error "(8) Fail to start LFSCK for namespace!"
932 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
933 [ "$STATUS" == "scanning-phase1" ] ||
934 namespace_error "(9) Expect 'scanning-phase1', but got '$STATUS'"
936 #define OBD_FAIL_LFSCK_FATAL2 0x1609
937 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
938 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
939 mdd.${MDT_DEV}.lfsck_namespace |
940 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
942 namespace_error "(10) unexpected status"
945 #define OBD_FAIL_LFSCK_DELAY1 0x1600
946 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
948 namespace_error "(11) Fail to start LFSCK for namespace!"
950 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
951 [ "$STATUS" == "scanning-phase1" ] ||
952 namespace_error "(12) Expect 'scanning-phase1', but got '$STATUS'"
954 #define OBD_FAIL_LFSCK_CRASH 0x160a
955 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
958 echo "stop $SINGLEMDS"
959 stop $SINGLEMDS > /dev/null || namespace_error "(13) Fail to stop MDS!"
961 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
962 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
964 echo "start $SINGLEMDS"
965 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 14
967 local timeout=$(max_recovery_time)
970 while [ $timer -lt $timeout ]; do
971 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
972 mdt.${MDT_DEV}.recovery_status |
973 awk '/^status/ { print \\\$2 }'")
974 [ "$STATUS" != "RECOVERING" ] && break;
979 [ $timer != $timeout ] || (
980 do_facet $SINGLEMDS "$LCTL get_param -n \
981 mdt.${MDT_DEV}.recovery_status"
982 error "(14.1) recovery timeout"
985 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
986 [ "$STATUS" == "crashed" ] ||
987 namespace_error "(15) Expect 'crashed', but got '$STATUS'"
989 #define OBD_FAIL_LFSCK_DELAY2 0x1601
990 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
993 [ -n "$($SHOW_NAMESPACE |
994 grep -E "status: init|status: completed")" ] && {
996 namespace_error "(16) Fail to start LFSCK for namespace!"
997 } || echo "lfsck for namespace has been started"
999 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1000 [ "$STATUS" == "scanning-phase1" ] ||
1001 namespace_error "(17) Expect 'scanning-phase1', but got '$STATUS'"
1003 echo "stop $SINGLEMDS"
1004 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop $SINGLEMDS!"
1006 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1007 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1009 echo "start $SINGLEMDS"
1010 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 19
1013 while [ $timer -lt $timeout ]; do
1014 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1015 mdt.${MDT_DEV}.recovery_status |
1016 awk '/^status/ { print \\\$2 }'")
1017 [ "$STATUS" != "RECOVERING" ] && break;
1019 timer=$((timer + 1))
1022 [ $timer != $timeout ] || (
1023 do_facet $SINGLEMDS "$LCTL get_param -n \
1024 mdt.${MDT_DEV}.recovery_status"
1025 error "(19.1) recovery timeout"
1028 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1029 [ "$STATUS" == "paused" ] ||
1030 namespace_error "(20) Expect 'paused', but got '$STATUS'"
1032 echo "stop $SINGLEMDS"
1033 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1035 echo "start $SINGLEMDS without resume LFSCK"
1036 start_facet $SINGLEMDS "$MOUNT_OPTS_SKIP_LFSCK" 20.2
1039 while [ $timer -lt $timeout ]; do
1040 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1041 mdt.${MDT_DEV}.recovery_status |
1042 awk '/^status/ { print \\\$2 }'")
1043 [ "$STATUS" != "RECOVERING" ] && break;
1045 timer=$((timer + 1))
1048 [ $timer != $timeout ] || (
1049 do_facet $SINGLEMDS "$LCTL get_param -n \
1050 mdt.${MDT_DEV}.recovery_status"
1051 error "(20.3) recovery timeout"
1054 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1055 [ "$STATUS" == "paused" ] ||
1056 namespace_error "(20.4) Expect 'paused', but got '$STATUS'"
1058 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1059 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1062 namespace_error "(21) Fail to start LFSCK for namespace!"
1063 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1064 mdd.${MDT_DEV}.lfsck_namespace |
1065 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1067 namespace_error "(22) unexpected status"
1070 # wait until at least one inode has been processed (OBD_FAIL_LFSCK_DELAY3)
1073 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1074 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1075 namespace_error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1077 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1078 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1079 mdd.${MDT_DEV}.lfsck_namespace |
1080 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1082 namespace_error "(24) unexpected status"
1085 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1087 namespace_error "(25) Expect empty flags, but got '$FLAGS'"
1089 run_test 8 "LFSCK state machine"
# test_9a: speed control of layout LFSCK (phase 1).
# Starts layout LFSCK with a speed limit, samples average_speed_phase1 and
# checks it against an upper bound; then raises the limit and checks the
# new average against both a lower and an upper bound.
# RUN_TIME1, RUN_TIME2 and TIME_DIFF are presumably the two sampling
# windows and the timing tolerance, in seconds - TODO confirm at their
# definition.
# Skipped when /proc/cpuinfo has no line matching "processor.*: 1".
1092 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1093 skip "Testing on UP system, the speed may be inaccurate."
# Populate $DIR/$tdir/lfsck with 5000 files for the scan to process.
1097 check_mount_and_prep
1098 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1099 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1100 createmany -o $DIR/$tdir/lfsck/f 5000
# First speed limit, passed to lfsck_start via -s.
1102 local BASE_SPEED1=100
1104 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# The scan must still be in phase 1 when the speed is sampled.
1107 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1108 [ "$STATUS" == "scanning-phase1" ] ||
1109 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1111 local SPEED=$($SHOW_LAYOUT |
1112 awk '/^average_speed_phase1/ { print $2 }')
1114 # There may be time error, normally it should be less than 2 seconds.
1115 # We allow another 20% schedule error.
1117 # MAX_MARGIN = 1.3 = 13 / 10
# Integer arithmetic: the 1.3 margin is applied as "* 13 / 10".
1118 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) /
1119 RUN_TIME1 * 13 / 10))
1120 [ $SPEED -lt $MAX_SPEED ] || {
1122 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1123 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1126 # adjust speed limit
1127 local BASE_SPEED2=300
1129 do_facet $SINGLEMDS \
1130 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-sample: the average now covers both speed-limit periods.
1133 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1134 # MIN_MARGIN = 0.7 = 7 / 10
1135 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) +
1136 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) /
1137 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# A too-slow scan is reported via error_ignore (LU-5624) when the MDT
# backend is not ldiskfs.
1138 [ $SPEED -gt $MIN_SPEED ] || {
1139 if [ $mds1_FSTYPE != ldiskfs ]; then
1140 error_ignore LU-5624 \
1141 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1144 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1148 # MAX_MARGIN = 1.3 = 13 / 10
1149 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) +
1150 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) /
1151 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1152 [ $SPEED -lt $MAX_SPEED ] || {
1154 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1155 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1156 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit back to 0 on all MDT and OST nodes, then wait
# for the layout scan to report "completed".
1159 do_nodes $(comma_list $(mdts_nodes)) \
1160 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1161 do_nodes $(comma_list $(osts_nodes)) \
1162 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1164 wait_update_facet $SINGLEMDS \
1165 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1166 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1167 error "(7) Failed to get expected 'completed'"
1169 run_test 9a "LFSCK speed control (1)"
# test_9b: speed control of namespace LFSCK (phase 2).
# Creates files while fail_loc 0x1604 is set, runs one namespace scan with
# fail_loc 0x160c until it reports "stopped", then restarts the scan with
# a speed limit and applies the same bound checks as test 9a to
# average_speed_phase2.
# RUN_TIME1, RUN_TIME2 and TIME_DIFF are presumably the two sampling
# windows and the timing tolerance, in seconds - TODO confirm at their
# definition.
# Skipped when /proc/cpuinfo has no line matching "processor.*: 1".
1172 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1173 skip "Testing on UP system, the speed may be inaccurate."
1179 echo "Preparing another 50 * 50 files (with error) at $(date)."
1180 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1181 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
# 50 directories and 50 files at the top level, then 50 files inside
# each of the directories d0..d49.
1182 createmany -d $DIR/$tdir/d 50
1183 createmany -m $DIR/$tdir/f 50
1184 for ((i = 0; i < 50; i++)); do
1185 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
1188 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1189 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1190 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
# With 0x160c set the scan is expected to end up in "stopped".
1191 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1192 mdd.${MDT_DEV}.lfsck_namespace |
1193 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1195 error "(5) unexpected status"
1198 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1199 echo "Prepared at $(date)."
# First speed limit; note there is no -r on this start.
1201 local BASE_SPEED1=50
1203 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1206 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1207 [ "$STATUS" == "scanning-phase2" ] ||
1208 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1210 local SPEED=$($SHOW_NAMESPACE |
1211 awk '/^average_speed_phase2/ { print $2 }')
1212 # There may be time error, normally it should be less than 2 seconds.
1213 # We allow another 20% schedule error.
1215 # MAX_MARGIN = 1.3 = 13 / 10
# Integer arithmetic: the 1.3 margin is applied as "* 13 / 10".
1216 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) /
1217 RUN_TIME1 * 13 / 10))
1218 [ $SPEED -lt $MAX_SPEED ] || {
1220 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1221 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1224 # adjust speed limit
1225 local BASE_SPEED2=150
1227 do_facet $SINGLEMDS \
1228 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-sample: the average now covers both speed-limit periods.
1231 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1232 # MIN_MARGIN = 0.7 = 7 / 10
1233 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) +
1234 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) /
1235 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# A too-slow scan is reported via error_ignore (LU-5624) when the MDT
# backend is not ldiskfs.
1236 [ $SPEED -gt $MIN_SPEED ] || {
1237 if [ $mds1_FSTYPE != ldiskfs ]; then
1238 error_ignore LU-5624 \
1239 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1242 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1246 # MAX_MARGIN = 1.3 = 13 / 10
1247 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) +
1248 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) /
1249 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1250 [ $SPEED -lt $MAX_SPEED ] || {
1252 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1253 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1254 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit back to 0 on all MDT and OST nodes, then wait
# for the namespace scan to report "completed".
1257 do_nodes $(comma_list $(mdts_nodes)) \
1258 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1259 do_nodes $(comma_list $(osts_nodes)) \
1260 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1261 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1262 mdd.${MDT_DEV}.lfsck_namespace |
1263 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1265 error "(11) unexpected status"
1268 run_test 9b "LFSCK speed control (2)"
# test_10: the filesystem stays usable while namespace LFSCK is scanning.
# Builds a tree with fail_loc 0x1603 / 0x1604 set, starts a speed-limited
# namespace scan, runs ordinary file operations while the scan is in
# "scanning-phase1", then lifts the limit and waits for "completed".
# Runs only when the MDT backend is ldiskfs.
1272 [[ $mds1_FSTYPE == ldiskfs ]] || skip "lookup(..)/linkea on ZFS issue"
1276 echo "Preparing more files with error at $(date)."
1277 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1278 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
# Even indices 0, 2, ..., 998 are created under fail_loc 0x1603.
1280 for ((i = 0; i < 1000; i = $((i+2)))); do
1281 mkdir -p $DIR/$tdir/d${i}
1282 touch $DIR/$tdir/f${i}
1283 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1286 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1287 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
# Odd indices 1, 3, ..., 999 are created under fail_loc 0x1604.
1289 for ((i = 1; i < 1000; i = $((i+2)))); do
1290 mkdir -p $DIR/$tdir/d${i}
1291 touch $DIR/$tdir/f${i}
1292 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1295 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1296 echo "Prepared at $(date)."
# Extra hard link to f200; f200 is unlinked later while the scan runs.
1298 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1300 umount_client $MOUNT
1301 mount_client $MOUNT || error "(3) Fail to start client!"
# Speed-limited (-s 100) so the scan is still in phase 1 during the
# file operations below.
1303 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1306 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1307 [ "$STATUS" == "scanning-phase1" ] ||
1308 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Each of these operations must succeed while LFSCK is running.
1310 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1312 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1314 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1316 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1318 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1320 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1322 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1324 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1325 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after the operations.
1327 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1328 [ "$STATUS" == "scanning-phase1" ] ||
1329 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 everywhere and wait for "completed".
1331 do_nodes $(comma_list $(mdts_nodes)) \
1332 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1333 do_nodes $(comma_list $(osts_nodes)) \
1334 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1335 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1336 mdd.${MDT_DEV}.lfsck_namespace |
1337 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1339 error "(16) unexpected status"
1342 run_test 10 "System is available during LFSCK scanning"
# Remove the LAST_ID file (and d0/0) below O/<idx> on the backing
# filesystem of ost<ost>, mounting it locally for the duration.
# ost and idx are presumably the two positional arguments, as in the
# call "ost_remove_lastid 1 0" - TODO confirm.
# Returns 1 if the local mount fails, 2 if the unmount fails.
1345 ost_remove_lastid() {
1348 local rcmd="do_facet ost${ost}"
1350 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1352 # step 1: local mount
1353 mount_fstype ost${ost} || return 1
1354 # step 2: remove the specified LAST_ID
# The {LAST_ID,d0/0} brace list yields two paths for a single rm.
1355 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1357 unmount_fstype ost${ost} || return 2
# test_11a: layout LFSCK on the OST rebuilds a removed LAST_ID.
# Removes LAST_ID from ost1, restarts ost1, runs layout LFSCK on it and
# checks that the "crashed_lastid" flag appears and is cleared again once
# the scan has completed.
# Skipped unless the MDS version is newer than 2.5.55.
1361 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1362 skip "MDS older than 2.5.55, LU-1267"
1364 check_mount_and_prep
1365 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1366 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1371 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
# Restart ost1 with the noscrub mount options.
1373 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1374 error "(2) Fail to start ost1"
1376 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1377 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1379 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1380 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# While fail_loc 0x160e is set, the flags line must show crashed_lastid.
1382 wait_update_facet ost1 "$LCTL get_param -n \
1383 obdfilter.${OST_DEV}.lfsck_layout |
1384 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1386 error "(5) unexpected status"
# Clear the failure injection and wait for the scan to complete.
1389 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1391 wait_update_facet ost1 "$LCTL get_param -n \
1392 obdfilter.${OST_DEV}.lfsck_layout |
1393 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1395 error "(6) unexpected status"
1398 echo "the LAST_ID(s) should have been rebuilt"
# After completion the flags field must be empty again.
1399 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1400 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1402 run_test 11a "LFSCK can rebuild lost last_id"
# test_11b: layout LFSCK on the OST rebuilds a stale on-disk LAST_ID.
# Creates objects while fail_loc 0x160d is set on ost1, restarts ost1 and
# checks that its last_id is behind the MDS-side prealloc_last_id, then
# runs layout LFSCK on the OST and checks that last_id has caught up
# after another restart.
# Skipped unless the MDS version is newer than 2.5.55.
1405 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1406 skip "MDS older than 2.5.55, LU-1267"
1408 check_mount_and_prep
1409 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1411 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1412 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1413 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 files more than the value returned by
# precreated_ost_obj_count.
1415 local count=$(precreated_ost_obj_count 0 0)
1417 createmany -o $DIR/$tdir/f $((count + 32))
# Record the sequence and last id as seen by the MDS-side osp device.
1419 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1420 local seq=$(do_facet mds1 $LCTL get_param -n \
1421 osp.${proc_path}.prealloc_last_seq)
1422 local id_used=$(do_facet mds1 $LCTL get_param -n \
1423 osp.${proc_path}.prealloc_last_id)
1425 umount_client $MOUNT
1426 stop ost1 || error "(1) Fail to stop ost1"
1428 #define OBD_FAIL_OST_ENOSPC 0x215
1429 do_facet ost1 $LCTL set_param fail_loc=0x215
1431 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1432 error "(2) Fail to start ost1"
# Poll (up to 60 iterations) until ost1 reports a last_id entry whose
# line matches $seq; the id is the second ':'-separated field.
1434 for ((i = 0; i < 60; i++)); do
1435 id_ost1=$(do_facet ost1 \
1436 "$LCTL get_param -n obdfilter.$ost1_svc.last_id" |
1437 awk -F: "/$seq/ { print \$2 }")
1438 [ -n "$id_ost1" ] && break
1442 echo "the on-disk LAST_ID should be smaller than the expected one"
1443 [ $id_used -gt $id_ost1 ] ||
1444 error "(4) expect id_used '$id_used' > id_ost1 '$id_ost1'"
1446 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1447 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1449 wait_update_facet ost1 \
1450 "$LCTL get_param -n obdfilter.$ost1_svc.lfsck_layout |
1451 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1453 error "(6) unexpected status"
# Restart ost1 again before re-reading last_id.
1456 stop ost1 || error "(7) Fail to stop ost1"
1458 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1459 error "(8) Fail to start ost1"
1461 echo "the on-disk LAST_ID should have been rebuilt"
1462 # last_id may be larger than $id_used if objects were created/skipped
1463 wait_update_facet_cond ost1 \
1464 "$LCTL get_param -n obdfilter.$ost1_svc.last_id |
1465 awk -F: '/$seq/ { print \\\$2 }'" "-ge" "$id_used" 60 || {
1466 do_facet ost1 $LCTL get_param obdfilter.$ost1_svc.last_id
1467 error "(9) expect last_id >= id_used $seq:$id_used"
# Clear the failure injection and stop all services.
1470 do_facet ost1 $LCTL set_param fail_loc=0
1471 stopall || error "(10) Fail to stopall"
1473 run_test 11b "LFSCK can rebuild crashed last_id"
# test_12a: one lctl command (-A) drives LFSCK on all targets.
# For namespace and then layout LFSCK: start on all targets with -s 1 and
# expect "scanning-phase1", stop with -A and expect "stopped", restart
# with -s 0 and expect "completed".
# Needs at least 2 MDTs and an MDS version newer than 2.5.55.
1476 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1477 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1478 skip "MDS older than 2.5.55, LU-3950"
1480 check_mount_and_prep
# One directory per MDT (k is 1-based, the MDT index is k - 1), each
# holding 100 files.
1481 for k in $(seq $MDSCOUNT); do
1482 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1483 createmany -o $DIR/$tdir/${k}/f 100 ||
1484 error "(0) Fail to create 100 files."
1487 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1488 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1489 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1491 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1492 wait_all_targets namespace scanning-phase1 3
1494 echo "Stop namespace LFSCK on all targets by single lctl command."
1495 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1496 error "(4) Fail to stop LFSCK on all devices!"
1498 echo "All the LFSCK targets should be in 'stopped' status."
1499 wait_all_targets_blocked namespace stopped 5
1501 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1502 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1503 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1505 echo "All the LFSCK targets should be in 'completed' status."
1506 wait_all_targets_blocked namespace completed 7
# The layout half of the test runs between start_full_debug_logging
# and stop_full_debug_logging.
1508 start_full_debug_logging
1510 echo "Start layout LFSCK on all targets by single command (-s 1)."
1511 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1512 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1514 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1515 wait_all_targets layout scanning-phase1 9
1517 echo "Stop layout LFSCK on all targets by single lctl command."
1518 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1519 error "(10) Fail to stop LFSCK on all devices!"
1521 echo "All the LFSCK targets should be in 'stopped' status."
1522 wait_all_targets_blocked layout stopped 11
# Additionally check the layout LFSCK status on every OST directly.
1524 for k in $(seq $OSTCOUNT); do
1525 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1526 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1527 awk '/^status/ { print $2 }')
1528 [ "$STATUS" == "stopped" ] ||
1529 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1532 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1533 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1534 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1536 echo "All the LFSCK targets should be in 'completed' status."
1537 wait_all_targets_blocked layout completed 14
1539 stop_full_debug_logging
1541 run_test 12a "single command to trigger LFSCK on all devices"
# test_12b: lfsck_start without -M.
# "lfsck_start -A -r" with no -M must succeed and drive both namespace
# and layout LFSCK to "completed". If mds1 hosts more than one mdt
# device, "lfsck_start -t layout -r" with neither -M nor -A must fail.
# Skipped unless the MDS version is newer than 2.5.55.
1544 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1545 skip "MDS older than 2.5.55, LU-3950"
1547 check_mount_and_prep
1549 echo "Start LFSCK without '-M' specified."
1550 do_facet mds1 $LCTL lfsck_start -A -r ||
1551 error "(0) Fail to start LFSCK without '-M'"
1553 wait_all_targets_blocked namespace completed 1
1554 wait_all_targets_blocked layout completed 2
# Count the "lctl dl" lines on mds1 whose third column contains "mdt".
1556 local count=$(do_facet mds1 $LCTL dl |
1557 awk '{ print $3 }' | grep mdt | wc -l)
1558 if [ $count -gt 1 ]; then
1560 echo "Start layout LFSCK on the node with multipe targets,"
1561 echo "but not specify '-M'/'-A' option. Should get failure."
# Success of lfsck_start is the error case here; the trailing
# "|| true" keeps the expected failure from propagating as a
# non-zero status.
1563 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1564 error "(3) Start layout LFSCK should fail" || true
1567 run_test 12b "auto detect Lustre device"
# test_13: layout LFSCK repairs a bad lmm_oi in the layout EA.
# Creates one plain file and one PFL file while fail_loc 0x160f is set,
# runs layout LFSCK and expects repaired_others to be exactly 2.
# Skipped unless the MDS version is newer than 2.5.55.
1570 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1571 skip "MDS older than 2.5.55, LU-3593"
1574 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1575 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1576 echo "MDT-object FID."
1579 check_mount_and_prep
1581 echo "Inject failure stub to simulate bad lmm_oi"
1582 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1583 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
# Two files are created under the failure injection: one via
# createmany and the PFL file f1.
1584 createmany -o $DIR/$tdir/f 1
1585 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1586 error "(0) Fail to create PFL $DIR/$tdir/f1"
1587 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1589 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1590 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1592 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1593 mdd.${MDT_DEV}.lfsck_layout |
1594 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1596 error "(2) unexpected status"
# Expect exactly two repairs, matching the two files created above.
1599 local repaired=$($SHOW_LAYOUT |
1600 awk '/^repaired_others/ { print $2 }')
1601 [ $repaired -eq 2 ] ||
1602 error "(3) Fail to repair crashed lmm_oi: $repaired"
1604 run_test 13 "LFSCK can repair crashed lmm_oi"
# test_14a: layout LFSCK handles MDT-objects with dangling LOV EA
# references (without '--delay-create-ostobj', per the echo below).
# A first scan ("-r") must count at least 32 dangling references while
# the guard files still fail to stat; a second scan ("-r -c") must make
# the guard files stat-able with size 0.
# Skipped unless the MDS version is newer than 2.5.55.
1607 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1608 skip "MDS older than 2.5.55, LU-3590"
1611 echo "The OST-object referenced by the MDT-object should be there;"
1612 echo "otherwise, the LFSCK should re-create the missing OST-object."
1613 echo "without '--delay-create-ostobj' option."
1616 check_mount_and_prep
1617 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1619 echo "Inject failure stub to simulate dangling referenced MDT-object"
1620 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1621 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Files created while fail_loc 0x1610 is set: count + 16 plain files,
# guard0, 16 composite files and guard1.
1622 local count=$(precreated_ost_obj_count 0 0)
1624 createmany -o $DIR/$tdir/f $((count + 16)) ||
1625 error "(0.1) Fail to create $DIR/$tdir/fx"
1626 touch $DIR/$tdir/guard0
1628 for ((i = 0; i < 16; i++)); do
1629 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1630 $DIR/$tdir/f_comp${i} ||
1631 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1633 touch $DIR/$tdir/guard1
1635 do_facet ost1 $LCTL set_param fail_loc=0
1637 start_full_debug_logging
1639 # exhaust other pre-created dangling cases
1640 count=$(precreated_ost_obj_count 0 0)
1641 createmany -o $DIR/$tdir/a $count ||
1642 error "(0.5) Fail to create $count files."
1644 echo "'ls' should fail because of dangling referenced MDT-object"
1645 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1647 echo "Trigger layout LFSCK to find out dangling reference"
1648 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1650 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1651 mdd.${MDT_DEV}.lfsck_layout |
1652 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1654 error "(3) unexpected status"
1657 local repaired=$($SHOW_LAYOUT |
1658 awk '/^repaired_dangling/ { print $2 }')
1659 [ $repaired -ge 32 ] ||
1660 error "(4) Fail to repair dangling reference: $repaired"
1662 echo "'stat' should fail because of not repair dangling by default"
1663 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1664 error "(5.1) stat should fail"
1665 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1666 error "(5.2) stat should fail"
1668 echo "Trigger layout LFSCK to repair dangling reference"
# Second scan adds -c to the lfsck_start options.
1669 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1671 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1672 mdd.${MDT_DEV}.lfsck_layout |
1673 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1675 error "(7) unexpected status"
1678 # Some asynchronous LFSCK updates may still be in flight; wait for
1679 # a while until the target has actually been repaired. LU-4970.
1681 echo "'stat' should success after layout LFSCK repairing"
1682 wait_update_facet client "stat $DIR/$tdir/guard0 |
1683 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1684 stat $DIR/$tdir/guard0
1686 error "(8.1) unexpected size"
1689 wait_update_facet client "stat $DIR/$tdir/guard1 |
1690 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1691 stat $DIR/$tdir/guard1
1693 error "(8.2) unexpected size"
1696 repaired=$($SHOW_LAYOUT |
1697 awk '/^repaired_dangling/ { print $2 }')
1698 [ $repaired -ge 32 ] ||
1699 error "(9) Fail to repair dangling reference: $repaired"
1701 stop_full_debug_logging
1703 echo "stopall to cleanup object cache"
1706 setupall > /dev/null
1708 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# test_14b: layout LFSCK handles MDT-objects with dangling LOV EA
# references (with '--delay-create-ostobj', per the echo below).
# A first scan ("-r -o -d") must count at least 32 dangling references
# while the guard file still fails to stat; a second scan
# ("-r -o -c -d") must make the guard file stat-able with size 0.
# Skipped unless the MDS version is newer than 2.5.55.
1711 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1712 skip "MDS older than 2.5.55, LU-3590"
1715 echo "The OST-object referenced by the MDT-object should be there;"
1716 echo "otherwise, the LFSCK should re-create the missing OST-object."
1717 echo "with '--delay-create-ostobj' option."
1720 check_mount_and_prep
1721 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1723 echo "Inject failure stub to simulate dangling referenced MDT-object"
1724 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1725 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Files created while fail_loc 0x1610 is set: count + 31 files plus
# the guard file.
1726 local count=$(precreated_ost_obj_count 0 0)
1728 createmany -o $DIR/$tdir/f $((count + 31))
1729 touch $DIR/$tdir/guard
1730 do_facet ost1 $LCTL set_param fail_loc=0
1732 start_full_debug_logging
1734 # exhaust other pre-created dangling cases
1735 count=$(precreated_ost_obj_count 0 0)
1736 createmany -o $DIR/$tdir/a $count ||
1737 error "(0) Fail to create $count files."
1739 echo "'ls' should fail because of dangling referenced MDT-object"
1740 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1742 echo "Trigger layout LFSCK to find out dangling reference"
1743 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1745 wait_all_targets_blocked layout completed 3
1747 local repaired=$($SHOW_LAYOUT |
1748 awk '/^repaired_dangling/ { print $2 }')
1749 [ $repaired -ge 32 ] ||
1750 error "(4) Fail to repair dangling reference: $repaired"
1752 echo "'stat' should fail because of not repair dangling by default"
1753 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
1755 echo "Trigger layout LFSCK to repair dangling reference"
# Second scan adds -c to the lfsck_start options.
1756 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1758 wait_all_targets_blocked layout completed 7
1760 # Some asynchronous LFSCK updates may still be in flight; wait for
1761 # a while until the target has actually been repaired. LU-4970.
1763 echo "'stat' should success after layout LFSCK repairing"
1764 wait_update_facet client "stat $DIR/$tdir/guard |
1765 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1766 stat $DIR/$tdir/guard
1768 error "(8) unexpected size"
1771 repaired=$($SHOW_LAYOUT |
1772 awk '/^repaired_dangling/ { print $2 }')
1773 [ $repaired -ge 32 ] ||
1774 error "(9) Fail to repair dangling reference: $repaired"
1776 stop_full_debug_logging
1778 echo "stopall to cleanup object cache"
1781 setupall > /dev/null
1783 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# test_15a: layout LFSCK repairs OST-objects whose back pointer names a
# non-existent MDT-object.
# Writes f0 and a PFL file f1 while fail_loc 0x1611 is set on all OST
# nodes, runs layout LFSCK and expects repaired_unmatched_pair >= 3.
# Skipped unless the MDS version is newer than 2.5.55.
1786 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1787 skip "MDS older than 2.5.55, LU-3591"
1790 echo "If the OST-object referenced by the MDT-object back points"
1791 echo "to some non-exist MDT-object, then the LFSCK should repair"
1792 echo "the OST-object to back point to the right MDT-object."
1795 check_mount_and_prep
1796 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1798 echo "Inject failure stub to make the OST-object to back point to"
1799 echo "non-exist MDT-object."
1800 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1802 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1803 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1804 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1806 error "(0) Fail to create PFL $DIR/$tdir/f1"
1807 # 'dd' will trigger punch RPC firstly on every OST-objects.
1808 # So even though some OST-object will not be write by 'dd',
1809 # as long as it is allocated (may be NOT allocated in pfl_3b)
1810 # its layout information will be set also.
1811 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1812 cancel_lru_locks osc
1813 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1815 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1816 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1818 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1819 mdd.${MDT_DEV}.lfsck_layout |
1820 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1822 error "(2) unexpected status"
# At least three unmatched pairs must have been repaired.
1825 local repaired=$($SHOW_LAYOUT |
1826 awk '/^repaired_unmatched_pair/ { print $2 }')
1827 [ $repaired -ge 3 ] ||
1828 error "(3) Fail to repair unmatched pair: $repaired"
1830 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# test_15b: layout LFSCK repairs OST-objects whose back pointer names a
# different MDT-object that does not recognize them.
# Writes a guard file normally, then f0 and a PFL file f1 while fail_loc
# 0x1612 is set on all OST nodes, runs layout LFSCK and expects
# repaired_unmatched_pair to be exactly 4.
# Skipped unless the MDS version is newer than 2.5.55.
1833 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1834 skip "MDS older than 2.5.55, LU-3591"
1837 echo "If the OST-object referenced by the MDT-object back points"
1838 echo "to other MDT-object that doesn't recognize the OST-object,"
1839 echo "then the LFSCK should repair it to back point to the right"
1840 echo "MDT-object (the first one)."
1843 check_mount_and_prep
1844 mkdir -p $DIR/$tdir/0
1845 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1846 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1847 cancel_lru_locks osc
1849 echo "Inject failure stub to make the OST-object to back point to"
1850 echo "other MDT-object"
# Use two stripes for the first PFL component when at least two OSTs
# are configured.
1853 [ $OSTCOUNT -ge 2 ] && stripes=2
1855 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1856 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1857 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1858 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1860 error "(0) Fail to create PFL $DIR/$tdir/f1"
1861 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1862 cancel_lru_locks osc
1863 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1865 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1866 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1868 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1869 mdd.${MDT_DEV}.lfsck_layout |
1870 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1872 error "(2) unexpected status"
# Exactly four unmatched pairs must have been repaired.
1875 local repaired=$($SHOW_LAYOUT |
1876 awk '/^repaired_unmatched_pair/ { print $2 }')
1877 [ $repaired -eq 4 ] ||
1878 error "(3) Fail to repair unmatched pair: $repaired"
1880 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# test_15c: layout LFSCK racing with a delayed directory migration.
# Starts "lfs migrate -m 0" in the background with fail_loc 0x1803 set on
# mds2, runs layout LFSCK on all targets, and expects exactly one
# repaired_unmatched_pair and zero repaired_multiple_referenced.
# Needs at least 2 MDTs and an MDS version strictly between 2.5.55 and
# 2.7.55.
1883 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1884 (( $MDS1_VERSION < $(version_code 2.7.55) )) ||
1885 skip "MDS newer than 2.7.55, LU-6475"
1886 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1887 skip "MDS older than 2.5.55, LU-3591"
1890 echo "According to current metadata migration implementation,"
1891 echo "before the old MDT-object is removed, both the new MDT-object"
1892 echo "and old MDT-object will reference the same LOV layout. Then if"
1893 echo "the layout LFSCK finds the new MDT-object by race, it will"
1894 echo "regard related OST-object(s) as multiple referenced case, and"
1895 echo "will try to create new OST-object(s) for the new MDT-object."
1896 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1897 echo "MDT-object before confirm the multiple referenced case."
1900 check_mount_and_prep
# Directory a1 is created with "-i 1" and holds one 1 MiB file.
1901 $LFS mkdir -i 1 $DIR/$tdir/a1
1902 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1903 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1904 cancel_lru_locks osc
1906 echo "Inject failure stub on MDT1 to delay the migration"
1908 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1909 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1910 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
# The migration runs in the background so LFSCK can race with it.
1911 $LFS migrate -m 0 $DIR/$tdir/a1 &
1914 echo "Trigger layout LFSCK to race with the migration"
1915 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1917 wait_all_targets_blocked layout completed 2
1919 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1920 local repaired=$($SHOW_LAYOUT |
1921 awk '/^repaired_unmatched_pair/ { print $2 }')
1922 [ $repaired -eq 1 ] ||
1923 error "(3) Fail to repair unmatched pair: $repaired"
# No multiple-referenced repair may have been done.
1925 repaired=$($SHOW_LAYOUT |
1926 awk '/^repaired_multiple_referenced/ { print $2 }')
1927 [ $repaired -eq 0 ] ||
1928 error "(4) Unexpectedly repaird multiple references: $repaired"
1930 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# test_15d: namespace LFSCK must not crash after a failed directory
# migration.
# Starts migrating a striped directory in the background, sets and clears
# fail_loc 0x1709 on each MDS in turn, runs namespace LFSCK on all
# targets, retries the migration, and finally removes the entries and
# reformats. Needs at least 2 MDTs.
1933 (( $MDSCOUNT > 1 )) || skip "needs >= 2 MDTs"
1935 check_mount_and_prep
# $tdir is created with "-c -1"; its default layout for new
# subdirectories is set with "-D -i -1 -c 1".
1937 $LFS mkdir -c -1 $DIR/$tdir || error "create $tdir failed"
1938 $LFS setdirstripe -D -i -1 -c 1 $DIR/$tdir ||
1939 error "setdirstripe failed"
1941 createmany -o $DIR/$tdir/f 100 || error "create sub files failed"
1942 createmany -d $DIR/$tdir/s 100 || error "create sub dirs failed"
1944 echo "Migrate $DIR/$tdir to MDT1"
# The migration runs in the background while failures are injected.
1945 $LFS migrate -m 1 $DIR/$tdir &
1949 # fail sub transactions on random MDTs, which may cause some file
1951 #define OBD_FAIL_OUT_EIO 0x1709
# MDS facets are numbered from 1 (mds1 .. mds$MDSCOUNT), i.e. MDT
# index + 1, so the loop runs 1..MDSCOUNT; starting at 0 would address
# a non-existent "mds0" and never reach the last MDS.
1952 for ((i = 1; i <= $MDSCOUNT; i++)); do
1953 do_facet mds$i $LCTL set_param fail_loc=0x1709
1955 do_facet mds$i $LCTL set_param fail_loc=0
1960 # LFSCK can't fully fix migrating directories, and may leave some
1961 # files inaccessible, but it shouldn't cause crash
1962 $START_NAMESPACE -A -r ||
1963 error "Fail to start LFSCK for namespace"
1965 wait_all_targets_blocked namespace completed 1
1967 # resume migration may fail because some file may be inaccessible, but
1968 # it shouldn't cause crash
1969 $LFS migrate -m 1 $DIR/$tdir
1971 # rm $tdir to avoid cleanup failure in the end
1973 $LFS rm_entry $DIR/$tdir/*
1975 REFORMAT="yes" cleanup_and_setup_lustre
1977 run_test 15d "LFSCK don't crash upon dir migration failure"
# test_16: layout LFSCK repairs an OST-object whose owner differs from
# the MDT-object's owner.
# Changes the owner of f0 while fail_loc 0x1613 is set on the MDS, runs
# layout LFSCK and expects repaired_inconsistent_owner to be exactly 1.
# Skipped unless the MDS version is newer than 2.5.55.
1980 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1981 skip "MDS older than 2.5.55, LU-3594"
1984 echo "If the OST-object's owner information does not match the owner"
1985 echo "information stored in the MDT-object, then the LFSCK trust the"
1986 echo "MDT-object and update the OST-object's owner information."
1989 check_mount_and_prep
1990 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1991 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1992 cancel_lru_locks osc
1994 # created but no setattr or write to the file.
1996 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
1997 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
1999 echo "Inject failure stub to skip OST-object owner changing"
2000 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
2001 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
# Use the "owner:group" form, consistent with the chown above; the
# "owner.group" form is obsolescent.
2002 chown 1:1 $DIR/$tdir/f0
2003 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2005 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
2008 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
2010 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2011 mdd.${MDT_DEV}.lfsck_layout |
2012 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2014 error "(2) unexpected status"
# Exactly one repair is expected: only f0's owner was changed under
# the failure injection.
2017 local repaired=$($SHOW_LAYOUT |
2018 awk '/^repaired_inconsistent_owner/ { print $2 }')
2019 [ $repaired -eq 1 ] ||
2020 error "(3) Fail to repair inconsistent owner: $repaired"
2022 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# test_17: layout LFSCK repairs OST-objects referenced by more than one
# MDT-object.
# Creates guard, f0 and a PFL file f1 while fail_loc 0x1614 is set, checks
# that f0 and f1 both show guard's 1 MiB size, runs layout LFSCK and
# expects repaired_multiple_referenced == 2. Afterwards writing 2 MiB to
# f0 or f1 must leave guard at 1 MiB.
# Skipped unless the MDS version is newer than 2.5.55.
2025 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2026 skip "MDS older than 2.5.55, LU-3594"
2029 echo "If more than one MDT-objects reference the same OST-object,"
2030 echo "and the OST-object only recognizes one MDT-object, then the"
2031 echo "LFSCK should create new OST-objects for such non-recognized"
2035 check_mount_and_prep
2036 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2038 echo "Inject failure stub to make two MDT-objects to refernce"
2039 echo "the OST-object"
2041 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
2042 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# Client mdc and osc locks are cancelled after each creation step.
2043 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
2044 cancel_lru_locks mdc
2045 cancel_lru_locks osc
2047 createmany -o $DIR/$tdir/f 1
2048 cancel_lru_locks mdc
2049 cancel_lru_locks osc
2051 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
2053 error "(0) Fail to create PFL $DIR/$tdir/f1"
2054 cancel_lru_locks mdc
2055 cancel_lru_locks osc
2056 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
2058 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
2059 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
# Nothing was written to f0 or f1, yet both must report 1048576 bytes
# (the amount written to guard).
2060 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
2061 [ $size -eq 1048576 ] ||
2062 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
2064 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
2065 [ $size -eq 1048576 ] ||
2066 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
2068 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
2071 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
2073 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2074 mdd.${MDT_DEV}.lfsck_layout |
2075 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2077 error "(3) unexpected status"
# Two repairs are expected, matching the two files f0 and f1.
2080 local repaired=$($SHOW_LAYOUT |
2081 awk '/^repaired_multiple_referenced/ { print $2 }')
2082 [ $repaired -eq 2 ] ||
2083 error "(4) Fail to repair multiple references: $repaired"
2085 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
# Writing 2 MiB to f0 must not change guard's size.
2086 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
2087 error "(5) Fail to write f0."
2088 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2089 [ $size -eq 1048576 ] ||
2090 error "(6) guard size should be 1048576, but got $size"
2092 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
# Same check after writing 2 MiB to f1.
2093 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
2094 error "(7) Fail to write f1."
2095 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2096 [ $size -eq 1048576 ] ||
2097 error "(8) guard size should be 1048576, but got $size"
2099 run_test 17 "LFSCK can repair multiple references"
# Add "cache" to the local debug setting (the "+" form presumably appends to
# the existing mask - confirm); command output is discarded.
2101 $LCTL set_param debug=+cache > /dev/null
# Body of test 18a (registered by the run_test call at the end): the
# MDT-objects survive but their layout EA is lost; layout LFSCK with orphan
# handling must regenerate the layout so the file sizes become correct again.
# NOTE(review): the function header/closing brace and short lines such as
# "fi"/"done" are not visible in this excerpt - confirm against the full script.

# Skip unless the MDS version is newer than 2.5.55.
2104 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2105 skip "MDS older than 2.5.55, LU-3336"
2108 echo "The target MDT-object is there, but related stripe information"
2109 echo "is lost or partly lost. The LFSCK should regenerate the missing"
2110 echo "layout EA entries."
# f1: directory a1 on MDT index 0, single stripe on OST index 0, 2 MiB written.
2113 check_mount_and_prep
2114 $LFS mkdir -i 0 $DIR/$tdir/a1
2115 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2116 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# Size is field 6 here because "ls -i" prepends the inode number.
2118 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2120 $LFS path2fid $DIR/$tdir/a1/f1
2121 $LFS getstripe $DIR/$tdir/a1/f1
# f2 (only with two or more MDTs): directory a2 on MDT index 1, two stripes
# starting at OST index 1 with 1M stripe size, 2 MiB written.
2123 if [ $MDSCOUNT -ge 2 ]; then
2124 $LFS mkdir -i 1 $DIR/$tdir/a2
2125 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2126 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2127 $LFS path2fid $DIR/$tdir/a2/f2
2128 $LFS getstripe $DIR/$tdir/a2/f2
# f3: two-component PFL file directly under $DIR/$tdir, 2 MiB written.
2131 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2132 error "(0) Fail to create PFL $DIR/$tdir/f3"
2134 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2136 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2138 $LFS path2fid $DIR/$tdir/f3
2139 $LFS getstripe $DIR/$tdir/f3
2141 cancel_lru_locks osc
2143 echo "Inject failure, to make the MDT-object lost its layout EA"
# With fail_loc 0x1615 armed on the MDS, a chown is issued on each file;
# presumably the chown is what triggers the injected layout-EA loss.
2144 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2145 do_facet mds1 $LCTL set_param fail_loc=0x1615
2146 chown 1.1 $DIR/$tdir/a1/f1
2148 if [ $MDSCOUNT -ge 2 ]; then
2149 do_facet mds2 $LCTL set_param fail_loc=0x1615
2150 chown 1.1 $DIR/$tdir/a2/f2
2153 chown 1.1 $DIR/$tdir/f3
# Disarm the failure injection on every MDS it was set on.
2158 do_facet mds1 $LCTL set_param fail_loc=0
2159 if [ $MDSCOUNT -ge 2 ]; then
2160 do_facet mds2 $LCTL set_param fail_loc=0
2163 cancel_lru_locks mdc
2164 cancel_lru_locks osc
# Sanity check of the injection: every file must now report a size that
# differs from the one saved before the layout EA was dropped.
2166 echo "The file size should be incorrect since layout EA is lost"
2167 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2168 [ "$cur_size" != "$saved_size1" ] ||
2169 error "(1) Expect incorrect file1 size"
# f2 is compared with saved_size1: f1 and f2 were both written with
# bs=1M count=2 above.
2171 if [ $MDSCOUNT -ge 2 ]; then
2172 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2173 [ "$cur_size" != "$saved_size1" ] ||
2174 error "(2) Expect incorrect file2 size"
2177 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2178 [ "$cur_size" != "$saved_size2" ] ||
2179 error "(1.2) Expect incorrect file3 size"
# Start layout LFSCK with -r -o (per the message: on all devices, looking
# for orphan OST-objects).
2181 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2182 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
# Wait (bounded by $LTIME) for each MDT to report status "completed".
2184 for k in $(seq $MDSCOUNT); do
2185 # The LFSCK status query interval is 30 seconds. For the case
2186 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2187 # time to guarantee the status sync up.
2188 wait_update_facet mds${k} "$LCTL get_param -n \
2189 mdd.$(facet_svc mds${k}).lfsck_layout |
2190 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2191 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well (single read, no waiting).
2194 for k in $(seq $OSTCOUNT); do
2195 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2196 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2197 awk '/^status/ { print $2 }')
2198 [ "$cur_status" == "completed" ] ||
2199 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counters: 3 on mds1 and, when present, 2 on mds2.
2202 local repaired=$(do_facet mds1 $LCTL get_param -n \
2203 mdd.$(facet_svc mds1).lfsck_layout |
2204 awk '/^repaired_orphan/ { print $2 }')
2205 [ $repaired -eq 3 ] ||
2206 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2208 if [ $MDSCOUNT -ge 2 ]; then
2209 repaired=$(do_facet mds2 $LCTL get_param -n \
2210 mdd.$(facet_svc mds2).lfsck_layout |
2211 awk '/^repaired_orphan/ { print $2 }')
2212 [ $repaired -eq 2 ] ||
2213 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Log the FID and layout of every file after the repair.
2216 $LFS path2fid $DIR/$tdir/a1/f1
2217 $LFS getstripe $DIR/$tdir/a1/f1
2219 if [ $MDSCOUNT -ge 2 ]; then
2220 $LFS path2fid $DIR/$tdir/a2/f2
2221 $LFS getstripe $DIR/$tdir/a2/f2
2224 $LFS path2fid $DIR/$tdir/f3
2225 $LFS getstripe $DIR/$tdir/f3
# The sizes must match the values saved before the injection again.
2227 echo "The file size should be correct after layout LFSCK scanning"
2228 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2229 [ "$cur_size" == "$saved_size1" ] ||
2230 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2232 if [ $MDSCOUNT -ge 2 ]; then
2233 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2234 [ "$cur_size" == "$saved_size1" ] ||
2235 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is on f3, so the message names file3 (it used to say file1).
2238 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2239 [ "$cur_size" == "$saved_size2" ] ||
2240 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2242 run_test 18a "Find out orphan OST-object and repair it (1)"
# Body of test 18b (registered by the run_test call at the end): the
# MDT-objects are lost; layout LFSCK must re-create them under
# .lustre/lost+found/MDTxxxx, from where they are moved back by hand.
# NOTE(review): the function header/closing brace and short lines such as
# "fi"/"done" are not visible in this excerpt - confirm against the full script.

# Skip when FILESET is set, or when the MDS is not newer than 2.5.55.
2245 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2246 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2247 skip "MDS older than 2.5.55, LU-3336"
2250 echo "The target MDT-object is lost. The LFSCK should re-create the"
2251 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2252 echo "can move it back to normal namespace manually."
# f1: directory a1 on MDT index 0, single stripe on OST index 0, 2 MiB
# written. Its size (field 6 of "ls -il") and FID are saved for later.
2255 check_mount_and_prep
2256 $LFS mkdir -i 0 $DIR/$tdir/a1
2257 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2258 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2259 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2260 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2262 $LFS getstripe $DIR/$tdir/a1/f1
# f2 (only with two or more MDTs): directory a2 on MDT index 1, two stripes.
# fid2 is declared local so it does not leak into the caller's scope.
2264 if [ $MDSCOUNT -ge 2 ]; then
2265 $LFS mkdir -i 1 $DIR/$tdir/a2
2266 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2267 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2268 local fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2270 $LFS getstripe $DIR/$tdir/a2/f2
# f3: two-component PFL file directly under $DIR/$tdir, 2 MiB written.
2273 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2274 error "(0) Fail to create PFL $DIR/$tdir/f3"
2276 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2278 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2279 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2281 $LFS getstripe $DIR/$tdir/f3
2283 cancel_lru_locks osc
2285 echo "Inject failure, to simulate the case of missing the MDT-object"
# With fail_loc 0x1616 armed on the MDS, the files are removed.
# NOTE(review): a removal of f3 is not visible in this excerpt - confirm.
2286 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2287 do_facet mds1 $LCTL set_param fail_loc=0x1616
2288 rm -f $DIR/$tdir/a1/f1
2290 if [ $MDSCOUNT -ge 2 ]; then
2291 do_facet mds2 $LCTL set_param fail_loc=0x1616
2292 rm -f $DIR/$tdir/a2/f2
# Disarm the failure injection on every MDS it was set on.
2300 do_facet mds1 $LCTL set_param fail_loc=0
2301 if [ $MDSCOUNT -ge 2 ]; then
2302 do_facet mds2 $LCTL set_param fail_loc=0
2305 cancel_lru_locks mdc
2306 cancel_lru_locks osc
# First pass: dry-run. Orphans must be counted but nothing may be created.
2308 # dryrun mode only checks for orphans, it does not repair them
2309 echo "Trigger layout LFSCK --dryrun to find out orphan OST-object"
2310 $START_LAYOUT --dryrun -o -r ||
2311 error "Fail to start layout LFSCK in dryrun mode"
2312 wait_all_targets_blocked layout completed 2
# The "param" line of lfsck_layout must reflect the options just passed.
2314 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
2315 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
2316 error "Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
2318 local orphans=$(do_facet mds1 $LCTL get_param -n \
2319 mdd.$(facet_svc mds1).lfsck_layout |
2320 awk '/^inconsistent_orphan/ { print $2 }')
2321 [ $orphans -eq 3 ] ||
2322 error "Expect 3 found on mds1, but got: $orphans"
# Every directory under lost+found must still be empty after the dry-run.
2324 # orphan parents should not be created
2326 for subdir in $MOUNT/.lustre/lost+found/*; do
2327 [ ! "$(ls -A $subdir)" ] || error "$subdir not empty"
# Second pass: real repair run with -r -o.
2330 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2331 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait (bounded by $LTIME) for each MDT to report status "completed".
2333 for k in $(seq $MDSCOUNT); do
2334 # The LFSCK status query interval is 30 seconds. For the case
2335 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2336 # time to guarantee the status sync up.
2337 wait_update_facet mds${k} "$LCTL get_param -n \
2338 mdd.$(facet_svc mds${k}).lfsck_layout |
2339 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2340 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well (single read, no waiting).
2343 for k in $(seq $OSTCOUNT); do
2344 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2345 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2346 awk '/^status/ { print $2 }')
2347 [ "$cur_status" == "completed" ] ||
2348 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counters: 3 on mds1 and, when present, 2 on mds2.
2351 local repaired=$(do_facet mds1 $LCTL get_param -n \
2352 mdd.$(facet_svc mds1).lfsck_layout |
2353 awk '/^repaired_orphan/ { print $2 }')
2354 [ $repaired -eq 3 ] ||
2355 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2357 if [ $MDSCOUNT -ge 2 ]; then
2358 repaired=$(do_facet mds2 $LCTL get_param -n \
2359 mdd.$(facet_svc mds2).lfsck_layout |
2360 awk '/^repaired_orphan/ { print $2 }')
2361 [ $repaired -eq 2 ] ||
2362 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# The re-created objects are expected under lost+found/MDTxxxx with the name
# "<original FID>-R-0"; move each one back to its original path.
2365 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2366 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2367 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2369 if [ $MDSCOUNT -ge 2 ]; then
2370 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2371 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Tagged (5.1) so a failure here can be told apart from the f1 move above.
2374 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2375 error "(5.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
# Log the FID and layout of every file after the move.
2377 $LFS path2fid $DIR/$tdir/a1/f1
2378 $LFS getstripe $DIR/$tdir/a1/f1
2380 if [ $MDSCOUNT -ge 2 ]; then
2381 $LFS path2fid $DIR/$tdir/a2/f2
2382 $LFS getstripe $DIR/$tdir/a2/f2
2385 $LFS path2fid $DIR/$tdir/f3
2386 $LFS getstripe $DIR/$tdir/f3
# The sizes must match the values saved before the injection.
2388 echo "The file size should be correct after layout LFSCK scanning"
2389 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2390 [ "$cur_size" == "$saved_size1" ] ||
2391 error "(7) Expect file1 size $saved_size1, but got $cur_size"
# f2 is compared with saved_size1: f1 and f2 were both written with
# bs=1M count=2 above.
2393 if [ $MDSCOUNT -ge 2 ]; then
2394 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2395 [ "$cur_size" == "$saved_size1" ] ||
2396 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is on f3, so the message names file3 (it used to say file1).
2399 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2400 [ "$cur_size" == "$saved_size2" ] ||
2401 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2403 run_test 18b "Find out orphan OST-object and repair it (2)"
# Body of test 18c (registered by the run_test call at the end): the
# MDT-objects are lost and the OST-objects carry no parent FID; layout LFSCK
# must re-create MDT-objects with new FIDs under .lustre/lost+found/MDTxxxx.
# NOTE(review): the function header/closing brace and short lines such as
# "fi"/"done" are not visible in this excerpt - confirm against the full script.

# Skip when FILESET is set, or when the MDS is not newer than 2.5.55.
2406 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2407 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2408 skip "MDS older than 2.5.55, LU-3336"
2411 echo "The target MDT-object is lost, and the OST-object FID is missing."
2412 echo "The LFSCK should re-create the MDT-object with new FID under the "
2413 echo "directory .lustre/lost+found/MDTxxxx."
2416 check_mount_and_prep
2417 $LFS mkdir -i 0 $DIR/$tdir/a1
2418 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
# Arm fail_loc 0x1617 on all OST nodes before any data is written, so the
# files below are created while it is active.
2420 echo "Inject failure, to simulate the case of missing parent FID"
2421 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2422 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
# f1: in a1 (MDT index 0), single stripe on OST index 0, 2 MiB written.
2424 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2425 $LFS getstripe $DIR/$tdir/a1/f1
# f2 (only with two or more MDTs): directory a2 on MDT index 1, but its
# single stripe is also placed on OST index 0.
2427 if [ $MDSCOUNT -ge 2 ]; then
2428 $LFS mkdir -i 1 $DIR/$tdir/a2
2429 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2430 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2431 $LFS getstripe $DIR/$tdir/a2/f2
# f3: two-component PFL file directly under $DIR/$tdir, 2 MiB written.
2434 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2435 error "(0) Fail to create PFL $DIR/$tdir/f3"
2437 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2438 $LFS getstripe $DIR/$tdir/f3
# Drop osc locks, then disarm the OST-side failure injection.
2440 cancel_lru_locks osc
2441 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# With fail_loc 0x1616 armed on the MDS, the files are removed.
# NOTE(review): a removal of f3 is not visible in this excerpt - confirm.
2443 echo "Inject failure, to simulate the case of missing the MDT-object"
2444 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2445 do_facet mds1 $LCTL set_param fail_loc=0x1616
2446 rm -f $DIR/$tdir/a1/f1
2448 if [ $MDSCOUNT -ge 2 ]; then
2449 do_facet mds2 $LCTL set_param fail_loc=0x1616
2450 rm -f $DIR/$tdir/a2/f2
# Disarm the failure injection on every MDS it was set on.
2458 do_facet mds1 $LCTL set_param fail_loc=0
2459 if [ $MDSCOUNT -ge 2 ]; then
2460 do_facet mds2 $LCTL set_param fail_loc=0
2463 cancel_lru_locks mdc
2464 cancel_lru_locks osc
# Start layout LFSCK with -r -o (per the message: on all devices, looking
# for orphan OST-objects).
2466 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2467 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait (bounded by $LTIME) for each MDT to report status "completed".
2469 for k in $(seq $MDSCOUNT); do
2470 # The LFSCK status query interval is 30 seconds. For the case
2471 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2472 # time to guarantee the status sync up.
2473 wait_update_facet mds${k} "$LCTL get_param -n \
2474 mdd.$(facet_svc mds${k}).lfsck_layout |
2475 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2476 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well (single read, no waiting).
2479 for k in $(seq $OSTCOUNT); do
2480 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2481 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2482 awk '/^status/ { print $2 }')
2483 [ "$cur_status" == "completed" ] ||
2484 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): $expected is assigned in lines that are not visible in this
# excerpt (presumably it depends on MDSCOUNT) - confirm.
2487 if [ $MDSCOUNT -ge 2 ]; then
2493 local repaired=$(do_facet mds1 $LCTL get_param -n \
2494 mdd.$(facet_svc mds1).lfsck_layout |
2495 awk '/^repaired_orphan/ { print $2 }')
2496 [ $repaired -eq $expected ] ||
2497 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# mds2 must not have repaired any orphan.
2499 if [ $MDSCOUNT -ge 2 ]; then
2500 repaired=$(do_facet mds2 $LCTL get_param -n \
2501 mdd.$(facet_svc mds2).lfsck_layout |
2502 awk '/^repaired_orphan/ { print $2 }')
2503 [ $repaired -eq 0 ] ||
2504 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2507 ls -ail $MOUNT/.lustre/lost+found/
# The -name pattern is quoted so the shell hands it to find unchanged
# instead of expanding it against files in the current directory.
# cname is declared local so it does not leak into the caller's scope.
2509 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2510 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2511 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
2513 error "(6) .lustre/lost+found/MDT0001/ should be empty"
# MDT0000 must exist and contain at least one "*-N-*" entry.
2516 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2517 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2518 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2520 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
2521 [ ! -z "$cname" ] ||
2522 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2524 run_test 18c "Find out orphan OST-object and repair it (3)"
# Body of test 18d (registered by the run_test call at the end): the layout
# EA of f2/f4 is corrupted while their original OST-objects survive as
# orphans; layout LFSCK must re-attach the orphans rather than create new
# OST-objects.
# NOTE(review): the function header/closing brace and short lines such as
# "fi"/"done"/"stopall" are not visible in this excerpt - confirm against the
# full script.

# Skip unless the MDS version is newer than 2.5.55.
2527 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2528 skip "MDS older than 2.5.55, LU-3336"
2531 echo "The target MDT-object layout EA is corrupted, but the right"
2532 echo "OST-object is still alive as orphan. The layout LFSCK will"
2533 echo "not create new OST-object to occupy such slot."
2536 check_mount_and_prep
# NOTE(review): the creation of $DIR/$tdir/a1 is not visible in this excerpt.
# f1/f3 are "guard" files, f2 is a plain file and f4 a PFL file holding "foo".
2538 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2539 echo "guard" > $DIR/$tdir/a1/f1
2540 echo "foo" > $DIR/$tdir/a1/f2
2542 echo "guard" > $DIR/$tdir/a1/f3
2543 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2544 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2545 echo "foo" > $DIR/$tdir/a1/f4
# Save the sizes of f2 and f4 (field 6 of "ls -il") for the final check.
2547 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2548 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Log FID and layout of all four files before the injection.
2549 $LFS path2fid $DIR/$tdir/a1/f1
2550 $LFS getstripe $DIR/$tdir/a1/f1
2551 $LFS path2fid $DIR/$tdir/a1/f2
2552 $LFS getstripe $DIR/$tdir/a1/f2
2553 $LFS path2fid $DIR/$tdir/a1/f3
2554 $LFS getstripe $DIR/$tdir/a1/f3
2555 $LFS path2fid $DIR/$tdir/a1/f4
2556 $LFS getstripe $DIR/$tdir/a1/f4
2557 cancel_lru_locks osc
2559 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2560 echo "to reference the same OST-object (which is f1's OST-obejct)."
2561 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2562 echo "dangling reference case, but f2's old OST-object is there."
2564 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2565 echo "to reference the same OST-object (which is f3's OST-obejct)."
2566 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2567 echo "dangling reference case, but f4's old OST-object is there."
# With fail_loc 0x1618 armed: chown f2/f4 (presumably the trigger for the
# stripe change described above), then remove the guard files f1/f3.
2570 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2571 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2572 chown 1.1 $DIR/$tdir/a1/f2
2573 chown 1.1 $DIR/$tdir/a1/f4
2574 rm -f $DIR/$tdir/a1/f1
2575 rm -f $DIR/$tdir/a1/f3
# Disarm the failure injection.
2578 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): the stop step announced by this message is not visible in
# this excerpt; only the setupall that follows it is.
2580 echo "stopall to cleanup object cache"
2583 setupall > /dev/null
# Start layout LFSCK with -r -o -c -d (see the lctl lfsck_start documentation
# for the exact meaning of -c and -d).
2585 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2586 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
# Wait (bounded by $LTIME) for each MDT to report status "completed".
2588 for k in $(seq $MDSCOUNT); do
2589 # The LFSCK status query interval is 30 seconds. For the case
2590 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2591 # time to guarantee the status sync up.
2592 wait_update_facet mds${k} "$LCTL get_param -n \
2593 mdd.$(facet_svc mds${k}).lfsck_layout |
2594 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2595 error "(3) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well (single read, no waiting).
2598 for k in $(seq $OSTCOUNT); do
2599 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2600 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2601 awk '/^status/ { print $2 }')
2602 [ "$cur_status" == "completed" ] ||
2603 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Two orphans must have been repaired ...
2606 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2607 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2608 awk '/^repaired_orphan/ { print $2 }')
2609 [ $repaired -eq 2 ] ||
2610 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
# ... and no dangling reference may have been "repaired".
2612 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2613 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2614 awk '/^repaired_dangling/ { print $2 }')
2615 [ $repaired -eq 0 ] ||
2616 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
# f2 and f4 must report their originally saved sizes again.
2618 echo "The file size should be correct after layout LFSCK scanning"
2619 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2620 [ "$cur_size" == "$saved_size1" ] ||
2621 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2623 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2624 [ "$cur_size" == "$saved_size2" ] ||
2625 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# Dump content, FID and layout of f2/f4 to the log (not asserted on).
2627 echo "The LFSCK should find back the original data."
2628 cat $DIR/$tdir/a1/f2
2629 $LFS path2fid $DIR/$tdir/a1/f2
2630 $LFS getstripe $DIR/$tdir/a1/f2
2631 cat $DIR/$tdir/a1/f4
2632 $LFS path2fid $DIR/$tdir/a1/f4
2633 $LFS getstripe $DIR/$tdir/a1/f4
2635 run_test 18d "Find out orphan OST-object and repair it (4)"
# Body of test 18e (registered by the run_test call at the end): while layout
# LFSCK is repairing dangling references, the newly created OST-objects of
# f2/f4 are modified; the old orphan OST-objects must then be attached to new
# stub files under .lustre/lost+found/MDT0000 instead of replacing the data.
# NOTE(review): the function header/closing brace and short lines such as
# "fi"/"done"/"stopall"/"sync" are not visible in this excerpt - confirm
# against the full script.

# Skip when FILESET is set, or when the MDS is not newer than 2.5.55.
2638 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2639 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2640 skip "MDS older than 2.5.55, LU-3336"
2643 echo "The target MDT-object layout EA slot is occpuied by some new"
2644 echo "created OST-object when repair dangling reference case. Such"
2645 echo "conflict OST-object has been modified by others. To keep the"
2646 echo "new data, the LFSCK will create a new file to refernece this"
2647 echo "old orphan OST-object."
2650 check_mount_and_prep
# NOTE(review): the creation of $DIR/$tdir/a1 is not visible in this excerpt.
# f1/f3 are "guard" files, f2 is a plain file and f4 a PFL file holding "foo".
2652 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2653 echo "guard" > $DIR/$tdir/a1/f1
2654 echo "foo" > $DIR/$tdir/a1/f2
2656 echo "guard" > $DIR/$tdir/a1/f3
2657 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2658 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2659 echo "foo" > $DIR/$tdir/a1/f4
# Save the sizes of f2 and f4 (field 6 of "ls -il") for the stub checks.
2661 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2662 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Log FID and layout of all four files before the injection.
2664 $LFS path2fid $DIR/$tdir/a1/f1
2665 $LFS getstripe $DIR/$tdir/a1/f1
2666 $LFS path2fid $DIR/$tdir/a1/f2
2667 $LFS getstripe $DIR/$tdir/a1/f2
2668 $LFS path2fid $DIR/$tdir/a1/f3
2669 $LFS getstripe $DIR/$tdir/a1/f3
2670 $LFS path2fid $DIR/$tdir/a1/f4
2671 $LFS getstripe $DIR/$tdir/a1/f4
2672 cancel_lru_locks osc
2674 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2675 echo "to reference the same OST-object (which is f1's OST-obejct)."
2676 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2677 echo "dangling reference case, but f2's old OST-object is there."
2679 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2680 echo "to reference the same OST-object (which is f3's OST-obejct)."
2681 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2682 echo "dangling reference case, but f4's old OST-object is there."
# With fail_loc 0x1618 armed: chown f2/f4 (presumably the trigger for the
# stripe change described above), then remove the guard files f1/f3.
2685 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2686 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2687 chown 1.1 $DIR/$tdir/a1/f2
2688 chown 1.1 $DIR/$tdir/a1/f4
2689 rm -f $DIR/$tdir/a1/f1
2690 rm -f $DIR/$tdir/a1/f3
# Disarm the failure injection.
2693 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): the stop step announced by this message is not visible in
# this excerpt; only the setupall that follows it is.
2695 echo "stopall to cleanup object cache"
2698 setupall > /dev/null
# Arm fail_loc 0x1602 with fail_val=10 so the test can write to f2/f4 while
# LFSCK is still in its second scanning phase (see the wait below).
2700 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2701 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2703 start_full_debug_logging
2705 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2706 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait (bounded by $LTIME) until mds1 reports status "scanning-phase2".
2708 wait_update_facet mds1 "$LCTL get_param -n \
2709 mdd.$(facet_svc mds1).lfsck_layout |
2710 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2711 error "(3) MDS1 is not the expected 'scanning-phase2'"
# NOTE(review): the statements this comment refers to are not visible in
# this excerpt.
2713 # to guarantee all updates are synced.
# Append to f2/f4 while LFSCK is still running.
2717 echo "Write new data to f2/f4 to modify the new created OST-object."
2718 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2719 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
# Disarm the delay so LFSCK can finish.
2721 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Wait (bounded by $LTIME) for each MDT to report status "completed".
2723 for k in $(seq $MDSCOUNT); do
2724 # The LFSCK status query interval is 30 seconds. For the case
2725 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2726 # time to guarantee the status sync up.
2727 wait_update_facet mds${k} "$LCTL get_param -n \
2728 mdd.$(facet_svc mds${k}).lfsck_layout |
2729 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2730 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well (single read, no waiting).
2733 for k in $(seq $OSTCOUNT); do
2734 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2735 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2736 awk '/^status/ { print $2 }')
2737 [ "$cur_status" == "completed" ] ||
2738 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2741 stop_full_debug_logging
# Two orphans must have been repaired.
2743 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2744 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2745 awk '/^repaired_orphan/ { print $2 }')
2746 [ $repaired -eq 2 ] ||
2747 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
# lost+found/MDT0000 must exist and hold exactly two "*-C-*" stub entries.
2749 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2750 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2751 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2753 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2754 if [ $count -ne 2 ]; then
2755 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2756 error "(8) Expect 2 stubs under lost+found, but got $count"
# Check the first stub found: its size must equal one of the saved sizes.
# The -name pattern is quoted so the shell hands it to find unchanged
# instead of expanding it against files in the current directory.
# cname is declared local so it does not leak into the caller's scope.
2759 echo "The stub file should keep the original f2 or f4 data"
2760 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | head -n 1)
2761 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2762 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2763 error "(9) Got unexpected $cur_size"
2766 $LFS path2fid $cname
2767 $LFS getstripe $cname
# Same check for the last stub found.
2769 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | tail -n 1)
2770 cur_size=$(ls -il $cname | awk '{ print $6 }')
2771 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2772 error "(10) Got unexpected $cur_size"
2775 $LFS path2fid $cname
2776 $LFS getstripe $cname
# Dump content, FID and layout of f2/f4 to the log (not asserted on).
2778 echo "The f2/f4 should contains new data."
2779 cat $DIR/$tdir/a1/f2
2780 $LFS path2fid $DIR/$tdir/a1/f2
2781 $LFS getstripe $DIR/$tdir/a1/f2
2782 cat $DIR/$tdir/a1/f4
2783 $LFS path2fid $DIR/$tdir/a1/f4
2784 $LFS getstripe $DIR/$tdir/a1/f4
2786 run_test 18e "Find out orphan OST-object and repair it (5)"
# Body of test 18f (registered by the run_test call at the end): when one OST
# fails to handle the LFSCK request during first-stage scanning, orphan
# handling must be skipped for that OST only; a second, undisturbed run then
# repairs the remaining orphans.
# NOTE(review): the function header/closing brace and short lines such as
# "fi"/"done" are not visible in this excerpt - confirm against the full script.

# At least two OSTs are required.
2789 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2792 echo "The target MDT-object is lost. The LFSCK should re-create the"
2793 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2794 echo "to verify some OST-object(s) during the first stage-scanning,"
2795 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2796 echo "should not be affected."
# On MDT index 0: a1 (single stripe, OST index 0) holds guard and f1;
# a2 (two stripes starting at OST index 0) holds f2. All files are 2 MiB.
2799 check_mount_and_prep
2800 $LFS mkdir -i 0 $DIR/$tdir/a1
2801 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2802 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2803 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2804 $LFS mkdir -i 0 $DIR/$tdir/a2
2805 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2806 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2807 $LFS getstripe $DIR/$tdir/a1/f1
2808 $LFS getstripe $DIR/$tdir/a2/f2
# With two or more MDTs the same arrangement is created on MDT index 1
# (a3 with guard and f3, a4 with f4).
2810 if [ $MDSCOUNT -ge 2 ]; then
2811 $LFS mkdir -i 1 $DIR/$tdir/a3
2812 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2813 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2814 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2815 $LFS mkdir -i 1 $DIR/$tdir/a4
2816 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2817 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2818 $LFS getstripe $DIR/$tdir/a3/f3
2819 $LFS getstripe $DIR/$tdir/a4/f4
2822 cancel_lru_locks osc
# With fail_loc 0x1616 armed on the MDS, remove f1..f4 (guards stay).
2824 echo "Inject failure, to simulate the case of missing the MDT-object"
2825 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2826 do_facet mds1 $LCTL set_param fail_loc=0x1616
2827 rm -f $DIR/$tdir/a1/f1
2828 rm -f $DIR/$tdir/a2/f2
2830 if [ $MDSCOUNT -ge 2 ]; then
2831 do_facet mds2 $LCTL set_param fail_loc=0x1616
2832 rm -f $DIR/$tdir/a3/f3
2833 rm -f $DIR/$tdir/a4/f4
# Disarm the failure injection on every MDS it was set on.
2839 do_facet mds1 $LCTL set_param fail_loc=0
2840 if [ $MDSCOUNT -ge 2 ]; then
2841 do_facet mds2 $LCTL set_param fail_loc=0
2844 cancel_lru_locks mdc
2845 cancel_lru_locks osc
# Arm fail_loc 0x161c with fail_val=0 on mds1 for the first LFSCK run.
2847 echo "Inject failure, to simulate the OST0 fail to handle"
2848 echo "MDT0 LFSCK request during the first-stage scanning."
2849 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2850 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2852 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2853 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# First run: every MDT and ost1 must end up "partial", ost2 "completed".
2855 for k in $(seq $MDSCOUNT); do
2856 # The LFSCK status query interval is 30 seconds. For the case
2857 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2858 # time to guarantee the status sync up.
2859 wait_update_facet mds${k} "$LCTL get_param -n \
2860 mdd.$(facet_svc mds${k}).lfsck_layout |
2861 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2862 error "(2) MDS${k} is not the expected 'partial'"
2865 wait_update_facet ost1 "$LCTL get_param -n \
2866 obdfilter.$(facet_svc ost1).lfsck_layout |
2867 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2868 error "(3) OST1 is not the expected 'partial'"
2871 wait_update_facet ost2 "$LCTL get_param -n \
2872 obdfilter.$(facet_svc ost2).lfsck_layout |
2873 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2874 error "(4) OST2 is not the expected 'completed'"
# Disarm the failure injection.
2877 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the partial run exactly one orphan per MDT must have been repaired.
2879 local repaired=$(do_facet mds1 $LCTL get_param -n \
2880 mdd.$(facet_svc mds1).lfsck_layout |
2881 awk '/^repaired_orphan/ { print $2 }')
2882 [ $repaired -eq 1 ] ||
2883 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2885 if [ $MDSCOUNT -ge 2 ]; then
2886 repaired=$(do_facet mds2 $LCTL get_param -n \
2887 mdd.$(facet_svc mds2).lfsck_layout |
2888 awk '/^repaired_orphan/ { print $2 }')
2889 [ $repaired -eq 1 ] ||
2890 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second run without failure injection: everything must complete.
2893 echo "Trigger layout LFSCK on all devices again to cleanup"
2894 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2896 for k in $(seq $MDSCOUNT); do
2897 # The LFSCK status query interval is 30 seconds. For the case
2898 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2899 # time to guarantee the status sync up.
2900 wait_update_facet mds${k} "$LCTL get_param -n \
2901 mdd.$(facet_svc mds${k}).lfsck_layout |
2902 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2903 error "(8) MDS${k} is not the expected 'completed'"
# cur_status is declared local (as in the sibling tests) so it does not leak
# into the caller's scope.
2906 for k in $(seq $OSTCOUNT); do
2907 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2908 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2909 awk '/^status/ { print $2 }')
2910 [ "$cur_status" == "completed" ] ||
2911 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# After the second run two orphans per MDT must be reported as repaired.
2915 local repaired=$(do_facet mds1 $LCTL get_param -n \
2916 mdd.$(facet_svc mds1).lfsck_layout |
2917 awk '/^repaired_orphan/ { print $2 }')
2918 [ $repaired -eq 2 ] ||
2919 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2921 if [ $MDSCOUNT -ge 2 ]; then
2922 repaired=$(do_facet mds2 $LCTL get_param -n \
2923 mdd.$(facet_svc mds2).lfsck_layout |
2924 awk '/^repaired_orphan/ { print $2 }')
2925 [ $repaired -eq 2 ] ||
2926 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2929 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Body of test 18g (registered by the run_test call at the end): the
# MDT-object is lost but a stale OI mapping remains; layout LFSCK must still
# re-create the MDT-object under .lustre/lost+found/MDT0000.
# NOTE(review): the function header/closing brace and short lines such as
# "done" are not visible in this excerpt - confirm against the full script.

# Skip when FILESET is set.
2932 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2935 echo "The target MDT-object is lost, but related OI mapping is there"
2936 echo "The LFSCK should recreate the lost MDT-object without affected"
2937 echo "by the stale OI mapping."
# f1: directory a1 on MDT index 0, "-c -1" stripe count starting at OST
# index 0 with 1M stripe size; $OSTCOUNT MiB are written.
2940 check_mount_and_prep
2941 $LFS mkdir -i 0 $DIR/$tdir/a1
2942 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2943 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
# Remember the FID; it is part of the expected lost+found entry name below.
2944 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2946 $LFS getstripe $DIR/$tdir/a1/f1
2947 cancel_lru_locks osc
# With fail_loc 0x162e armed on mds1, remove f1.
2949 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2950 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2951 do_facet mds1 $LCTL set_param fail_loc=0x162e
2952 rm -f $DIR/$tdir/a1/f1
# Disarm the failure injection and drop client locks.
2954 do_facet mds1 $LCTL set_param fail_loc=0
2955 cancel_lru_locks mdc
2956 cancel_lru_locks osc
# Start layout LFSCK with -r -o (per the message: on all devices, looking
# for orphan OST-objects).
2958 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2959 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait (bounded by $LTIME) for each MDT to report status "completed".
2961 for k in $(seq $MDSCOUNT); do
2962 # The LFSCK status query interval is 30 seconds. For the case
2963 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2964 # time to guarantee the status sync up.
2965 wait_update_facet mds${k} "$LCTL get_param -n \
2966 mdd.$(facet_svc mds${k}).lfsck_layout |
2967 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2968 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well (single read, no waiting).
2971 for k in $(seq $OSTCOUNT); do
2972 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2973 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2974 awk '/^status/ { print $2 }')
2975 [ "$cur_status" == "completed" ] ||
2976 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# The number of repaired orphans on mds1 must equal $OSTCOUNT.
2979 local repaired=$(do_facet mds1 $LCTL get_param -n \
2980 mdd.$(facet_svc mds1).lfsck_layout |
2981 awk '/^repaired_orphan/ { print $2 }')
2982 [ $repaired -eq $OSTCOUNT ] ||
2983 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
# The re-created object is expected as "<original FID>-R-0"; move it back.
2985 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2986 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2987 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
# Log FID and layout of the restored file.
2989 $LFS path2fid $DIR/$tdir/a1/f1
2990 $LFS getstripe $DIR/$tdir/a1/f1
2992 run_test 18g "Find out orphan OST-object and repair it (7)"
# test_18h: damage the extent range of a PFL file through fail_loc 0x162f,
# then check that layout LFSCK repairs it and the file data is unchanged.
test_18h() {
	local k
	local cur_status
	local repaired

	echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
	echo "the layout LFSCK will keep the bad PFL file(s) there without"
	echo "scanning its OST-object(s). Then in the second stage scanning,"
	echo "the OST will return related OST-object(s) to the MDT as orphan."
	echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
	echo "the 'orphan(s)' stripe information."

	check_mount_and_prep

	$LFS setstripe -E 2M -S 1M -c 1 -E -1 "$DIR/$tdir/f0" ||
		error "(0) Fail to create PFL $DIR/$tdir/f0"

	# Write into both components: at offset 0 and again at offset 2M.
	cat "$LUSTRE/tests/test-framework.sh" > "$DIR/$tdir/f0" ||
		error "(1.1) Fail to write $DIR/$tdir/f0"

	dd if="$LUSTRE/tests/test-framework.sh" of="$DIR/$tdir/f0" \
		bs=1M seek=2 ||
		error "(1.2) Fail to write $DIR/$tdir/f0"

	# Reference copy, compared against f0 once LFSCK has finished.
	cp "$DIR/$tdir/f0" "$DIR/$tdir/guard"

	echo "Inject failure stub to simulate bad PFL extent range"
	#define OBD_FAIL_LFSCK_BAD_PFL_RANGE	0x162f
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f

	# Attribute change issued while the fail_loc is armed.
	# OWNER:GROUP form; the '.' separator is obsolescent.
	chown 1:1 "$DIR/$tdir/f0"

	cancel_lru_locks mdc
	cancel_lru_locks osc
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	dd if=/dev/zero of="$DIR/$tdir/f0" bs=1M count=1 &&
		error "(2) Write to bad PFL file should fail"

	echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
	$START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(4.1) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
	done

	repaired=$($SHOW_LAYOUT |
		   awk '/^repaired_orphan/ { print $2 }')
	[ "$repaired" -eq 2 ] ||
		error "(5) Fail to repair crashed PFL range: $repaired"

	echo "Data in $DIR/$tdir/f0 should not be broken"
	diff "$DIR/$tdir/f0" "$DIR/$tdir/guard" ||
		error "(6) Data in $DIR/$tdir/f0 is broken"

	echo "Write should succeed after LFSCK repairing the bad PFL range"
	dd if=/dev/zero of="$DIR/$tdir/f0" bs=1M count=1 ||
		error "(7) Write should succeed after LFSCK"
}
run_test 18h "LFSCK can repair crashed PFL extent range"
# Take "cache" out of the local debug setting before the tests that follow;
# the command's own output is not wanted in the test log.
$LCTL set_param debug=-cache >/dev/null
# test_19a: with lfsck_verify_pfid enabled on OST0000 and fail_loc 0x1619
# set locally, reads of both a plain file and a PFL file must be refused.
test_19a() {
	if ! (( $MDS1_VERSION > $(version_code 2.5.55) )); then
		skip "MDS older than 2.5.55, LU-3951"
	fi

	check_mount_and_prep
	$LFS setstripe -c 1 -i 0 $DIR/$tdir

	# Files are created while parent FID verification is switched off.
	do_nodes $(comma_list $(osts_nodes)) \
		$LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0

	echo "foo1" > $DIR/$tdir/a0
	if ! $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1
	then
		error "(0) Fail to create PFL $DIR/$tdir/a1"
	fi
	echo "foo2" > $DIR/$tdir/a1
	echo "guard" > $DIR/$tdir/a2
	cancel_lru_locks osc

	echo "Inject failure, then client will offer wrong parent FID when read"
	do_nodes $(comma_list $(osts_nodes)) \
		$LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1

	#define OBD_FAIL_LFSCK_INVALID_PFID	0x1619
	$LCTL set_param fail_loc=0x1619

	echo "Read RPC with wrong parent FID should be denied"
	if cat $DIR/$tdir/a0; then
		error "(3.1) Read a0 should be denied!"
	fi
	if cat $DIR/$tdir/a1; then
		error "(3.2) Read a1 should be denied!"
	fi
	$LCTL set_param fail_loc=0
}
run_test 19a "OST-object inconsistency self detect"
# test_19b: create two files while fail_loc 0x1611 is set on the OST nodes,
# then check the "repaired" counter of lfsck_verify_pfid on OST0000: it must
# stay 0 while verification is off and reach 2 after both files are read
# with verification on.
test_19b() {
	(( $MDS1_VERSION > $(version_code 2.5.55) )) ||
		skip "MDS older than 2.5.55, LU-3951"

	local verify_pfid="obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid"
	local repaired

	check_mount_and_prep
	$LFS setstripe -c 1 -i 0 "$DIR/$tdir"

	echo "Inject failure stub to make the OST-object to back point to"
	echo "non-exist MDT-object"

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		$verify_pfid 0

	#define OBD_FAIL_LFSCK_UNMATCHED_PAIR1	0x1611
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
	echo "foo1" > "$DIR/$tdir/f0"
	$LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K "$DIR/$tdir/f1" ||
		error "(0) Fail to create PFL $DIR/$tdir/f1"
	echo "foo2" > "$DIR/$tdir/f1"
	cancel_lru_locks osc
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0

	do_facet ost1 $LCTL set_param -n $verify_pfid 0
	echo "Nothing should be fixed since self detect and repair is disabled"
	repaired=$(do_facet ost1 $LCTL get_param -n $verify_pfid |
		   awk '/^repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(1) Expected 0 repaired, but got $repaired"

	echo "Read RPC with right parent FID should be accepted,"
	echo "and cause parent FID on OST to be fixed"

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		$verify_pfid 1

	cat "$DIR/$tdir/f0" || error "(2.1) Read f0 should not be denied!"
	cat "$DIR/$tdir/f1" || error "(2.2) Read f1 should not be denied!"

	# Two files were read (f0 and f1), so two repairs are expected;
	# the message now states the same number the check uses.
	repaired=$(do_facet ost1 $LCTL get_param -n $verify_pfid |
		   awk '/^repaired/ { print $2 }')
	[ "$repaired" -eq 2 ] ||
		error "(3) Expected 2 repaired, but got $repaired"
}
run_test 19b "OST-object inconsistency self repair"
# Strings the tests compare against the output of "lfs getstripe -L":
# one for a layout carrying a LOV EA hole, one for a layout without it.
PATTERN_WITH_HOLE=40000001
PATTERN_WITHOUT_HOLE=raid0
# test_20a: lose the MDT-object of several striped files (and, for all but
# the first, one of their OST-objects too), run layout LFSCK, and check the
# files re-created under .lustre/lost+found/MDT0000: layout pattern, stripe
# count, size, and which reads/writes succeed or fail.
test_20a() {
	[ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"
	(( $MDS1_VERSION > $(version_code 2.5.55) )) ||
		skip "MDS older than 2.5.55, LU-4887"

	local k
	local fid3

	echo "The target MDT-object and some of its OST-object are lost."
	echo "The LFSCK should find out the left OST-objects and re-create"
	echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
	echo "with the partial OST-objects (LOV EA hole)."

	echo "New client can access the file with LOV EA hole via normal"
	echo "system tools or commands without crash the system."

	echo "For old client, even though it cannot access the file with"
	echo "LOV EA hole, it should not cause the system crash."

	check_mount_and_prep
	$LFS mkdir -i 0 "$DIR/$tdir/a1"

	# Three stripes when more than 2 OSTs exist, otherwise two.
	local nstripes=2
	if [ $OSTCOUNT -gt 2 ]; then
		nstripes=3
	fi
	$LFS setstripe -c $nstripes -i 0 -S 1M "$DIR/$tdir/a1"

	# 256 blocks on the stripe0.
	# 1 block on the stripe1 for 2 OSTs case.
	# 256 blocks on the stripe1 for other cases.
	# 1 block on the stripe2 if OSTs > 2
	# i.e. every stripe but the last is full (1M / 4096 = 256 blocks)
	# and the last one holds a single block: 257 or 513 blocks in total.
	local bcount=$((256 * (nstripes - 1) + 1))

	dd if=/dev/zero of="$DIR/$tdir/a1/f0" bs=4096 count=$bcount
	dd if=/dev/zero of="$DIR/$tdir/a1/f1" bs=4096 count=$bcount
	dd if=/dev/zero of="$DIR/$tdir/a1/f2" bs=4096 count=$bcount

	local fid0=$($LFS path2fid "$DIR/$tdir/a1/f0")
	local fid1=$($LFS path2fid "$DIR/$tdir/a1/f1")
	local fid2=$($LFS path2fid "$DIR/$tdir/a1/f2")

	$LFS getstripe "$DIR/$tdir/a1/f0"
	$LFS getstripe "$DIR/$tdir/a1/f1"
	$LFS getstripe "$DIR/$tdir/a1/f2"

	if [ $OSTCOUNT -gt 2 ]; then
		dd if=/dev/zero of="$DIR/$tdir/a1/f3" bs=4096 count=$bcount
		fid3=$($LFS path2fid "$DIR/$tdir/a1/f3")
		$LFS getstripe "$DIR/$tdir/a1/f3"
	fi

	cancel_lru_locks osc

	echo "Inject failure..."
	echo "To simulate f0 lost MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616
	rm -f "$DIR/$tdir/a1/f0"

	echo "To simulate f1 lost MDT-object and OST-object0"
	#define OBD_FAIL_LFSCK_LOST_SPEOBJ	0x161a
	do_facet mds1 $LCTL set_param fail_loc=0x161a
	rm -f "$DIR/$tdir/a1/f1"

	echo "To simulate f2 lost MDT-object and OST-object1"
	do_facet mds1 $LCTL set_param fail_val=1
	rm -f "$DIR/$tdir/a1/f2"

	if [ $OSTCOUNT -gt 2 ]; then
		echo "To simulate f3 lost MDT-object and OST-object2"
		do_facet mds1 $LCTL set_param fail_val=2
		rm -f "$DIR/$tdir/a1/f3"
	fi

	umount_client $MOUNT

	do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(2) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(3) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	if [ $OSTCOUNT -gt 2 ]; then
		[ "$repaired" -eq 9 ] ||
		error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
	else
		[ "$repaired" -eq 4 ] ||
		error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
	fi

	mount_client $MOUNT || error "(5.0) Fail to start client!"

	# Not read anywhere in this function; kept as a global assignment.
	LOV_PATTERN_F_HOLE=0x40000000

	#
	# ${fid0}-R-0 is the old f0
	#
	local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
	echo "Check $name, which is the old f0"

	$LFS getstripe -v "$name" || error "(5.1) cannot getstripe on $name"

	local pattern=$($LFS getstripe -L "$name")
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(5.2) NOT expect pattern flag hole, but got $pattern"

	local stripes=$($LFS getstripe -c "$name")
	if [ $OSTCOUNT -gt 2 ]; then
		[ "$stripes" -eq 3 ] ||
		error "(5.3.1) expect the stripe count is 3, but got $stripes"
	else
		[ "$stripes" -eq 2 ] ||
		error "(5.3.2) expect the stripe count is 2, but got $stripes"
	fi

	local size=$(stat "$name" | awk '/Size:/ { print $2 }')
	[ "$size" -eq $((4096 * $bcount)) ] ||
		error "(5.4) expect the size $((4096 * $bcount)), but got $size"

	cat "$name" > /dev/null || error "(5.5) cannot read $name"

	echo "dummy" >> "$name" || error "(5.6) cannot write $name"

	chown $RUNAS_ID:$RUNAS_GID "$name" ||
		error "(5.7) cannot chown on $name"

	touch "$name" || error "(5.8) cannot touch $name"

	rm -f "$name" || error "(5.9) cannot unlink $name"

	#
	# ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
	if [ $OSTCOUNT -gt 2 ]; then
		echo "Check $name, it contains the old f1's stripe1 and stripe2"
	else
		echo "Check $name, it contains the old f1's stripe1"
	fi

	$LFS getstripe -v "$name" || error "(6.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L "$name")
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(6.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c "$name")
	if [ $OSTCOUNT -gt 2 ]; then
		[ "$stripes" -eq 3 ] ||
		error "(6.3.1) expect the stripe count is 3, but got $stripes"
	else
		[ "$stripes" -eq 2 ] ||
		error "(6.3.2) expect the stripe count is 2, but got $stripes"
	fi

	size=$(stat "$name" | awk '/Size:/ { print $2 }')
	[ "$size" -eq $((4096 * $bcount)) ] ||
		error "(6.4) expect the size $((4096 * $bcount)), but got $size"

	cat "$name" > /dev/null && error "(6.5) normal read $name should fail"

	# conv=noerror lets dd carry on past failed blocks; count them.
	local failures=$(dd if="$name" of="$DIR/$tdir/dump" \
			 conv=sync,noerror bs=4096 2>&1 |
			 grep -c "Input/output error")

	[ "$failures" -eq 256 ] ||
		error "(6.6) expect 256 IO failures, but get $failures"

	size=$(stat "$DIR/$tdir/dump" | awk '/Size:/ { print $2 }')
	[ "$size" -eq $((4096 * $bcount)) ] ||
		error "(6.7) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of="$name" conv=sync,notrunc bs=4096 count=1 &&
		error "(6.8) write to the LOV EA hole should fail"

	dd if=/dev/zero of="$name" conv=sync,notrunc bs=4096 count=1 \
		seek=300 ||
		error "(6.9) write to normal stripe should NOT fail"

	echo "foo" >> "$name" && error "(6.10) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID "$name" ||
		error "(6.11) cannot chown on $name"

	touch "$name" || error "(6.12) cannot touch $name"

	rm -f "$name" || error "(6.13) cannot unlink $name"

	#
	# ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
	if [ $OSTCOUNT -gt 2 ]; then
		echo "Check $name, it contains the old f2's stripe0 and stripe2"
	else
		echo "Check $name, it contains the old f2's stripe0"
	fi

	$LFS getstripe -v "$name" || error "(7.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L "$name")
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(7.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c "$name")
	size=$(stat "$name" | awk '/Size:/ { print $2 }')
	if [ $OSTCOUNT -gt 2 ]; then
		[ "$stripes" -eq 3 ] ||
		error "(7.3.1) expect the stripe count is 3, but got $stripes"

		[ "$size" -eq $((4096 * $bcount)) ] ||
		error "(7.4.1) expect size $((4096 * $bcount)), but got $size"

		cat "$name" > /dev/null &&
			error "(7.5.1) normal read $name should fail"

		failures=$(dd if="$name" of="$DIR/$tdir/dump" \
			   conv=sync,noerror bs=4096 2>&1 |
			   grep -c "Input/output error")

		[ "$failures" -eq 256 ] ||
		error "(7.6) expect 256 IO failures, but get $failures"

		size=$(stat "$DIR/$tdir/dump" | awk '/Size:/ { print $2 }')
		[ "$size" -eq $((4096 * $bcount)) ] ||
		error "(7.7) expect the size $((4096 * $bcount)), but got $size"

		dd if=/dev/zero of="$name" conv=sync,notrunc bs=4096 count=1 \
		seek=300 && error "(7.8.0) write to the LOV EA hole should fail"

		dd if=/dev/zero of="$name" conv=sync,notrunc bs=4096 count=1 ||
		error "(7.8.1) write to normal stripe should NOT fail"

		echo "foo" >> "$name" &&
			error "(7.8.3) append write $name should fail"

		chown $RUNAS_ID:$RUNAS_GID "$name" ||
			error "(7.9.1) cannot chown on $name"

		touch "$name" || error "(7.10.1) cannot touch $name"
	else
		[ "$stripes" -eq 2 ] ||
		error "(7.3.2) expect the stripe count is 2, but got $stripes"

		# Only the 256 blocks of stripe0 are left in the 2-OST case.
		[ "$size" -eq $((4096 * (256 + 0))) ] ||
		error "(7.4.2) expect the size $((4096 * 256)), but got $size"

		cat "$name" > /dev/null &&
			error "(7.5.2) normal read $name should fail"

		failures=$(dd if="$name" of="$DIR/$tdir/dump" \
			   conv=sync,noerror bs=4096 2>&1 |
			   grep -c "Input/output error")
		[ "$failures" -eq 256 ] ||
		error "(7.6.2) expect 256 IO failures, but get $failures"

		size=$(stat "$DIR/$tdir/dump" | awk '/Size:/ { print $2 }')
		[ "$size" -eq $((4096 * $bcount)) ] ||
		error "(7.7.2) expect the size $((4096 * $bcount)), got $size"

		dd if=/dev/zero of="$name" conv=sync,notrunc bs=4096 count=1 \
		seek=256 && error "(7.8.2) write to the LOV EA hole should fail"

		chown $RUNAS_ID:$RUNAS_GID "$name" ||
			error "(7.9.2) cannot chown on $name"

		touch "$name" || error "(7.10.2) cannot touch $name"
	fi

	rm -f "$name" || error "(7.11) cannot unlink $name"

	# f3 only exists when there are more than 2 OSTs.
	[ $OSTCOUNT -le 2 ] && return

	#
	# ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
	echo "Check $name, which contains the old f3's stripe0 and stripe1"

	$LFS getstripe -v "$name" || error "(8.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L "$name")
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c "$name")
	[ "$stripes" -eq 3 ] ||
		error "(8.3) expect the stripe count is 3, but got $stripes"

	size=$(stat "$name" | awk '/Size:/ { print $2 }')

	[ "$size" -eq $((4096 * (256 + 256 + 0))) ] ||
		error "(8.4) expect the size $((4096 * 512)), but got $size"

	cat "$name" > /dev/null &&
		error "(8.5) normal read $name should fail"

	failures=$(dd if="$name" of="$DIR/$tdir/dump" \
		   conv=sync,noerror bs=4096 2>&1 |
		   grep -c "Input/output error")

	[ "$failures" -eq 256 ] ||
		error "(8.6) expect 256 IO failures, but get $failures"

	size=$(stat "$DIR/$tdir/dump" | awk '/Size:/ { print $2 }')
	[ "$size" -eq $((4096 * $bcount)) ] ||
		error "(8.7) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of="$name" conv=sync,notrunc bs=4096 count=1 \
		seek=512 && error "(8.8) write to the LOV EA hole should fail"

	chown $RUNAS_ID:$RUNAS_GID "$name" ||
		error "(8.9) cannot chown on $name"

	touch "$name" || error "(8.10) cannot touch $name"

	rm -f "$name" || error "(8.11) cannot unlink $name"
}
run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# test_20b: PFL variant of the LOV EA hole test. Three two-component files
# lose their MDT-object (f1 and f2 also lose one OST-object per component);
# after layout LFSCK the files under .lustre/lost+found/MDT0000 are checked
# for pattern, stripe count, component extents, size and I/O behaviour.
test_20b() {
	[ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"
	(( $MDS1_VERSION > $(version_code 2.5.55) )) ||
		skip "MDS older than 2.5.55, LU-4887"

	local k

	echo "The target MDT-object and some of its OST-object are lost."
	echo "The LFSCK should find out the left OST-objects and re-create"
	echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
	echo "with the partial OST-objects (LOV EA hole)."

	echo "New client can access the file with LOV EA hole via normal"
	echo "system tools or commands without crash the system - PFL case."

	check_mount_and_prep

	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 "$DIR/$tdir/f0" ||
		error "(0) Fail to create PFL file $DIR/$tdir/f0"
	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 "$DIR/$tdir/f1" ||
		error "(1) Fail to create PFL file $DIR/$tdir/f1"
	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 "$DIR/$tdir/f2" ||
		error "(2) Fail to create PFL file $DIR/$tdir/f2"

	local bcount=$((256 * 3 + 1))

	dd if=/dev/zero of="$DIR/$tdir/f0" bs=4096 count=$bcount
	dd if=/dev/zero of="$DIR/$tdir/f1" bs=4096 count=$bcount
	dd if=/dev/zero of="$DIR/$tdir/f2" bs=4096 count=$bcount

	local fid0=$($LFS path2fid "$DIR/$tdir/f0")
	local fid1=$($LFS path2fid "$DIR/$tdir/f1")
	local fid2=$($LFS path2fid "$DIR/$tdir/f2")

	$LFS getstripe "$DIR/$tdir/f0"
	$LFS getstripe "$DIR/$tdir/f1"
	$LFS getstripe "$DIR/$tdir/f2"

	cancel_lru_locks mdc
	cancel_lru_locks osc

	# Each file is unlinked while the matching fail_loc/fail_val is set;
	# the later checks look for ${fidN}-R-0 under lost+found.
	echo "Inject failure..."
	echo "To simulate f0 lost MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
	rm -f "$DIR/$tdir/f0"

	echo "To simulate the case of f1 lost MDT-object and "
	echo "the first OST-object in each PFL component"
	#define OBD_FAIL_LFSCK_LOST_SPEOBJ	0x161a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
	rm -f "$DIR/$tdir/f1"

	echo "To simulate the case of f2 lost MDT-object and "
	echo "the second OST-object in each PFL component"
	do_facet $SINGLEMDS $LCTL set_param fail_val=1
	rm -f "$DIR/$tdir/f2"

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(4) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(5) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	[ "$repaired" -eq 8 ] ||
		error "(6) Expect 8 fixed on mds1, but got: $repaired"

	#
	# ${fid0}-R-0 is the old f0
	#
	local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
	echo "Check $name, which is the old f0"

	$LFS getstripe -v "$name" || error "(7.1) cannot getstripe on $name"

	local pattern=$($LFS getstripe -L -I1 "$name")
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(7.2.1) NOT expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 "$name")
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(7.2.2) NOT expect pattern flag hole, but got $pattern"

	local stripes=$($LFS getstripe -c -I1 "$name")
	[ "$stripes" -eq 2 ] ||
		error "(7.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 "$name")
	[ "$stripes" -eq 2 ] ||
		error "(7.3.2) expect 2 stripes, but got $stripes"

	local e_start=$($LFS getstripe -I1 "$name" |
			awk '/lcme_extent.e_start:/ { print $2 }')
	[ "$e_start" -eq 0 ] ||
		error "(7.4.1) expect the COMP1 start at 0, got $e_start"

	local e_end=$($LFS getstripe -I1 "$name" |
		      awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" -eq 2097152 ] ||
		error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 "$name" |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ "$e_start" -eq 2097152 ] ||
		error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 "$name" |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"

	local size=$(stat "$name" | awk '/Size:/ { print $2 }')
	[ "$size" -eq $((4096 * $bcount)) ] ||
		error "(7.6) expect the size $((4096 * $bcount)), but got $size"

	cat "$name" > /dev/null || error "(7.7) cannot read $name"

	echo "dummy" >> "$name" || error "(7.8) cannot write $name"

	chown $RUNAS_ID:$RUNAS_GID "$name" ||
		error "(7.9) cannot chown on $name"

	touch "$name" || error "(7.10) cannot touch $name"

	rm -f "$name" || error "(7.11) cannot unlink $name"

	#
	# ${fid1}-R-0 contains the old f1's second stripe in each COMP
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
	echo "Check $name, it contains f1's second OST-object in each COMP"

	$LFS getstripe -v "$name" || error "(8.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L -I1 "$name")
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2.1) expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 "$name")
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c -I1 "$name")
	[ "$stripes" -eq 2 ] ||
		error "(8.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 "$name")
	[ "$stripes" -eq 2 ] ||
		error "(8.3.2) expect 2 stripes, but got $stripes"

	e_start=$($LFS getstripe -I1 "$name" |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ "$e_start" -eq 0 ] ||
		error "(8.4.1) expect the COMP1 start at 0, got $e_start"

	e_end=$($LFS getstripe -I1 "$name" |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" -eq 2097152 ] ||
		error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 "$name" |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ "$e_start" -eq 2097152 ] ||
		error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 "$name" |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"

	size=$(stat "$name" | awk '/Size:/ { print $2 }')
	[ "$size" -eq $((4096 * $bcount)) ] ||
		error "(8.6) expect the size $((4096 * $bcount)), but got $size"

	cat "$name" > /dev/null && error "(8.7) normal read $name should fail"

	# conv=noerror lets dd carry on past failed blocks; count them.
	local failures=$(dd if="$name" of="$DIR/$tdir/dump" \
			 conv=sync,noerror bs=4096 2>&1 |
			 grep -c "Input/output error")

	# The first stripe in each COMP was lost
	[ "$failures" -eq 512 ] ||
		error "(8.8) expect 512 IO failures, but get $failures"

	size=$(stat "$DIR/$tdir/dump" | awk '/Size:/ { print $2 }')
	[ "$size" -eq $((4096 * $bcount)) ] ||
		error "(8.9) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of="$name" conv=sync,notrunc bs=4096 count=1 &&
		error "(8.10) write to the LOV EA hole should fail"

	dd if=/dev/zero of="$name" conv=sync,notrunc bs=4096 count=1 \
		seek=300 ||
		error "(8.11) write to normal stripe should NOT fail"

	echo "foo" >> "$name" && error "(8.12) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID "$name" ||
		error "(8.13) cannot chown on $name"

	touch "$name" || error "(8.14) cannot touch $name"

	rm -f "$name" || error "(8.15) cannot unlink $name"

	#
	# ${fid2}-R-0 contains the old f2's first stripe in each COMP
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
	echo "Check $name, it contains f2's first stripe in each COMP"

	$LFS getstripe -v "$name" || error "(9.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L -I1 "$name")
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(9.2.1) expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 "$name")
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(9.2.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c -I1 "$name")
	[ "$stripes" -eq 2 ] ||
		error "(9.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 "$name")
	[ "$stripes" -eq 2 ] ||
		error "(9.3.2) expect 2 stripes, but got $stripes"

	e_start=$($LFS getstripe -I1 "$name" |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ "$e_start" -eq 0 ] ||
		error "(9.4.1) expect the COMP1 start at 0, got $e_start"

	e_end=$($LFS getstripe -I1 "$name" |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" -eq 2097152 ] ||
		error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 "$name" |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ "$e_start" -eq 2097152 ] ||
		error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 "$name" |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"

	size=$(stat "$name" | awk '/Size:/ { print $2 }')
	# The second stripe in COMP was lost, so we do not know there
	# have ever been some data before. 'stat' will regard it as
	# no data on the lost stripe.
	[ "$size" -eq $((4096 * $bcount)) ] ||
		error "(9.6) expect size $((4096 * $bcount)), but got $size"

	cat "$name" > /dev/null &&
		error "(9.7) normal read $name should fail"

	failures=$(dd if="$name" of="$DIR/$tdir/dump" \
		   conv=sync,noerror bs=4096 2>&1 |
		   grep -c "Input/output error")
	[ "$failures" -eq 512 ] ||
		error "(9.8) expect 512 IO failures, but get $failures"

	size=$(stat "$DIR/$tdir/dump" | awk '/Size:/ { print $2 }')
	# The second stripe in COMP was lost, so we do not know there
	# have ever been some data before. Since 'dd' skip failure,
	# it will regard the lost stripe contains data.
	[ "$size" -eq $((4096 * $bcount)) ] ||
		error "(9.9) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of="$name" conv=sync,notrunc bs=4096 count=1 \
		seek=300 && error "(9.10) write to the LOV EA hole should fail"

	dd if=/dev/zero of="$name" conv=sync,notrunc bs=4096 count=1 ||
		error "(9.11) write to normal stripe should NOT fail"

	echo "foo" >> "$name" &&
		error "(9.12) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID "$name" ||
		error "(9.13) cannot chown on $name"

	touch "$name" || error "(9.14) cannot touch $name"

	rm -f "$name" || error "(9.15) cannot unlink $name"
}
run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# test_21: start LFSCK on MDT0000 without naming a component and confirm
# that both the namespace and the layout component report
# 'scanning-phase1', then stop LFSCK again.
test_21() {
	if ! (( $MDS1_VERSION > $(version_code 2.5.59) )); then
		skip "MDS older than 2.5.59, LU-4887"
	fi

	check_mount_and_prep
	if ! createmany -o $DIR/$tdir/f 100; then
		error "(0) Fail to create 100 files"
	fi

	echo "Start all LFSCK components by default (-s 1)"
	if ! do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r
	then
		error "Fail to start LFSCK"
	fi

	echo "namespace LFSCK should be in 'scanning-phase1' status"
	local phase=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
	if [ "$phase" != "scanning-phase1" ]; then
		error "Expect namespace 'scanning-phase1', but got '$phase'"
	fi

	echo "layout LFSCK should be in 'scanning-phase1' status"
	phase=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
	if [ "$phase" != "scanning-phase1" ]; then
		error "Expect layout 'scanning-phase1', but got '$phase'"
	fi

	echo "Stop all LFSCK components by default"
	do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
		error "Fail to stop LFSCK"
}
run_test 21 "run all LFSCK components by default"
# test_22a: create foo/dummy on MDT0 while fail_loc 0x161e is set, remove
# the "guard" directory, and check that namespace LFSCK reports exactly one
# repaired unmatched pair and that the directory can be listed afterwards.
test_22a() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5511"

	# Inner quotes are escaped so that ".." is printed with its quotes.
	echo "The parent_A references the child directory via some name entry,"
	echo "but the child directory back references another parent_B via its"
	echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
	echo "LFSCK will repair the child directory's \"..\" name entry."

	check_mount_and_prep

	$LFS mkdir -i 1 "$DIR/$tdir/guard" ||
		error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 "$DIR/$tdir/foo" ||
		error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "The dummy's dotdot name entry references the guard."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 "$DIR/$tdir/foo/dummy" ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir "$DIR/$tdir/guard" ||
		error "(4) Fail to rmdir $DIR/$tdir/guard"

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "'ls' should success after namespace LFSCK repairing"
	ls -ail "$DIR/$tdir/foo/dummy" > /dev/null ||
		error "(8) ls should success."
}
run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b: create foo/dummy on MDT0 while fail_loc 0x161e is set, verify
# that fid2path does not resolve to the real path, run namespace LFSCK, and
# verify that one unmatched pair was repaired and fid2path now resolves.
test_22b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5511"

	# Inner quotes are escaped so that ".." is printed with its quotes.
	echo "The parent_A references the child directory via the name entry_B,"
	echo "but the child directory back references another parent_C via its"
	echo "\"..\" name entry. The parent_C exists, but there is no the name"
	echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
	echo "the child directory's \"..\" name entry and its linkEA."

	check_mount_and_prep

	$LFS mkdir -i 1 "$DIR/$tdir/guard" ||
		error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 "$DIR/$tdir/foo" ||
		error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "and bad linkEA. The dummy's dotdot name entry references the"
	echo "guard. The dummy's linkEA references n non-exist name entry."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 "$DIR/$tdir/foo/dummy" ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	local dummyfid=$($LFS path2fid "$DIR/$tdir/foo/dummy")
	echo "fid2path should NOT work on the dummy's FID $dummyfid"
	local dummyname=$($LFS fid2path "$DIR" "$dummyfid")
	[ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
		error "(4) fid2path works unexpectedly."

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
	# dummyname is already local; just look the path up again.
	dummyname=$($LFS fid2path "$DIR" "$dummyfid")
	[ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
		error "(8) fid2path does not work"
}
run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a (registered by the run_test line that ends this block): a name
# entry whose MDT-object no longer exists (dangling entry) is detected by
# the namespace LFSCK and, on a second run started with -C, becomes usable.
# NOTE(review): the enclosing function header and closing brace are not
# shown in this listing; only the statements of the test are visible.
#
# Skipped with fewer than 2 MDTs, or when MDS1_VERSION is not newer than
# 2.6.50.
3928 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3929 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3930 skip "MDS older than 2.6.50, LU-5512"
# Describe the scenario in the test log.
3933 echo "The name entry is there, but the MDT-object for such name "
3934 echo "entry does not exist. The namespace LFSCK should find out "
3935 echo "and repair the inconsistency as required."
3938 check_mount_and_prep
# Parent d0 is created with MDT index 0, child d1 with MDT index 1.
3940 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3941 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3943 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3944 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
# The fail_loc is armed on mds2 only for the duration of the rmdir.
3945 do_facet mds2 $LCTL set_param fail_loc=0x1620
3946 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3947 do_facet mds2 $LCTL set_param fail_loc=0
3949 echo "'ls' should fail because of dangling name entry"
3950 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
# First pass: started without -C; the entry is counted as repaired but
# 'ls' is still expected to fail afterwards.
3952 echo "Trigger namespace LFSCK to find out dangling name entry"
3953 $START_NAMESPACE -A -r ||
3954 error "(5) Fail to start LFSCK for namespace"
3956 wait_all_targets_blocked namespace completed 6
3958 local repaired=$($SHOW_NAMESPACE |
3959 awk '/^dangling_repaired/ { print $2 }')
3960 [ $repaired -eq 1 ] ||
3961 error "(7) Fail to repair dangling name entry: $repaired"
3963 echo "'ls' should fail because not re-create MDT-object by default"
3964 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
# Second pass: started with -C (presumably "re-create the lost MDT-object",
# per the log message above - confirm against lfsck_start documentation).
3966 echo "Trigger namespace LFSCK again to repair dangling name entry"
3967 $START_NAMESPACE -A -r -C ||
3968 error "(9) Fail to start LFSCK for namespace"
3970 wait_all_targets_blocked namespace completed 10
3972 repaired=$($SHOW_NAMESPACE |
3973 awk '/^dangling_repaired/ { print $2 }')
3974 [ $repaired -eq 1 ] ||
3975 error "(11) Fail to repair dangling name entry: $repaired"
# After the second pass the path must be listable again.
3977 echo "'ls' should success after namespace LFSCK repairing"
3978 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3980 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b (registered by the run_test line that ends this block): a hard
# link name entry is made to reference a non-existent object; the namespace
# LFSCK (started with -C) must repair it so that the link again yields the
# data of the original file.
# NOTE(review): the enclosing function header, closing braces and the "fi"
# of the if-statement are not shown in this listing.
#
# Skipped when MDS1_VERSION is not newer than 2.6.50.
3983 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3984 skip "MDS older than 2.6.50, LU-5512"
# Describe the scenario in the test log.
3987 echo "The objectA has multiple hard links, one of them corresponding"
3988 echo "to the name entry_B. But there is something wrong for the name"
3989 echo "entry_B and cause entry_B to references non-exist object_C."
3990 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3991 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3992 echo "comes to the second-stage scanning, it will find that the"
3993 echo "former re-creating object_C is not proper, and will try to"
3994 echo "replace the object_C with the real object_A."
3997 check_mount_and_prep
# The bare path2fid calls below only print FIDs into the test log.
3999 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
4000 $LFS path2fid $DIR/$tdir/d0
# Ten filler files t0..t9; they are unlinked again before f1 is removed.
4002 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
# f0 holds "dummy" (the real object), f1 holds "dead" (removed later).
4004 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
4005 $LFS path2fid $DIR/$tdir/d0/f0
4007 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
4008 $LFS path2fid $DIR/$tdir/d0/f1
# First ':'-separated field of each FID (the sequence part).
4010 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
4011 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
4013 if [ "$SEQ0" != "$SEQ1" ]; then
4014 # To guarantee that the f0 and f1 are in the same FID seq
4015 rm -f $DIR/$tdir/d0/f0 ||
4016 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
4017 echo "dummy" > $DIR/$tdir/d0/f0 ||
4018 error "(3.2) Fail to touch on MDT0"
4019 $LFS path2fid $DIR/$tdir/d0/f0
# Second FID field of f1, normalised to decimal with printf %d before it is
# handed to fail_val.
4022 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
4023 OID=$(printf %d $OID)
4025 echo "Inject failure stub on MDT0 to simulate dangling name entry"
4026 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
# fail_val/fail_loc are armed only while the hard link "foo" is created.
4027 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
4028 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4029 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4031 # If there is creation after the dangling injection, it may re-use
4032 # the just released local object (inode) that is referenced by the
4033 # dangling name entry. It will fail the dangling injection.
4034 # So before deleting the target object for the dangling name entry,
4035 # remove some other objects to avoid the target object being reused
4036 # by some potential creations. LU-7429
4037 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4039 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
4041 echo "'ls' should fail because of dangling name entry"
4042 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4043 error "(6) ls should fail."
4045 echo "Trigger namespace LFSCK to find out dangling name entry"
4046 $START_NAMESPACE -r -C ||
4047 error "(7) Fail to start LFSCK for namespace"
# Poll the namespace LFSCK status on $SINGLEMDS until it reads "completed";
# the trailing 32 is presumably a timeout in seconds - confirm.
4049 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4050 mdd.${MDT_DEV}.lfsck_namespace |
4051 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4053 error "(8) unexpected status"
# One dangling entry and one multiple-linked object must be reported.
4056 local repaired=$($SHOW_NAMESPACE |
4057 awk '/^dangling_repaired/ { print $2 }')
4058 [ $repaired -eq 1 ] ||
4059 error "(9) Fail to repair dangling name entry: $repaired"
4061 repaired=$($SHOW_NAMESPACE |
4062 awk '/^multiple_linked_repaired/ { print $2 }')
4063 [ $repaired -eq 1 ] ||
4064 error "(10) Fail to drop the former created object: $repaired"
# The repaired link must expose the content written to f0.
4066 local data=$(cat $DIR/$tdir/d0/foo)
4067 [ "$data" == "dummy" ] ||
4068 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
4070 run_test 23b "LFSCK can repair dangling name entry (2)"
# Cleanup statements located between tests 23b and 23c.
# NOTE(review): presumably the body of a cleanup helper used by test 23c;
# its function header and closing braces are not shown in this listing.
#
# Clears fail_val/fail_loc on $SINGLEMDS, waits until the namespace LFSCK
# status reads "completed" (32 is presumably a timeout in seconds), reports
# error (10) otherwise, and finally stops full debug logging.
4073 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4074 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4075 mdd.${MDT_DEV}.lfsck_namespace |
4076 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4078 error "(10) unexpected status"
4081 stop_full_debug_logging
# Test 23c (registered by the run_test line that ends this block): like
# 23b, a hard link name entry is made to reference a non-existent object,
# but the re-created object is modified by the client while LFSCK is
# delayed, so LFSCK must NOT replace it with the original file.
# NOTE(review): the enclosing function header, closing braces, "fi" lines
# and a few one-word statements are not shown in this listing.
#
# Skipped when MDS1_VERSION is not newer than 2.6.50.
4085 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4086 skip "MDS older than 2.6.50, LU-5512"
# Describe the scenario in the test log.
4089 echo "The objectA has multiple hard links, one of them corresponding"
4090 echo "to the name entry_B. But there is something wrong for the name"
4091 echo "entry_B and cause entry_B to references non-exist object_C."
4092 echo "In the first-stage scanning, the LFSCK will think the entry_B"
4093 echo "as dangling, and re-create the lost object_C. And then others"
4094 echo "modified the re-created object_C. When the LFSCK comes to the"
4095 echo "second-stage scanning, it will find that the former re-creating"
4096 echo "object_C maybe wrong and try to replace the object_C with the"
4097 echo "real object_A. But because object_C has been modified, so the"
4098 echo "LFSCK cannot replace it."
4101 start_full_debug_logging
4103 check_mount_and_prep
4105 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
4106 parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
4107 echo "parent_fid=$parent_fid"
# Ten filler files t0..t9; they are unlinked again before f1 is removed.
4109 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
# f0 holds "dummy" (the real object), f1 holds "dead" (removed later).
4111 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
4112 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4113 echo "f0_fid=$f0_fid"
4115 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
4116 f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
4117 echo "f1_fid=$f1_fid"
# Compare the sequence part of both FIDs, i.e. everything before the first
# ':'. This uses the variables actually assigned above (f0_fid/f1_fid) and
# a suffix-stripping expansion; the previous expression referenced unset
# names with a regex-like pattern, so both sides were always empty and the
# guard never triggered.
4119 if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
4120 # To guarantee that the f0 and f1 are in the same FID seq
4121 rm -f $DIR/$tdir/d0/f0 ||
4122 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
4123 echo "dummy" > $DIR/$tdir/d0/f0 ||
4124 error "(3.2) Fail to touch on MDT0"
4125 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4126 echo "f0_fid=$f0_fid (replaced)"
# Second ':'-separated field of f1's FID, passed to fail_val below.
4129 local oid=$(awk -F':' '{ printf $2 }' <<< $f1_fid)
4131 echo "Inject failure stub on MDT0 to simulate dangling name entry"
4132 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
# fail_val/fail_loc are armed only while the hard link "foo" is created.
4133 do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
4134 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4135 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4137 # If there is creation after the dangling injection, it may re-use
4138 # the just released local object (inode) that is referenced by the
4139 # dangling name entry. It will fail the dangling injection.
4140 # So before deleting the target object for the dangling name entry,
4141 # remove some other objects to avoid the target object being reused
4142 # by some potential creations. LU-7429
4143 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4145 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
4147 echo "'ls' should fail because of dangling name entry"
4148 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4149 error "(6) ls should fail."
4151 #define OBD_FAIL_LFSCK_DELAY3 0x1602
# Arm the delay fail_loc with fail_val=10 before LFSCK is started.
4152 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
4154 echo "Trigger namespace LFSCK to find out dangling name entry"
4155 $START_NAMESPACE -r -C ||
4156 error "(7) Fail to start LFSCK for namespace"
# Wait until the client sees "foo" with size 0, i.e. the re-created object.
4158 wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
4159 # While unexpected by the test, it is valid for LFSCK to repair
4160 # the link to the original object before any data is written.
4161 local size=$(stat -c %s $DIR/$tdir/d0/foo)
4163 if [ "$size" = "6" -a "$(<$DIR/$tdir/d0/foo)" = "dummy" ]; then
4164 log "LFSCK repaired file prematurely"
4169 stat $DIR/$tdir/d0/foo
4171 error "(8) unexpected size"
# Modify the re-created object so that LFSCK may no longer replace it.
4174 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
4175 cancel_lru_locks osc
# One dangling entry must be reported as repaired ...
4179 local repaired=$($SHOW_NAMESPACE |
4180 awk '/^dangling_repaired/ { print $2 }')
4181 [ $repaired -eq 1 ] ||
4182 error "(11) Fail to repair dangling name entry: $repaired"
# ... but the link must not expose the original "dummy" content.
4184 local data=$(cat $DIR/$tdir/d0/foo)
4185 [ "$data" != "dummy" ] ||
4186 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
4188 run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24 (registered by the run_test line that ends this block): two
# MDT-objects carry a linkEA entry for the same name entry; the namespace
# LFSCK must drop the unrecognised one and place that object as an orphan
# under .lustre/lost+found/MDT0000/.
# NOTE(review): the enclosing function header, closing brace and the
# "else"/"fi" lines of the if-statement are not shown in this listing.
#
# Skipped with fewer than 2 MDTs, when FILESET is set, or when MDS1_VERSION
# is not newer than 2.6.50.
4191 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4192 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4193 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4194 skip "MDS older than 2.6.50, LU-5513"
# Describe the scenario in the test log.
4197 echo "Two MDT-objects back reference the same name entry via their"
4198 echo "each own linkEA entry, but the name entry only references one"
4199 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
4200 echo "for the MDT-object that is not recognized. If such MDT-object"
4201 echo "has no other linkEA entry after the removing, then the LFSCK"
4202 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
4205 check_mount_and_prep
4207 mkdir_on_mdt -i1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
# The bare path2fid calls below only print FIDs into the test log.
4209 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
4210 $LFS path2fid $DIR/$tdir/d0/guard
4212 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
4213 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid is the parent FID later expected in the orphan's name.
# NOTE(review): the two assignments are presumably the two arms of an
# if/else keyed on the MDS backend type - the else line is not visible.
4216 if [ $mds1_FSTYPE != ldiskfs ]; then
4217 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
4219 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
4222 touch $DIR/$tdir/d0/guard/foo ||
4223 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
4225 echo "Inject failure stub on MDT0 to simulate the case that"
4226 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
4227 echo "that references $DIR/$tdir/d0/guard/foo."
4228 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
4229 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
4230 echo "there with the same linkEA entry as another MDT-object"
4231 echo "$DIR/$tdir/d0/guard/foo has"
4233 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
# The fail_loc stays armed across both the mkdir and the rmdir of
# dummy/foo; cfid remembers the FID of the object that is left behind.
4234 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
4235 mkdir_on_mdt -i0 $DIR/$tdir/d0/dummy/foo ||
4236 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
4237 $LFS path2fid $DIR/$tdir/d0/dummy/foo
4238 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
4239 rmdir $DIR/$tdir/d0/dummy/foo ||
4240 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
4241 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4243 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4244 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4245 error "(6) stat successfully unexpectedly"
4247 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4248 $START_NAMESPACE -A -r ||
4249 error "(7) Fail to start LFSCK for namespace"
4251 wait_all_targets_blocked namespace completed 8
# Exactly one multiple-referenced entry must be reported as repaired.
4253 local repaired=$($SHOW_NAMESPACE |
4254 awk '/^multiple_referenced_repaired/ { print $2 }')
4255 [ $repaired -eq 1 ] ||
4256 error "(9) Fail to repair multiple referenced name entry: $repaired"
4258 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4259 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4260 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The orphan is looked up by the name "<child FID>-<parent FID>-D-0".
4262 local cname="$cfid-$pfid-D-0"
4263 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4264 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4266 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25 (registered by the run_test line that ends this block): a name
# entry is created with a wrong file type and the namespace LFSCK must
# report exactly one bad_file_type repair.
# NOTE(review): the enclosing function header and closing braces are not
# shown in this listing.
#
# Skipped unless the MDS backend is ldiskfs, or when MDS1_VERSION is not
# newer than 2.6.50.
4269 [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs fixes dirent type"
4270 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4271 skip "MDS older than 2.6.50, LU-5515"
# Describe the scenario in the test log.
4274 echo "The file type in the name entry does not match the file type"
4275 echo "claimed by the referenced object. Then the LFSCK will update"
4276 echo "the file type in the name entry."
4279 check_mount_and_prep
4281 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4283 echo "Inject failure stub on MDT0 to simulate the case that"
4284 echo "the file type stored in the name entry is wrong."
4286 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
# The fail_loc is armed only while "foo" is created.
4287 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4288 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4289 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4291 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4292 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Poll the namespace LFSCK status on $SINGLEMDS until it reads "completed";
# the trailing 32 is presumably a timeout in seconds - confirm.
4294 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4295 mdd.${MDT_DEV}.lfsck_namespace |
4296 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4298 error "(4) unexpected status"
# Exactly one bad file type must be reported as repaired.
4301 local repaired=$($SHOW_NAMESPACE |
4302 awk '/^bad_file_type_repaired/ { print $2 }')
4303 [ $repaired -eq 1 ] ||
4304 error "(5) Fail to repair bad file type in name entry: $repaired"
# The directory must be listable after the repair.
4306 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4308 run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a (registered by the run_test line that ends this block): the name
# entry of a file is removed while its object and linkEA are kept; the
# namespace LFSCK must restore the entry and the file must keep its FID.
# NOTE(review): the enclosing function header and closing brace are not
# shown in this listing.
#
# Skipped when MDS1_VERSION is not newer than 2.6.50.
4311 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4312 skip "MDS older than 2.6.50, LU-5516"
# Describe the scenario in the test log.
4315 echo "The local name entry back referenced by the MDT-object is lost."
4316 echo "The namespace LFSCK will add the missing local name entry back"
4317 echo "to the normal namespace."
4320 check_mount_and_prep
# foo's FID is remembered so it can be compared after the repair.
4322 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4323 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4324 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
# A second hard link ("dummy") to the same file is created first.
4326 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4327 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4329 echo "Inject failure stub on MDT0 to simulate the case that"
4330 echo "foo's name entry will be removed, but the foo's object"
4331 echo "and its linkEA are kept in the system."
4333 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is armed only for the duration of the unlink.
4334 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4335 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4336 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4338 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4339 error "(5) 'ls' should fail"
# NOTE(review): the message below says "remote" although this test covers
# the local name entry case (see the description and run_test title).
4341 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4342 $START_NAMESPACE -r -A ||
4343 error "(6) Fail to start LFSCK for namespace"
4345 wait_all_targets_blocked namespace completed 7
# Exactly one lost dirent must be reported as repaired.
4347 local repaired=$($SHOW_NAMESPACE |
4348 awk '/^lost_dirent_repaired/ { print $2 }')
4349 [ $repaired -eq 1 ] ||
4350 error "(8) Fail to repair lost dirent: $repaired"
# foo must be visible again and still carry the FID recorded above.
4352 ls -ail $DIR/$tdir/d0/foo ||
4353 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
4355 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4356 [ "$foofid" == "$foofid2" ] ||
4357 error "(10) foo's FID changed: $foofid, $foofid2"
4359 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b (registered by the run_test line that ends this block): the name
# entry of a directory living on another MDT than its parent is removed
# while its object and linkEA are kept; the namespace LFSCK must restore
# the entry and the directory must keep its FID.
# NOTE(review): the enclosing function header and closing brace are not
# shown in this listing.
#
# Skipped with fewer than 2 MDTs, or when MDS1_VERSION is not newer than
# 2.6.50.
4362 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4363 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4364 skip "MDS older than 2.6.50, LU-5516"
# Describe the scenario in the test log.
4367 echo "The remote name entry back referenced by the MDT-object is lost."
4368 echo "The namespace LFSCK will add the missing remote name entry back"
4369 echo "to the normal namespace."
4372 check_mount_and_prep
# Parent d0 uses MDT index 1, child foo uses MDT index 0; foo's FID is
# remembered so it can be compared after the repair.
4374 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4375 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
4376 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4378 echo "Inject failure stub on MDT0 to simulate the case that"
4379 echo "foo's name entry will be removed, but the foo's object"
4380 echo "and its linkEA are kept in the system."
4382 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is armed only for the duration of the rmdir.
4383 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4384 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4385 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4387 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4388 error "(4) 'ls' should fail"
4390 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4391 $START_NAMESPACE -r -A ||
4392 error "(5) Fail to start LFSCK for namespace"
4394 wait_all_targets_blocked namespace completed 6
# Exactly one lost dirent must be reported as repaired.
4396 local repaired=$($SHOW_NAMESPACE |
4397 awk '/^lost_dirent_repaired/ { print $2 }')
4398 [ $repaired -eq 1 ] ||
4399 error "(7) Fail to repair lost dirent: $repaired"
# foo must be visible again and still carry the FID recorded above.
4401 ls -ail $DIR/$tdir/d0/foo ||
4402 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
4404 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4405 [ "$foofid" == "$foofid2" ] ||
4406 error "(9) foo's FID changed: $foofid, $foofid2"
4408 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a (registered by the run_test line that ends this block): a file's
# name entries and its parent directory are removed while the object and
# its linkEA are kept; the namespace LFSCK must make an orphan entry
# matching "*-P-*" appear under .lustre/lost+found/MDT0000/.
# NOTE(review): the enclosing function header and closing brace are not
# shown in this listing.
#
# Skipped when FILESET is set, or when MDS1_VERSION is not newer than
# 2.6.50.
4411 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4412 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4413 skip "MDS older than 2.6.50, LU-5516"
# Describe the scenario in the test log.
4416 echo "The local parent referenced by the MDT-object linkEA is lost."
4417 echo "The namespace LFSCK will re-create the lost parent as orphan."
4420 check_mount_and_prep
# d0/foo plus a second hard link d0/dummy to the same file.
4422 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4423 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4424 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4425 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4427 echo "Inject failure stub on MDT0 to simulate the case that"
4428 echo "foo's name entry will be removed, but the foo's object"
4429 echo "and its linkEA are kept in the system. And then remove"
4430 echo "another hard link and the parent directory."
4432 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc stays armed while both links are unlinked.
4433 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4434 rm -f $DIR/$tdir/d0/foo ||
4435 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4436 rm -f $DIR/$tdir/d0/dummy ||
4437 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4438 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The parent directory itself is removed after the fail_loc is cleared.
4440 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
4441 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
4443 echo "Trigger namespace LFSCK to repair the lost parent"
4444 $START_NAMESPACE -r -A ||
4445 error "(6) Fail to start LFSCK for namespace"
4447 wait_all_targets_blocked namespace completed 7
# Exactly one lost dirent must be reported as repaired.
4449 local repaired=$($SHOW_NAMESPACE |
4450 awk '/^lost_dirent_repaired/ { print $2 }')
4451 [ $repaired -eq 1 ] ||
4452 error "(8) Fail to repair lost dirent: $repaired"
4454 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4455 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4456 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4458 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The -name pattern is quoted so that find receives the glob itself rather
# than whatever it happens to match in the current working directory;
# cname is kept local to the test.
4460 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
4461 [ ! -z "$cname" ] ||
4462 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4464 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b (registered by the run_test line that ends this block): a
# directory's name entry and its parent (on another MDT) are removed while
# the object and its linkEA are kept; the namespace LFSCK must make an
# orphan entry matching "*-P-*" appear under .lustre/lost+found/MDT0001/.
# NOTE(review): the enclosing function header and closing brace are not
# shown in this listing.
#
# Skipped with fewer than 2 MDTs, when FILESET is set, or when MDS1_VERSION
# is not newer than 2.6.50.
4467 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4468 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4469 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4470 skip "MDS older than 2.6.50, LU-5516"
# Describe the scenario in the test log.
4473 echo "The remote parent referenced by the MDT-object linkEA is lost."
4474 echo "The namespace LFSCK will re-create the lost parent as orphan."
4477 check_mount_and_prep
# Parent d0 uses MDT index 1, child foo uses MDT index 0.
4479 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4480 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Print d0's FID into the test log.
4482 $LFS path2fid $DIR/$tdir/d0
4484 echo "Inject failure stub on MDT0 to simulate the case that"
4485 echo "foo's name entry will be removed, but the foo's object"
4486 echo "and its linkEA are kept in the system. And then remove"
4487 echo "the parent directory."
4489 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is armed only for the duration of the rmdir of foo.
4490 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4491 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4492 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The parent directory itself is removed after the fail_loc is cleared.
4494 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4495 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
4497 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4498 $START_NAMESPACE -r -A ||
4499 error "(6) Fail to start LFSCK for namespace"
4501 wait_all_targets_blocked namespace completed 7
# Exactly one lost dirent must be reported as repaired.
4503 local repaired=$($SHOW_NAMESPACE |
4504 awk '/^lost_dirent_repaired/ { print $2 }')
4505 [ $repaired -eq 1 ] ||
4506 error "(8) Fail to repair lost dirent: $repaired"
4508 ls -ail $MOUNT/.lustre/lost+found/
4510 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4511 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4512 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4514 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The -name pattern is quoted so that find receives the glob itself rather
# than whatever it happens to match in the current working directory;
# cname is kept local to the test.
4516 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
4517 [ ! -z "$cname" ] ||
4518 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4520 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28 (registered by the run_test line that ends this block): with one
# lost name entry on each of two MDTs and a network-failure fail_loc set on
# mds2, the first namespace LFSCK run must end "partial" on mds1 and
# "completed" on mds2, repairing only on mds2; a second run without the
# fail_loc must repair the remaining entry on mds1.
# NOTE(review): the enclosing function header, closing braces and the end
# of the introductory message are not shown in this listing.
#
# Skipped with fewer than 2 MDTs, or when MDS1_VERSION is not newer than
# 2.6.50.
4523 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4524 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4525 skip "MDS older than 2.6.50, LU-5506"
# Describe the scenario in the test log.
4528 echo "The target name entry is lost. The LFSCK should insert the"
4529 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4530 echo "the MDT (on which the orphan MDT-object resides) has ever"
4531 echo "failed to respond some name entry verification during the"
4532 echo "first stage-scanning, then the LFSCK should skip to handle"
4533 echo "orphan MDT-object on this MDT. But other MDTs should not"
4537 check_mount_and_prep
# d1 uses MDT index 0 with children on index 1; d2 is the mirror image.
4538 $LFS mkdir -i 0 $DIR/$tdir/d1
4539 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4540 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
4542 $LFS mkdir -i 1 $DIR/$tdir/d2
4543 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4544 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4546 echo "Inject failure stub on MDT0 to simulate the case that"
4547 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4548 echo "and its linkEA are kept in the system. And the case that"
4549 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4550 echo "and its linkEA are kept in the system."
4552 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is armed on both MDS facets while the two rmdirs run.
4553 do_facet mds1 $LCTL set_param fail_loc=0x1624
4554 do_facet mds2 $LCTL set_param fail_loc=0x1624
4555 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4556 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4557 do_facet mds1 $LCTL set_param fail_loc=0
4558 do_facet mds2 $LCTL set_param fail_loc=0
4560 cancel_lru_locks mdc
4561 cancel_lru_locks osc
4563 echo "Inject failure, to simulate the MDT0 fail to handle"
4564 echo "MDT1 LFSCK request during the first-stage scanning."
4565 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
# This fail_loc stays set on mds2 for the whole first LFSCK run.
4566 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4568 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4569 $START_NAMESPACE -r -A ||
4570 error "(3) Fail to start LFSCK for namespace"
# Expected end states of the first run: mds1 "partial", mds2 "completed";
# the trailing 32 is presumably a timeout in seconds - confirm.
4572 wait_update_facet mds1 "$LCTL get_param -n \
4573 mdd.$(facet_svc mds1).lfsck_namespace |
4574 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4575 error "(4) mds1 is not the expected 'partial'"
4578 wait_update_facet mds2 "$LCTL get_param -n \
4579 mdd.$(facet_svc mds2).lfsck_namespace |
4580 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4581 error "(5) mds2 is not the expected 'completed'"
4584 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# First run: nothing repaired on mds1, one lost dirent repaired on mds2.
4586 local repaired=$(do_facet mds1 $LCTL get_param -n \
4587 mdd.$(facet_svc mds1).lfsck_namespace |
4588 awk '/^lost_dirent_repaired/ { print $2 }')
4589 [ $repaired -eq 0 ] ||
4590 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4592 repaired=$(do_facet mds2 $LCTL get_param -n \
4593 mdd.$(facet_svc mds2).lfsck_namespace |
4594 awk '/^lost_dirent_repaired/ { print $2 }')
4595 [ $repaired -eq 1 ] ||
4596 error "(7) Expect 1 fixed on mds2, but got: $repaired"
4598 echo "Trigger namespace LFSCK on all devices again to cleanup"
4599 $START_NAMESPACE -r -A ||
4600 error "(8) Fail to start LFSCK for namespace"
4602 wait_all_targets_blocked namespace completed 9
# Second run: one lost dirent repaired on mds1, nothing left on mds2.
4604 local repaired=$(do_facet mds1 $LCTL get_param -n \
4605 mdd.$(facet_svc mds1).lfsck_namespace |
4606 awk '/^lost_dirent_repaired/ { print $2 }')
4607 [ $repaired -eq 1 ] ||
4608 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4610 repaired=$(do_facet mds2 $LCTL get_param -n \
4611 mdd.$(facet_svc mds2).lfsck_namespace |
4612 awk '/^lost_dirent_repaired/ { print $2 }')
4613 [ $repaired -eq 0 ] ||
4614 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4616 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a body: a file's nlink is made larger than its number of name
# entries (3 instead of 2) and the namespace LFSCK is expected to bring it
# back to 2. The run_test line that follows this block is commented out,
# so these statements are currently not executed as a test.
# NOTE(review): the enclosing function header and closing brace are not
# shown in this listing.
#
# Skipped when MDS1_VERSION is not newer than 2.6.50.
4619 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4620 skip "MDS older than 2.6.50, LU-5517"
# Describe the scenario in the test log.
4623 echo "The object's nlink attribute is larger than the object's known"
4624 echo "name entries count. The LFSCK will repair the object's nlink"
4625 echo "attribute to match the known name entries count"
4628 check_mount_and_prep
4630 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4631 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4633 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4634 echo "nlink attribute is larger than its name entries count."
4636 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
# The fail_loc is armed only while the hard link h1 is created.
4637 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4638 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4639 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4640 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Confirm the injection: stat must report a link count of 3.
4642 cancel_lru_locks mdc
4643 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4644 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4646 echo "Trigger namespace LFSCK to repair the nlink count"
4647 $START_NAMESPACE -r -A ||
4648 error "(5) Fail to start LFSCK for namespace"
4650 wait_all_targets_blocked namespace completed 6
# Exactly one nlink repair must be reported.
4652 local repaired=$($SHOW_NAMESPACE |
4653 awk '/^nlinks_repaired/ { print $2 }')
4654 [ $repaired -eq 1 ] ||
4655 error "(7) Fail to repair nlink count: $repaired"
# After the repair, stat must report a link count of 2.
4657 cancel_lru_locks mdc
4658 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4659 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4661 # Test 29a is disabled: nlink is only allowed to be updated when the
4662 # number of known linkEA entries is larger than the nlink count.
4664 #run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b (registered by the run_test line that ends this block): a file's
# nlink is made smaller than its number of name entries (1 instead of 2)
# and the namespace LFSCK must bring it back to 2.
# NOTE(review): the enclosing function header and closing brace are not
# shown in this listing.
#
# Skipped when MDS1_VERSION is not newer than 2.6.50.
4667 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4668 skip "MDS older than 2.6.50, LU-5517"
# Describe the scenario in the test log.
4671 echo "The object's nlink attribute is smaller than the object's known"
4672 echo "name entries count. The LFSCK will repair the object's nlink"
4673 echo "attribute to match the known name entries count"
4676 check_mount_and_prep
4678 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4679 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4681 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4682 echo "nlink attribute is smaller than its name entries count."
4684 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
# The fail_loc is armed only while the hard link h1 is created.
4685 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4686 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4687 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4688 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Confirm the injection: stat must report a link count of 1.
4690 cancel_lru_locks mdc
4691 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4692 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4694 echo "Trigger namespace LFSCK to repair the nlink count"
4695 $START_NAMESPACE -r -A ||
4696 error "(5) Fail to start LFSCK for namespace"
4698 wait_all_targets_blocked namespace completed 6
# Exactly one nlink repair must be reported.
4700 local repaired=$($SHOW_NAMESPACE |
4701 awk '/^nlinks_repaired/ { print $2 }')
4702 [ $repaired -eq 1 ] ||
4703 error "(7) Fail to repair nlink count: $repaired"
# After the repair, stat must report a link count of 2.
4705 cancel_lru_locks mdc
4706 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4707 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4709 run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c (registered by the run_test line that ends this block): 150 hard
# links are created to overflow a file's linkEA; migration of the file's
# directory must fail while the overflow mark is present (even after 100
# links are removed) and must succeed once the namespace LFSCK has cleared
# the mark.
# NOTE(review): the enclosing function header, closing brace, the "fi"
# lines and the sleep announced by a comment below are not shown in this
# listing.
#
# Skipped when MDS1_VERSION is not newer than 2.6.50.
4713 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4714 skip "MDS older than 2.6.50, LU-5517"
# Describe the scenario in the test log.
4717 echo "The namespace LFSCK will create many hard links to the target"
4718 echo "file as to exceed the linkEA size limitation. Under such case"
4719 echo "the linkEA will be marked as overflow that will prevent the"
4720 echo "target file to be migrated. Then remove some hard links to"
4721 echo "make the left hard links to be held within the linkEA size"
4722 echo "limitation. But before the namespace LFSCK adding all the"
4723 echo "missed linkEA entries back, the overflow mark (timestamp)"
4724 echo "will not be cleared."
4727 check_mount_and_prep
# guard/f0 is the link target; foo (MDT index MDSCOUNT-1) holds the links.
# The FID of f0 is remembered to detect whether a migration happened.
4729 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
4730 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4731 error "(0.2) Fail to mkdir"
4732 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
4733 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4735 # define MAX_LINKEA_SIZE 4096
4736 # sizeof(link_ea_header) = 24
4737 # sizeof(link_ea_entry) = 18
4738 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4739 # (sizeof(link_ea_entry) + name_length))
4740 # If the average name length is 12 bytes, then 150 hard links
4741 # is totally enough to overflow the linkEA
4742 echo "Create 150 hard links should succeed although the linkEA overflow"
4743 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4744 error "(2) Fail to hard link"
4746 cancel_lru_locks mdc
# With at least 2 MDTs: migration must fail and f0 must keep its FID.
4747 if [ $MDSCOUNT -ge 2 ]; then
4748 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4749 error "(3.1) Migrate should fail"
4751 echo "The object with linkEA overflow should NOT be migrated"
4752 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4753 [ "$newfid" == "$oldfid" ] ||
4754 error "(3.2) Migrate should fail: $newfid != $oldfid"
4757 # Remove 100 hard links, then the linkEA should have space
4758 # to hold the missed linkEA entries.
4759 echo "Remove 100 hard links to save space for the missed linkEA entries"
4760 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
# With at least 2 MDTs: migration must still fail after the unlink.
4762 if [ $MDSCOUNT -ge 2 ]; then
4763 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4764 error "(5.1) Migrate should fail"
4766 # The overflow timestamp is still there, so migration will fail.
4767 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4768 [ "$newfid" == "$oldfid" ] ||
4769 error "(5.2) Migrate should fail: $newfid != $oldfid"
4772 # sleep 3 seconds to guarantee that the overflow is recognized
4775 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4776 $START_NAMESPACE -r -A ||
4777 error "(6) Fail to start LFSCK for namespace"
4779 wait_all_targets_blocked namespace completed 7
# One overflow mark must be cleared and no nlink repair may be reported.
4781 local repaired=$($SHOW_NAMESPACE |
4782 awk '/^linkea_overflow_cleared/ { print $2 }')
4783 [ $repaired -eq 1 ] ||
4784 error "(8) Fail to clear linkea overflow: $repaired"
4786 repaired=$($SHOW_NAMESPACE |
4787 awk '/^nlinks_repaired/ { print $2 }')
4788 [ $repaired -eq 0 ] ||
4789 error "(9) Unexpected nlink repaired: $repaired"
# With at least 2 MDTs: migration must now succeed and change f0's FID.
4791 if [ $MDSCOUNT -ge 2 ]; then
4792 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4793 error "(10.1) Migrate failure"
4795 # Migration should succeed after clear the overflow timestamp.
4796 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4797 [ "$newfid" != "$oldfid" ] ||
4798 error "(10.2) Migrate should succeed"
4800 ls -l $DIR/$tdir/foo > /dev/null ||
4801 error "(11) 'ls' failed after migration"
# Remove the link target and the directory holding the remaining links.
4804 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4805 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4807 run_test 29c "verify linkEA size limitation"
# test_30: remove name entries without destroying the objects (fail_loc
# 0x1624), let e2fsck on the MDT device handle the result, then check that
# namespace LFSCK moves the backend orphans back: f0 to its original path
# and d0 (created without linkEA via fail_loc 0x161d) under
# .lustre/lost+found/MDT0000/.
# Skipped unless mds1 is ldiskfs, FILESET is unset and the MDS is newer
# than 2.6.50.
4810 [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs has lost+found"
4811 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4812 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4813 skip "MDS older than 2.6.50, LU-5518"
4816 echo "The namespace LFSCK will move the orphans from backend"
4817 echo "/lost+found directory to normal client visible namespace"
4818 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
4821 check_mount_and_prep
4823 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
4824 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
4826 echo "Inject failure stub on MDT0 to simulate the case that"
4827 echo "directory d0 has no linkEA entry, then the LFSCK will"
4828 echo "move it into .lustre/lost+found/MDTxxxx/ later."
4830 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4831 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4832 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4833 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The parent and child FIDs are used below to build the name expected for
# d0 under .lustre/lost+found/MDT0000/.
4835 local pfid=$($LFS path2fid $DIR/$tdir/foo)
4836 local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)
4838 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4839 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4841 echo "Inject failure stub on MDT0 to simulate the case that the"
4842 echo "object's name entry will be removed, but not destroy the"
4843 echo "object. Then backend e2fsck will handle it as orphan and"
4844 echo "add them into the backend /lost+found directory."
4846 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4847 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4848 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4849 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4850 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4851 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4852 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The client is unmounted and the MDT stopped before e2fsck runs on the
# MDT device.
4854 umount_client $MOUNT || error "(10) Fail to stop client!"
4856 stop $SINGLEMDS || error "(11) Fail to stop $SINGLEMDS"
4858 local dev=$(facet_device $SINGLEMDS)
4860 echo "run e2fsck on $SINGLEMDS"
4861 run_e2fsck $(facet_active_host $SINGLEMDS) $dev "-y" ||
4862 error "(12) Fail to run e2fsck"
4864 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 13
4866 echo "Trigger namespace LFSCK to recover backend orphans"
4867 $START_NAMESPACE -r -A ||
4868 error "(14) Fail to start LFSCK for namespace"
4870 wait_all_targets_blocked namespace completed 15
# Four objects (f0, d0, f1, d1) had their name entries removed above, so
# at least four moves out of the backend lost+found are expected.
4872 local repaired=$($SHOW_NAMESPACE |
4873 awk '/^local_lost_found_moved/ { print $2 }')
4874 [ $repaired -ge 4 ] ||
4875 error "(16) Fail to recover backend orphans: $repaired"
4877 mount_client $MOUNT || error "(17) Fail to start client!"
4879 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
4881 ls -ail $MOUNT/.lustre/lost+found/
4883 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4884 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4885 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4887 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# cname is a constructed path and therefore never empty, so test that the
# directory really exists instead of testing the string for emptiness.
4889 local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
4890 [ -d "$cname" ] || error "(20) d0 is not recovered"
4892 stat ${cname}/d1 || error "(21) d1 is not recovered"
4893 stat ${cname}/f1 || error "(22) f1 is not recovered"
4895 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test_31a: create MDSCOUNT sub-directories in a striped directory while
# fail_loc 0x1628 (fail_val=0) is set on the client, then check that
# namespace LFSCK reports at least one name_hash_repaired and that every
# entry can still be stat'ed and removed after a client remount.
# Skipped with fewer than 2 MDTs or an MDS not newer than 2.6.50.
4898 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4899 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4900 skip "MDS older than 2.6.50, LU-5519"
4903 echo "For the name entry under a striped directory, if the name"
4904 echo "hash does not match the shard, then the LFSCK will repair"
4905 echo "the bad name entry"
4908 check_mount_and_prep
4910 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4911 error "(1) Fail to create striped directory"
4913 echo "Inject failure stub on client to simulate the case that"
4914 echo "some name entry should be inserted into other non-first"
4915 echo "shard, but inserted into the first shard by wrong"
# The fail_loc is set with a local $LCTL call (no do_facet), i.e. on the
# node running this script.
4917 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4918 $LCTL set_param fail_loc=0x1628 fail_val=0
4919 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4920 error "(2) Fail to create file under striped directory"
4921 $LCTL set_param fail_loc=0 fail_val=0
4923 echo "Trigger namespace LFSCK to repair bad name hash"
4924 $START_NAMESPACE -r -A ||
4925 error "(3) Fail to start LFSCK for namespace"
4927 wait_all_targets_blocked namespace completed 4
4929 local repaired=$($SHOW_NAMESPACE |
4930 awk '/^name_hash_repaired/ { print $2 }')
4931 [ $repaired -ge 1 ] ||
4932 error "(5) Fail to repair bad name hash: $repaired"
# rc counts the entries that 'lfs find -H badtype' reports under the
# striped directory.
# NOTE(review): the condition guarding the error below is not visible in
# this chunk - confirm against the full file that it tests $rc.
4934 local rc=$($LFS find -H badtype $DIR/$tdir/striped_dir | wc -l)
4936 error "Fail to find flag bad type: $rc"
# Remount the client before re-checking the entries.
4938 umount_client $MOUNT || error "(6) umount failed"
4939 mount_client $MOUNT || error "(7) mount failed"
4941 for ((i = 0; i < $MDSCOUNT; i++)); do
4942 stat $DIR/$tdir/striped_dir/d$i ||
4943 error "(8) Fail to stat d$i after LFSCK"
4944 rmdir $DIR/$tdir/striped_dir/d$i ||
4945 error "(9) Fail to unlink d$i after LFSCK"
4948 rmdir $DIR/$tdir/striped_dir ||
4949 error "(10) Fail to remove the striped directory after LFSCK"
4951 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test_31b: variant of the bad-name-hash test using fail_val=1 and
# MDSCOUNT * 5 sub-directories; the name_hash_repaired counter is read
# from the lfsck_namespace report of mds2 and must be at least 1.
# Skipped with fewer than 2 MDTs or an MDS not newer than 2.6.50.
4954 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4955 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4956 skip "MDS older than 2.6.50, LU-5519"
4959 echo "For the name entry under a striped directory, if the name"
4960 echo "hash does not match the shard, then the LFSCK will repair"
4961 echo "the bad name entry"
4964 check_mount_and_prep
4966 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4967 error "(1) Fail to create striped directory"
4969 echo "Inject failure stub on client to simulate the case that"
4970 echo "some name entry should be inserted into other non-second"
4971 echo "shard, but inserted into the secod shard by wrong"
# The fail_loc is set with a local $LCTL call (no do_facet), i.e. on the
# node running this script.
4973 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4974 $LCTL set_param fail_loc=0x1628 fail_val=1
4975 createmany -d $DIR/$tdir/striped_dir/d $((MDSCOUNT * 5)) ||
4976 error "(2) Fail to create file under striped directory"
4977 $LCTL set_param fail_loc=0 fail_val=0
4979 echo "Trigger namespace LFSCK to repair bad name hash"
4980 $START_NAMESPACE -r -A ||
4981 error "(3) Fail to start LFSCK for namespace"
4983 wait_all_targets_blocked namespace completed 4
# Unlike test 31a, the repair counter is taken from mds2 rather than
# from $SINGLEMDS.
4985 local repaired=$(do_facet mds2 $LCTL get_param -n \
4986 mdd.$(facet_svc mds2).lfsck_namespace |
4987 awk '/^name_hash_repaired/ { print $2 }')
4988 echo "repaired $repaired name entries with bad hash"
4989 [ $repaired -ge 1 ] ||
4990 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before re-checking the entries.
4992 umount_client $MOUNT || error "(6) umount failed"
4993 mount_client $MOUNT || error "(7) mount failed"
4995 for ((i = 0; i < $((MDSCOUNT * 5)); i++)); do
4996 stat $DIR/$tdir/striped_dir/d$i ||
4997 error "(8) Fail to stat d$i after LFSCK"
4998 rmdir $DIR/$tdir/striped_dir/d$i ||
4999 error "(9) Fail to unlink d$i after LFSCK"
5002 rmdir $DIR/$tdir/striped_dir ||
5003 error "(10) Fail to remove the striped directory after LFSCK"
5005 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test_31c: create a striped directory while fail_loc 0x1629 is set on
# $SINGLEMDS, then check that namespace LFSCK reports exactly one
# striped_dirs_repaired, that 'lfs find -H lostlmv' reports one entry, and
# that the directory lists as empty and can be removed after a remount.
# Skipped with fewer than 2 MDTs or an MDS not newer than 2.6.50.
5008 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5009 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5010 skip "MDS older than 2.6.50, LU-5519"
5013 echo "For some reason, the master MDT-object of the striped directory"
5014 echo "may lost its master LMV EA. If nobody created files under the"
5015 echo "master directly after the master LMV EA lost, then the LFSCK"
5016 echo "should re-generate the master LMV EA."
5019 check_mount_and_prep
5021 echo "Inject failure stub on MDT0 to simulate the case that the"
5022 echo "master MDT-object of the striped directory lost the LMV EA."
# The fail_loc is active only while the striped directory is created.
5024 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
5025 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
5026 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5027 error "(1) Fail to create striped directory"
5028 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
5030 echo "Trigger namespace LFSCK to re-generate master LMV EA"
5031 $START_NAMESPACE -r -A ||
5032 error "(2) Fail to start LFSCK for namespace"
5034 wait_all_targets_blocked namespace completed 3
5036 local repaired=$($SHOW_NAMESPACE |
5037 awk '/^striped_dirs_repaired/ { print $2 }')
5038 [ $repaired -eq 1 ] ||
5039 error "(4) Fail to re-generate master LMV EA: $repaired"
# rc counts the entries that 'lfs find -H lostlmv' reports for the
# striped directory; exactly one is expected.
5041 local rc=$($LFS find -H lostlmv $DIR/$tdir/striped_dir | wc -l)
5042 [ $rc -eq 1 ] || error "Fail to find flag lost LMV: $rc"
5044 umount_client $MOUNT || error "(5) umount failed"
5045 mount_client $MOUNT || error "(6) mount failed"
# After the remount, any 'ls' output for the directory is reported as a
# failed repair.
5047 local empty=$(ls $DIR/$tdir/striped_dir/)
5048 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
5050 rmdir $DIR/$tdir/striped_dir ||
5051 error "(8) Fail to remove the striped directory after LFSCK"
5053 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# test_31d: create a striped directory while fail_loc 0x1629 is set on
# $SINGLEMDS, create a file under it, and check that namespace LFSCK does
# NOT count a striped_dirs_repaired but leaves the directory read-only
# (touch must fail until 'chattr -i'). After the file is removed, LFSCK is
# run again and the directory must be removable.
# Skipped with fewer than 2 MDTs or an MDS not newer than 2.6.50.
5056 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5057 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5058 skip "MDS older than 2.6.50, LU-5519"
5061 echo "For some reason, the master MDT-object of the striped directory"
5062 echo "may lost its master LMV EA. If somebody created files under the"
5063 echo "master directly after the master LMV EA lost, then the LFSCK"
5064 echo "should NOT re-generate the master LMV EA, instead, it should"
5065 echo "change the broken striped dirctory as read-only to prevent"
5066 echo "further damage"
5069 check_mount_and_prep
5071 echo "Inject failure stub on MDT0 to simulate the case that the"
5072 echo "master MDT-object of the striped directory lost the LMV EA."
5074 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
5075 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
5076 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5077 error "(1) Fail to create striped directory"
5078 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# NOTE(review): no 'stop mds1' is visible in this chunk before either
# 'start mds1' below - confirm against the full file.
5080 umount_client $MOUNT || error "(2) umount failed"
5082 start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS
5083 mount_client $MOUNT || error "(3) mount failed"
# Creating a file here is the "modified after broken" condition named in
# the test description.
5085 touch $DIR/$tdir/striped_dir/dummy ||
5086 error "(4) Fail to touch under broken striped directory"
5088 echo "Trigger namespace LFSCK to find out the inconsistency"
5089 $START_NAMESPACE -r -A ||
5090 error "(5) Fail to start LFSCK for namespace"
5092 wait_all_targets_blocked namespace completed 6
5094 local repaired=$($SHOW_NAMESPACE |
5095 awk '/^striped_dirs_repaired/ { print $2 }')
5096 [ $repaired -eq 0 ] ||
5097 error "(7) Re-generate master LMV EA unexpected: $repaired"
5099 stat $DIR/$tdir/striped_dir/dummy ||
5100 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# A successful touch means the directory was not made read-only.
5102 touch $DIR/$tdir/striped_dir/foo &&
5103 error "(9) The broken striped directory should be read-only"
# Drop the immutable attribute so that the dummy file can be removed.
5105 chattr -i $DIR/$tdir/striped_dir ||
5106 error "(10) Fail to chattr on the broken striped directory"
5108 rm -f $DIR/$tdir/striped_dir/dummy || error "(11) Fail to remove dummy"
5110 # LFSCK again to regenerate master LMV
5111 echo "Trigger namespace LFSCK to find out the inconsistency"
5112 $START_NAMESPACE -r -A ||
5113 error "(12) Fail to start LFSCK for namespace"
# NOTE(review): this wait reuses error number 6 from the first LFSCK run.
5115 wait_all_targets_blocked namespace completed 6
5117 # reload striped_dir to parse newly generated LMV
5119 start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS
5121 rmdir $DIR/$tdir/striped_dir ||
5122 error "(13) Fail to remove the striped directory after LFSCK"
5124 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test_31e: create a striped directory while fail_loc 0x162a (fail_val=0)
# is set on $SINGLEMDS, then check that namespace LFSCK reports exactly one
# striped_shards_repaired on $SINGLEMDS and that the directory can be
# removed afterwards.
# Skipped with fewer than 2 MDTs or an MDS not newer than 2.6.50.
5127 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5128 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5129 skip "MDS older than 2.6.50, LU-5519"
5132 echo "For some reason, the slave MDT-object of the striped directory"
5133 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5134 echo "slave LMV EA."
5137 check_mount_and_prep
5139 echo "Inject failure stub on MDT0 to simulate the case that the"
5140 echo "slave MDT-object (that resides on the same MDT as the master"
5141 echo "MDT-object resides on) lost the LMV EA."
# The fail_loc is active only while the striped directory is created.
5143 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5144 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
5145 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5146 error "(1) Fail to create striped directory"
5147 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5149 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5150 $START_NAMESPACE -r -A ||
5151 error "(2) Fail to start LFSCK for namespace"
5153 wait_all_targets_blocked namespace completed 3
5155 local repaired=$($SHOW_NAMESPACE |
5156 awk '/^striped_shards_repaired/ { print $2 }')
5157 [ $repaired -eq 1 ] ||
5158 error "(4) Fail to re-generate slave LMV EA: $repaired"
5160 rmdir $DIR/$tdir/striped_dir ||
5161 error "(5) Fail to remove the striped directory after LFSCK"
5163 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test_31f: variant of test 31e using fail_val=1; the
# striped_shards_repaired counter is read from the lfsck_namespace report
# of mds2 and must be exactly 1.
# Skipped with fewer than 2 MDTs or an MDS not newer than 2.6.50.
5166 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5167 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5168 skip "MDS older than 2.6.50, LU-5519"
5171 echo "For some reason, the slave MDT-object of the striped directory"
5172 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5173 echo "slave LMV EA."
5176 check_mount_and_prep
5178 echo "Inject failure stub on MDT0 to simulate the case that the"
5179 echo "slave MDT-object (that resides on different MDT as the master"
5180 echo "MDT-object resides on) lost the LMV EA."
# The fail_loc is active only while the striped directory is created.
5182 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5183 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
5184 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5185 error "(1) Fail to create striped directory"
5186 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5188 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5189 $START_NAMESPACE -r -A ||
5190 error "(2) Fail to start LFSCK for namespace"
5192 wait_all_targets_blocked namespace completed 3
# The repair counter is taken from mds2 rather than from $SINGLEMDS.
5194 local repaired=$(do_facet mds2 $LCTL get_param -n \
5195 mdd.$(facet_svc mds2).lfsck_namespace |
5196 awk '/^striped_shards_repaired/ { print $2 }')
5197 [ $repaired -eq 1 ] ||
5198 error "(4) Fail to re-generate slave LMV EA: $repaired"
5200 rmdir $DIR/$tdir/striped_dir ||
5201 error "(5) Fail to remove the striped directory after LFSCK"
5203 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test_31g: create a striped directory while fail_loc 0x162b (fail_val=0)
# is set on $SINGLEMDS, then check that namespace LFSCK reports exactly one
# striped_shards_repaired and that, after a client remount, a file can be
# created and removed in the directory and the directory itself removed.
# Skipped with fewer than 2 MDTs or an MDS not newer than 2.6.50.
5206 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5207 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5208 skip "MDS older than 2.6.50, LU-5519"
5211 echo "For some reason, the stripe index in the slave LMV EA is"
5212 echo "corrupted. The LFSCK should repair the slave LMV EA."
5215 check_mount_and_prep
5217 echo "Inject failure stub on MDT0 to simulate the case that the"
5218 echo "slave LMV EA on the first shard of the striped directory"
5219 echo "claims the same index as the second shard claims"
# The fail_loc is active only while the striped directory is created.
5221 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
5222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
5223 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5224 error "(1) Fail to create striped directory"
5225 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5227 echo "Trigger namespace LFSCK to repair the slave LMV EA"
5228 $START_NAMESPACE -r -A ||
5229 error "(2) Fail to start LFSCK for namespace"
5231 wait_all_targets_blocked namespace completed 3
5233 local repaired=$($SHOW_NAMESPACE |
5234 awk '/^striped_shards_repaired/ { print $2 }')
5235 [ $repaired -eq 1 ] ||
5236 error "(4) Fail to repair slave LMV EA: $repaired"
# Remount the client, then verify that the directory is usable again.
5238 umount_client $MOUNT || error "(5) umount failed"
5239 mount_client $MOUNT || error "(6) mount failed"
5241 touch $DIR/$tdir/striped_dir/foo ||
5242 error "(7) Fail to touch file after the LFSCK"
5244 rm -f $DIR/$tdir/striped_dir/foo ||
5245 error "(8) Fail to unlink file after the LFSCK"
5247 rmdir $DIR/$tdir/striped_dir ||
5248 error "(9) Fail to remove the striped directory after LFSCK"
5250 run_test 31g "Repair the corrupted slave LMV EA"
# test_31h: create a striped directory while fail_loc 0x162c (fail_val=0)
# is set on $SINGLEMDS, then check that namespace LFSCK reports exactly one
# dirent_repaired and that, after a client remount, a file can be created
# and removed in the directory and the directory itself removed.
# Skipped with fewer than 2 MDTs or an MDS not newer than 2.6.50.
5253 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5254 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5255 skip "MDS older than 2.6.50, LU-5519"
5258 echo "For some reason, the shard's name entry in the striped"
5259 echo "directory may be corrupted. The LFSCK should repair the"
5260 echo "bad shard's name entry."
5263 check_mount_and_prep
5265 echo "Inject failure stub on MDT0 to simulate the case that the"
5266 echo "first shard's name entry in the striped directory claims"
5267 echo "the same index as the second shard's name entry claims."
# The fail_loc is active only while the striped directory is created.
5269 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
5270 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5271 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5272 error "(1) Fail to create striped directory"
5273 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5275 echo "Trigger namespace LFSCK to repair the shard's name entry"
5276 $START_NAMESPACE -r -A ||
5277 error "(2) Fail to start LFSCK for namespace"
5279 wait_all_targets_blocked namespace completed 3
5281 local repaired=$($SHOW_NAMESPACE |
5282 awk '/^dirent_repaired/ { print $2 }')
5283 [ $repaired -eq 1 ] ||
5284 error "(4) Fail to repair shard's name entry: $repaired"
# Remount the client, then verify that the directory is usable again.
5286 umount_client $MOUNT || error "(5) umount failed"
5287 mount_client $MOUNT || error "(6) mount failed"
5289 touch $DIR/$tdir/striped_dir/foo ||
5290 error "(7) Fail to touch file after the LFSCK"
5292 rm -f $DIR/$tdir/striped_dir/foo ||
5293 error "(8) Fail to unlink file after the LFSCK"
5295 rmdir $DIR/$tdir/striped_dir ||
5296 error "(9) Fail to remove the striped directory after LFSCK"
5298 run_test 31h "Repair the corrupted shard's name entry"
# test_32a: start layout LFSCK with fail_loc 0x162d (fail_val=3) set on
# $SINGLEMDS, stop ost1 while the status is 'scanning-phase1', and check
# that the LFSCK can still be stopped; ost1 is restarted at the end.
5303 umount_client $MOUNT
5305 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5306 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5307 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
# The layout LFSCK must still be in its first scanning phase at this point.
5309 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5310 [ "$STATUS" == "scanning-phase1" ] ||
5311 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
5314 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
5316 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The point of the test: lfsck_stop must succeed while ost1 is down.
5320 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
5322 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5323 error "(5) Fail to start ost1"
5325 run_test 32a "stop LFSCK when some OST failed"
# test_32b: create a directory on MDT1 with two striped sub-directories,
# start namespace LFSCK on all targets with fail_loc 0x162d (fail_val=3)
# set on $SINGLEMDS, stop mds2 once the status is 'scanning-phase1', and
# check that the LFSCK can still be stopped; mds2 is restarted at the end.
# Skipped with fewer than 2 MDTs.
5329 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5332 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5333 error "(1) Fail to create $DIR/$tdir/dp"
5334 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5335 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5336 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5337 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5338 umount_client $MOUNT
5340 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5341 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5342 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
# Wait (timeout argument 32) for the status line on $SINGLEMDS to read
# 'scanning-phase1'.
5344 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5345 mdd.${MDT_DEV}.lfsck_namespace |
5346 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5348 error "(5) unexpected status"
5352 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
5354 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The point of the test: lfsck_stop must succeed while mds2 is down.
5358 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
5360 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5361 error "(8) Fail to start MDT2"
5363 run_test 32b "stop LFSCK when some MDT failed"
# test_33: check that the options passed to lfsck_start show up in the
# 'param' line of the LFSCK report: '--dryrun -o' for layout must yield
# 'dryrun,all_targets,orphan', and '-e abort -A' for namespace must yield
# 'failout,all_targets'.
5369 $START_LAYOUT --dryrun -o -r ||
5370 error "(1) Fail to start layout LFSCK"
5371 wait_all_targets_blocked layout completed 2
# Second field of the 'param' line of the layout report.
5373 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5374 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5375 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
5377 $START_NAMESPACE -e abort -A -r ||
5378 error "(4) Fail to start namespace LFSCK"
5379 wait_all_targets_blocked namespace completed 5
# Second field of the 'param' line of the namespace report.
5381 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5382 [ "$PARAMS" == "failout,all_targets" ] ||
5383 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5385 run_test 33 "check LFSCK paramters"
# test_34: create a directory on MDT1 while fail_loc 0x1630 is set on
# $SINGLEMDS, then check that the first namespace LFSCK run reports exactly
# one dirent_repaired and that a second run reports none.
# Skipped with fewer than 2 MDTs or when mds1 is not ZFS.
5389 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5390 [ "$mds1_FSTYPE" != zfs ] && skip "Only valid for ZFS backend"
5394 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
5395 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5396 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5397 error "(1) Fail to create $DIR/$tdir/dummy"
5399 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# First run: wait (timeout argument 32) for status 'completed' on
# $SINGLEMDS, then expect one repaired dirent.
5400 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
5401 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5402 mdd.${MDT_DEV}.lfsck_namespace |
5403 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5405 error "(3) unexpected status"
5408 local repaired=$($SHOW_NAMESPACE |
5409 awk '/^dirent_repaired/ { print $2 }')
5410 [ $repaired -eq 1 ] ||
5411 error "(4) Fail to repair the lost agent object: $repaired"
# Second run: nothing should be left to repair.
5413 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5414 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5415 mdd.${MDT_DEV}.lfsck_namespace |
5416 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5418 error "(6) unexpected status"
5421 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5422 [ $repaired -eq 0 ] ||
5423 error "(7) Unexpected repairing: $repaired"
5425 run_test 34 "LFSCK can rebuild the lost agent object"
# test_35: create a directory on MDT1 while fail_loc 0x1631 is set on mds2,
# then check that the first namespace LFSCK run reports exactly one
# agent_entries_repaired on mds2 and that a second run, after the
# filesystem has been set up again, reports none.
# Skipped with fewer than 2 MDTs.
# NOTE(review): $LTIME is not defined in the visible part of the file -
# presumably a wait timeout set elsewhere; confirm.
5429 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5433 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
5434 do_facet mds2 $LCTL set_param fail_loc=0x1631
5435 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5436 error "(1) Fail to create $DIR/$tdir/dummy"
5439 do_facet mds2 $LCTL set_param fail_loc=0
# The waits below always poll mds2, so the error messages name MDS2
# explicitly; no loop variable 'k' is set in this test.
5440 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
5441 wait_update_facet mds2 "$LCTL get_param -n \
5442 mdd.$(facet_svc mds2).lfsck_namespace |
5443 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5444 error "(3) MDS2 is not the expected 'completed'"
5446 local repaired=$(do_facet mds2 $LCTL get_param -n \
5447 mdd.$(facet_svc mds2).lfsck_namespace |
5448 awk '/^agent_entries_repaired/ { print $2 }')
5449 [ $repaired -eq 1 ] ||
5450 error "(4) Fail to repair the lost agent entry: $repaired"
5452 echo "stopall to cleanup object cache"
5455 setupall > /dev/null
# Second run: nothing should be left to repair.
5457 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
5458 wait_update_facet mds2 "$LCTL get_param -n \
5459 mdd.$(facet_svc mds2).lfsck_namespace |
5460 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5461 error "(6) MDS2 is not the expected 'completed'"
5463 repaired=$(do_facet mds2 $LCTL get_param -n \
5464 mdd.$(facet_svc mds2).lfsck_namespace |
5465 awk '/^agent_entries_repaired/ { print $2 }')
5466 [ $repaired -eq 0 ] ||
5467 error "(7) Unexpected repairing: $repaired"
5469 run_test 35 "LFSCK can rebuild the lost agent entry"
# test_36a: create three files with three mirrors each, split one mirror
# off each file while fail_loc 0x1616 is set on mds1, then check that
# layout LFSCK (-r -o) reports 9 repaired_orphan on mds1 and that every
# file has three mirrors again, including the mirror id that was split off.
# Skipped with fewer than 3 OSTs.
5472 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5475 echo "The target MDT-object's LOV EA corrupted as to lose one of the "
5476 echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
5477 echo "with the PFID EA of related OST-object(s) belong to the mirror."
5480 check_mount_and_prep
# Print the grant parameters now and register the same diagnostics (plus
# 'lfs df') via stack_trap.
5484 lctl get_param osc.*.*grant*
5485 stack_trap "lfs df $DIR; lfs df -i $DIR; lctl get_param osc.*.*grant*"
# f0, f1 and f2 are created with the same layout: three mirrors (-N) of
# two components each, on explicitly chosen OSTs (-o).
5487 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5488 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5489 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5490 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5491 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
5492 error "(1) Fail to create mirror file $DIR/$tdir/f1"
5493 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5494 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
5495 error "(2) Fail to create mirror file $DIR/$tdir/f2"
# Write 4 MiB to each file, then resync its mirrors.
5497 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5498 error "(3) Fail to write $DIR/$tdir/f0"
5499 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
5500 error "(4) Fail to write $DIR/$tdir/f1"
5501 dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
5502 error "(5) Fail to write $DIR/$tdir/f2"
5504 $LFS mirror resync $DIR/$tdir/f0 ||
5505 error "(6) Fail to resync $DIR/$tdir/f0"
5506 $LFS mirror resync $DIR/$tdir/f1 ||
5507 error "(7) Fail to resync $DIR/$tdir/f1"
5508 $LFS mirror resync $DIR/$tdir/f2 ||
5509 error "(8) Fail to resync $DIR/$tdir/f2"
5511 cancel_lru_locks mdc
5512 cancel_lru_locks osc
5514 $LFS getstripe $DIR/$tdir/f0 ||
5515 error "(9) Fail to getstripe for $DIR/$tdir/f0"
5516 $LFS getstripe $DIR/$tdir/f1 ||
5517 error "(10) Fail to getstripe for $DIR/$tdir/f1"
5518 $LFS getstripe $DIR/$tdir/f2 ||
5519 error "(11) Fail to getstripe for $DIR/$tdir/f2"
5521 echo "Inject failure, to simulate the case of missing one mirror in LOV"
5522 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5523 do_facet mds1 $LCTL set_param fail_loc=0x1616
# A different mirror id is split off each file: 1 from f0, 2 from f1 and
# 3 from f2.
5525 $LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
5526 error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
5527 $LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
5528 error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
5529 $LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
5530 error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"
5534 do_facet mds1 $LCTL set_param fail_loc=0
# The split-off mirror id must no longer appear in each file's layout ...
5536 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
5537 error "(15) The 1st of mirror is not destroyed"
5538 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
5539 error "(16) The 2nd of mirror is not destroyed"
5540 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
5541 error "(17) The 3rd of mirror is not destroyed"
# ... and each file must be left with two mirrors.
5545 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5546 [ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
5547 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5548 [ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
5549 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5550 [ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"
5552 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5553 $START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"
5555 for k in $(seq $MDSCOUNT); do
5556 # The LFSCK status query interval is 30 seconds. In case some
5557 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough
5558 # to guarantee that the status is synced up.
5559 wait_update_facet mds${k} "$LCTL get_param -n \
5560 mdd.$(facet_svc mds${k}).lfsck_layout |
5561 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5562 error "(22) MDS${k} is not the expected 'completed'"
# Every OST must report 'completed' as well; these are checked once,
# without waiting.
5565 for k in $(seq $OSTCOUNT); do
5566 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5567 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5568 awk '/^status/ { print $2 }')
5569 [ "$cur_status" == "completed" ] ||
5570 error "(23) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report exactly 9 repaired orphans.
5573 local repaired=$(do_facet mds1 $LCTL get_param -n \
5574 mdd.$(facet_svc mds1).lfsck_layout |
5575 awk '/^repaired_orphan/ { print $2 }')
5576 [ $repaired -eq 9 ] ||
5577 error "(24) Expect 9 fixed on mds1, but got: $repaired"
# Each file must be back to three mirrors, with the split-off mirror id
# present again.
5579 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5580 [ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
5581 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5582 [ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
5583 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5584 [ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"
5586 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
5587 $LFS getstripe $DIR/$tdir/f0
5588 error "(28) The 1st of mirror is not recovered"
5591 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
5592 $LFS getstripe $DIR/$tdir/f1
5593 error "(29) The 2nd of mirror is not recovered"
5596 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
5597 $LFS getstripe $DIR/$tdir/f2
5598 error "(30) The 3rd of mirror is not recovered"
5601 run_test 36a "rebuild LOV EA for mirrored file (1)"
# test_36b: create a file with three mirrors, remove it while fail_loc
# 0x1616 is set on mds1, then check that layout LFSCK (-r -o) reports 9
# repaired_orphan on mds1 and that .lustre/lost+found/MDT0000/<fid>-R-0
# has three mirrors and six components, with mirror ids 1, 2 and 3.
# Skipped when FILESET is set or with fewer than 3 OSTs.
5604 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5605 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5608 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5609 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5610 echo "with the PFID EA of related OST-object(s) belong to the file. "
5613 check_mount_and_prep
# Three mirrors (-N) of two components each, on explicitly chosen OSTs.
5615 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5616 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5617 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# The FID is used below to build the name expected under lost+found.
5619 local fid=$($LFS path2fid $DIR/$tdir/f0)
5621 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5622 error "(1) Fail to write $DIR/$tdir/f0"
5623 $LFS mirror resync $DIR/$tdir/f0 ||
5624 error "(2) Fail to resync $DIR/$tdir/f0"
5626 cancel_lru_locks mdc
5627 cancel_lru_locks osc
5629 $LFS getstripe $DIR/$tdir/f0 ||
5630 error "(3) Fail to getstripe for $DIR/$tdir/f0"
5632 echo "Inject failure, to simulate the case of missing the MDT-object"
5633 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5634 do_facet mds1 $LCTL set_param fail_loc=0x1616
5635 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
5639 do_facet mds1 $LCTL set_param fail_loc=0
5641 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5642 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
5644 for k in $(seq $MDSCOUNT); do
5645 # The LFSCK status query interval is 30 seconds. In case some
5646 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough
5647 # to guarantee that the status is synced up.
5648 wait_update_facet mds${k} "$LCTL get_param -n \
5649 mdd.$(facet_svc mds${k}).lfsck_layout |
5650 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5651 error "(6) MDS${k} is not the expected 'completed'"
# Every OST must report 'completed' as well; these are checked once,
# without waiting.
5654 for k in $(seq $OSTCOUNT); do
5655 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5656 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5657 awk '/^status/ { print $2 }')
5658 [ "$cur_status" == "completed" ] ||
5659 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report exactly 9 repaired orphans.
5662 local count=$(do_facet mds1 $LCTL get_param -n \
5663 mdd.$(facet_svc mds1).lfsck_layout |
5664 awk '/^repaired_orphan/ { print $2 }')
5665 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# The remaining checks inspect the layout of the file expected at this
# lost+found path; 'count' is reused for mirror and component counts.
5667 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5668 count=$($LFS getstripe --mirror-count $name)
5669 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5671 count=$($LFS getstripe --component-count $name)
5672 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
5674 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5675 $LFS getstripe $name
5676 error "(11) The 1st of mirror is not recovered"
5679 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5680 $LFS getstripe $name
5681 error "(12) The 2nd of mirror is not recovered"
5684 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5685 $LFS getstripe $name
5686 error "(13) The 3rd of mirror is not recovered"
5689 run_test 36b "rebuild LOV EA for mirrored file (2)"
# test_36c: a two-mirror file is written and resynced, then written again so
# that one mirror is stale, and finally loses its MDT-object (fail_loc 0x1616).
# Layout LFSCK is expected to rebuild the file under lost+found with the same
# mirror count, component count and lcme_flags it had before the loss.
# NOTE(review): the embedded line numbers skip in places (e.g. 5705), so a few
# short lines of this test are not visible in this listing.
5692 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5693 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5696 echo "The mirrored file has been modified, not resynced yet, then "
5697 echo "lost its MDT-object, but relatd OST-objects are still there. "
5698 echo "The layout LFSCK should rebuild the LOV EA and relatd status "
5699 echo "with the PFID EA of related OST-object(s) belong to the file. "
5702 check_mount_and_prep
# Two mirrors (-N twice), each with a 2-stripe component followed by a
# 1-stripe component, on explicitly chosen OST indices.
5704 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5706 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# The FID is needed later to locate the rebuilt file in lost+found.
5708 local fid=$($LFS path2fid $DIR/$tdir/f0)
5710 # The 1st dd && resync makes all related OST-objects have been written
5711 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5712 error "(1.1) Fail to write $DIR/$tdir/f0"
5713 $LFS mirror resync $DIR/$tdir/f0 ||
5714 error "(1.2) Fail to resync $DIR/$tdir/f0"
5715 # The 2nd dd makes one mirror to be stale
5716 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5717 error "(1.3) Fail to write $DIR/$tdir/f0"
5719 cancel_lru_locks mdc
5720 cancel_lru_locks osc
5722 $LFS getstripe $DIR/$tdir/f0 ||
5723 error "(2) Fail to getstripe for $DIR/$tdir/f0"
# Remember the lcme_flags found in the first and in the last 10 lines of the
# getstripe output; they are compared with the rebuilt file further down.
5725 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5726 awk '/lcme_flags/ { print $2 }')
5727 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5728 awk '/lcme_flags/ { print $2 }')
5730 echo "Inject failure, to simulate the case of missing the MDT-object"
5731 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5732 do_facet mds1 $LCTL set_param fail_loc=0x1616
5733 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
5737 do_facet mds1 $LCTL set_param fail_loc=0
5739 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5740 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
5742 for k in $(seq $MDSCOUNT); do
5743 # The LFSCK status query interval is 30 seconds. For the case
5744 # of some LFSCK_NOTIFY RPCs failing or being lost, we wait long
5745 # enough to guarantee the status is synced up.
5746 wait_update_facet mds${k} "$LCTL get_param -n \
5747 mdd.$(facet_svc mds${k}).lfsck_layout |
5748 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5749 error "(5) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well; this is a single check, not a
# wait loop.
5752 for k in $(seq $OSTCOUNT); do
5753 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5754 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5755 awk '/^status/ { print $2 }')
5756 [ "$cur_status" == "completed" ] ||
5757 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
# 2 mirrors x (2 + 1) OST-objects = 6 orphans expected to be repaired.
5760 local count=$(do_facet mds1 $LCTL get_param -n \
5761 mdd.$(facet_svc mds1).lfsck_layout |
5762 awk '/^repaired_orphan/ { print $2 }')
5763 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
# The rebuilt file is looked up under lost+found by its original FID.
5765 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5766 count=$($LFS getstripe --mirror-count $name)
5767 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5769 count=$($LFS getstripe --component-count $name)
5770 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
# The component flags seen at the head and tail of the layout must match
# what was recorded before the MDT-object was lost.
5772 local flags=$($LFS getstripe $name | head -n 10 |
5773 awk '/lcme_flags/ { print $2 }')
5774 [ "$flags" == "$saved_flags1" ] || {
5775 $LFS getstripe $name
5776 error "(10) expect flags $saved_flags1, got $flags"
5779 flags=$($LFS getstripe $name | tail -n 10 |
5780 awk '/lcme_flags/ { print $2 }')
5781 [ "$flags" == "$saved_flags2" ] || {
5782 $LFS getstripe $name
5783 error "(11) expect flags $saved_flags2, got $flags"
5786 run_test 36c "rebuild LOV EA for mirrored file (3)"
# test_37: create a directory on MDT index 0, run multiop_bg_pause on it
# (D_c), then start namespace LFSCK (-r -A) and wait for it to reach the
# "completed" status.
5792 local t_dir="$DIR/$tdir/d0"
5793 check_mount_and_prep
5795 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
5796 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
# NOTE(review): $PID is not assigned on any line visible here; presumably it
# is the background multiop's pid, set on a line missing from this listing.
# If LFSCK cannot be started, the multiop is signalled with USR1.
5800 $START_NAMESPACE -r -A || {
5801 error "(3) Fail to start LFSCK for namespace!"; kill -USR1 $PID; }
5803 wait_all_targets_blocked namespace completed 4
5808 run_test 37 "LFSCK must skip a ORPHAN"
# test_38: create a file with a foreign LOV EA, verify its layout fields, run
# namespace and layout LFSCK and require that neither repairs anything, then
# verify the same fields again and check that restriping, reading and writing
# the file fail while removing it succeeds.
5812 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
5813 skip "Need MDS version newer than 2.12.51"
5815 test_mkdir $DIR/$tdir
# Two random UUIDs joined by '@' form the foreign layout value.
5816 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5817 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5819 # create foreign file
5820 $LFS setstripe --foreign=none --flags 0xda05 \
5821 -x "${uuid1}@${uuid2}" $DIR/$tdir/$tfile ||
5822 error "$DIR/$tdir/$tfile: create failed"
# Pre-LFSCK checks: magic, length, type, flags and value as shown by getstripe.
5824 $LFS getstripe -v $DIR/$tdir/$tfile |
5825 grep "lfm_magic:.*0x0BD70BD0" ||
5826 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5827 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
# NOTE(review): 73 presumably equals the length of "${uuid1}@${uuid2}"
# (36 + 1 + 36) - confirm.
5828 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5829 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5830 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
5831 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5832 $LFS getstripe -v $DIR/$tdir/$tfile |
5833 grep "lfm_flags:.*0x0000DA05" ||
5834 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5835 $LFS getstripe $DIR/$tdir/$tfile |
5836 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5837 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5839 # modify striping should fail
5840 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5841 error "$DIR/$tdir/$tfile: setstripe should fail"
5843 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5845 wait_all_targets_blocked namespace completed 1
5847 # check that "global" namespace_repaired == 0 !!!
# The awk field reference is written \\\$2 because the whole command is a
# double-quoted string handed to do_facet; presumably it must survive one more
# round of shell expansion there.
5848 local repaired=$(do_facet mds1 \
5849 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5850 awk '/^namespace_repaired/ { print \\\$2 }'")
5851 [ $repaired -eq 0 ] ||
5852 error "(2) Expect no namespace repair, but got: $repaired"
5854 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5856 wait_all_targets_blocked layout completed 2
5858 # check that "global" layout_repaired == 0 !!!
5859 local repaired=$(do_facet mds1 \
5860 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5861 awk '/^layout_repaired/ { print \\\$2 }'")
5862 [ $repaired -eq 0 ] ||
5863 error "(2) Expect no layout repair, but got: $repaired"
5865 echo "post-lfsck checks of foreign file"
# Same field checks as before the LFSCK runs.
5867 $LFS getstripe -v $DIR/$tdir/$tfile |
5868 grep "lfm_magic:.*0x0BD70BD0" ||
5869 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5870 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5871 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5872 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5873 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
5874 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5875 $LFS getstripe -v $DIR/$tdir/$tfile |
5876 grep "lfm_flags:.*0x0000DA05" ||
5877 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5878 $LFS getstripe $DIR/$tdir/$tfile |
5879 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5880 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5882 # modify striping should fail
5883 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5884 error "$DIR/$tdir/$tfile: setstripe should fail"
# Reading and writing the foreign file are both expected to fail.
5887 cat $DIR/$tdir/$tfile && error "$DIR/$tdir/$tfile: read should fail"
5888 cat /etc/passwd > $DIR/$tdir/$tfile &&
5889 error "$DIR/$tdir/$tfile: write should fail"
5891 #remove foreign file
5892 rm $DIR/$tdir/$tfile ||
5893 error "$DIR/$tdir/$tfile: remove of foreign file has failed"
5895 run_test 38 "LFSCK does not break foreign file and reverse is also true"
# test_39: create a directory with a foreign LMV EA, verify its fields, check
# which operations on it fail (file create) and which succeed (chmod, chown),
# run namespace and layout LFSCK and require that neither repairs anything,
# then repeat the same checks and finally remove the directory.
5899 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
5900 skip "Need MDS version newer than 2.12.51"
5902 test_mkdir $DIR/$tdir
# Two random UUIDs joined by '@' form the foreign layout value.
5903 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5904 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5906 # create foreign dir
5907 $LFS mkdir --foreign=none --xattr="${uuid1}@${uuid2}" --flags=0xda05 \
5908 $DIR/$tdir/${tdir}2 ||
5909 error "$DIR/$tdir/${tdir}2: create failed"
# Pre-LFSCK checks: magic, length, type, flags and value from getdirstripe.
5911 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5912 grep "lfm_magic:.*0x0CD50CD0" ||
5913 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
5914 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5915 # - sizeof(lfm_type) - sizeof(lfm_flags)
5916 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
5917 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
5918 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
5919 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
5920 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5921 grep "lfm_flags:.*0x0000DA05" ||
5922 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
5923 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
5924 grep "lfm_value.*${uuid1}@${uuid2}" ||
5925 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
5927 # file create in dir should fail
# A successful touch is reported through error, like every other
# should-fail check in this script.
5928 touch $DIR/$tdir/${tdir}2/$tfile &&
5929 error "$DIR/${tdir}2: file create should fail"
# Attribute changes on the foreign dir itself are expected to succeed.
5932 chmod 777 $DIR/$tdir/${tdir}2 ||
5933 error "$DIR/${tdir}2: chmod failed"
5936 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
5937 error "$DIR/${tdir}2: chown failed"
5939 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5941 wait_all_targets_blocked namespace completed 1
5943 # check that "global" namespace_repaired == 0 !!!
5944 local repaired=$(do_facet mds1 \
5945 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5946 awk '/^namespace_repaired/ { print \\\$2 }'")
5947 [ $repaired -eq 0 ] ||
5948 error "(2) Expect nothing to be repaired, but got: $repaired"
5950 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5952 wait_all_targets_blocked layout completed 2
5954 # check that "global" layout_repaired == 0 !!!
5955 local repaired=$(do_facet mds1 \
5956 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5957 awk '/^layout_repaired/ { print \\\$2 }'")
5958 [ $repaired -eq 0 ] ||
5959 error "(2) Expect no layout repair, but got: $repaired"
5961 echo "post-lfsck checks of foreign dir"
# Same field checks as before the LFSCK runs.
5963 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5964 grep "lfm_magic:.*0x0CD50CD0" ||
5965 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
5966 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5967 # - sizeof(lfm_type) - sizeof(lfm_flags)
5968 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
5969 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
5970 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
5971 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
5972 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5973 grep "lfm_flags:.*0x0000DA05" ||
5974 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
5975 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
5976 grep "lfm_value.*${uuid1}@${uuid2}" ||
5977 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
5979 # file create in dir should fail
5980 touch $DIR/$tdir/${tdir}2/$tfile &&
5981 error "$DIR/${tdir}2: file create should fail"
5984 chmod 777 $DIR/$tdir/${tdir}2 ||
5985 error "$DIR/${tdir}2: chmod failed"
5988 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
5989 error "$DIR/${tdir}2: chown failed"
# Removing the foreign dir must still work after LFSCK has run.
5992 rmdir $DIR/$tdir/${tdir}2 ||
5993 error "$DIR/$tdir/${tdir}2: remove of foreign dir has failed"
5995 run_test 39 "LFSCK does not break foreign dir and reverse is also true"
# test_40a: create a directory on MDT index 1 with a three-component default
# layout, create a file in it and record its layout, migrate the directory to
# MDT index 0, run layout LFSCK on $MDT_DEV, and require that the file's
# layout (as reported by get_layout_param) is unchanged afterwards.
5998 [[ $MDSCOUNT -ge 2 ]] || skip "needs >= 2 MDTs"
6000 check_mount_and_prep
6001 $LFS mkdir -i 1 $DIR/$tdir/dir1
6002 $LFS setstripe -E 1M -c1 -S 1M -E 128M -c2 -S 4M -E eof $DIR/$tdir/dir1
6004 touch $DIR/$tdir/dir1/f1
# Layout before migration and LFSCK; compared with layout2 below.
6005 local layout1=$(get_layout_param $DIR/$tdir/dir1/f1)
6007 echo "Migrate $DIR/$tdir/dir1 from MDT1 to MDT0"
6008 $LFS migrate -m 0 $DIR/$tdir/dir1
6010 echo "trigger LFSCK for layout"
6011 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -t layout -r
# Wait for the lfsck_layout status on $SINGLEMDS to read "completed"
# (32 is the limit passed to wait_update_facet).
6013 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6014 mdd.${MDT_DEV}.lfsck_layout |
6015 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6017 error "(2) unexpected status"
6020 local layout2=$(get_layout_param $DIR/$tdir/dir1/f1)
6022 [[ "$layout1" == "$layout2" ]] || error "layout lost after lfsck"
6024 run_test 40a "LFSCK correctly fixes lmm_oi in composite layout"
# test_41: enable lfsck debugging on $SINGLEMDS, create a file whose layout
# components carry -z values (64M / 128M), run LFSCK, and fail if the debug
# log afterwards contains any "lfsck_layout_verify_header" line.
# Save the current debug mask so it can be put back at the end.
6028 local old_debug=$(do_facet $SINGLEMDS $LCTL get_param -n debug)
6030 do_facet $SINGLEMDS $LCTL set_param debug=+lfsck
6031 $LFS setstripe -E 1G -z 64M -E -1 -z 128M $DIR/$tfile
# Dump the debug log and discard it; presumably so that the later `dk` only
# shows messages produced by this LFSCK run.
6032 do_facet $SINGLEMDS $LCTL dk > /dev/null
6034 echo "trigger LFSCK for SEL layout"
6035 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -A -t all -r -n on
6036 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6037 mdd.${MDT_DEV}.lfsck_layout |
6038 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6040 error "(2) unexpected status"
# Any lfsck_layout_verify_header line in the debug log counts as a failure.
6043 local errors=$(do_facet $SINGLEMDS $LCTL dk |
6044 grep "lfsck_layout_verify_header")
6046 [[ "x$errors" == "x" ]] || {
6048 error "lfsck failed"
# NOTE(review): the debug mask is restored only when this line is reached; if
# error aborts the test earlier, the +lfsck setting stays in place. A
# stack_trap right after saving old_debug would cover that case.
6051 do_facet $SINGLEMDS "$LCTL set_param debug='$old_debug'"
6053 run_test 41 "SEL support in LFSCK"
# Helper statements for the dummy-encryption tests.
# NOTE(review): the enclosing function header(s) are not visible in this
# listing, and the jump in embedded line numbers (6064 -> 6069) suggests the
# statements below belong to two separate helpers - confirm in the full file.
#
# Part 1: assemble a key blob as escape text (mode + raw + size) and add it to
# the @s keyring as a "logon" key described as fscrypt:4242424242424242.
6056 local mode='\x00\x00\x00\x00'
# raw ends up holding the literal text \x00\x01...\x3f (64 escapes); the
# `echo -e` below is what turns the escapes into bytes.
6057 local raw="$(printf ""\\\\x%02x"" {0..63})"
# size is the value 0x40 written as 4 bytes: low byte first when lscpu
# reports a little-endian byte order, high byte first otherwise.
6061 [[ $(lscpu) =~ Byte\ Order.*Little ]] && size='\x40\x00\x00\x00' ||
6062 size='\x00\x00\x00\x40'
6063 key="${mode}${raw}${size}"
6064 echo -n -e "${key}" | keyctl padd logon fscrypt:4242424242424242 @s
# Part 2: sync and drop the kernel caches, clear the lock LRU of every ldlm
# namespace, sync and drop caches again, then revoke the key(s) whose 7th
# field in `keyctl show` output starts with "fscrypt:" (field 1 is what gets
# passed to `keyctl revoke`).
6069 sync ; echo 3 > /proc/sys/vm/drop_caches
6076 $LCTL set_param -n ldlm.namespaces.*.lru_size=clear
6077 sync ; echo 3 > /proc/sys/vm/drop_caches
6078 dummy_key=$(keyctl show | awk '$7 ~ "^fscrypt:" {print $1}')
6079 if [ -n "$dummy_key" ]; then
6080 keyctl revoke $dummy_key
# Remount the client mount point(s) with the plain $MOUNT_OPTS, i.e. without
# the test_dummy_encryption option used by the encryption tests.
6085 remount_client_normally() {
6086 # remount client without dummy encryption key
# Unmount $MOUNT first if it is currently mounted, then mount it again.
6087 if is_mounted $MOUNT; then
6088 umount_client $MOUNT || error "umount $MOUNT failed"
6090 mount_client $MOUNT ${MOUNT_OPTS} ||
6091 error "remount failed"
# Second mount point: unmount $MOUNT2 if mounted, and mount it again only
# when MOUNT_2 is non-empty.
6093 if is_mounted $MOUNT2; then
6094 umount_client $MOUNT2 || error "umount $MOUNT2 failed"
6096 if [ "$MOUNT_2" ]; then
6097 mount_client $MOUNT2 ${MOUNT_OPTS} ||
6098 error "remount failed"
# Remount $MOUNT with the test_dummy_encryption option appended to
# $MOUNT_OPTS, unmounting it first if it is currently mounted.
6104 remount_client_dummykey() {
6107 # remount client with dummy encryption key
6108 if is_mounted $MOUNT; then
6109 umount_client $MOUNT || error "umount $MOUNT failed"
6111 mount_client $MOUNT ${MOUNT_OPTS},test_dummy_encryption ||
6112 error "remount failed"
# Prepare the client for an encryption test: remove leftover entries matching
# $DIR/[df][0-9]*, remount $MOUNT with test_dummy_encryption, and create
# $DIR/$tdir as a single-stripe directory on MDT index 0 whose default file
# layout is one stripe on OST index 0.
6115 setup_for_enc_tests() {
6116 rm -rf $DIR/[df][0-9]* || error "Fail to cleanup env"
6118 # remount client with test_dummy_encryption option
6119 if is_mounted $MOUNT; then
6120 umount_client $MOUNT || error "umount $MOUNT failed"
6122 mount_client $MOUNT ${MOUNT_OPTS},test_dummy_encryption ||
6123 error "mount with '-o test_dummy_encryption' failed"
6125 # this directory will be encrypted, because of dummy mode
6126 $LFS setdirstripe -c 1 -i 0 $DIR/$tdir
6127 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# Undo setup for an encryption test: remove $DIR/$tdir plus any extra paths
# passed as arguments, then remount the client without dummy encryption.
# The arguments are expanded unquoted ($*), so they are subject to word
# splitting and globbing.
6130 cleanup_for_enc_tests() {
6131 rm -rf $DIR/$tdir $*
6133 remount_client_normally
# test_42: create files normally and with fail_loc 0x1632 set on all nodes,
# then run layout LFSCK and require that exactly 2 "repaired_others" are
# reported, i.e. the missing OST-object encryption flags were added back.
6137 [[ $(facet_fstype ost1) == zfs ]] && skip "skip ZFS backend"
# NOTE(review): the check is a strict ">" while the skip text says
# "at least 2.15.51" - confirm which one is intended.
6139 (( $MDS1_VERSION > $(version_code 2.15.51) )) ||
6140 skip "Need MDS version at least 2.15.51"
6143 echo "If the MDT-object has the encryption flag but the OST-object"
6144 echo "does not, add it to the OST-object."
6147 check_mount_and_prep
# Skip unless the client import advertises client_encryption and mount.lustre
# knows the test_dummy_encryption option.
6149 $LCTL get_param mdc.*.import | grep -q client_encryption ||
6150 skip "client encryption not supported"
6152 mount.lustre --help |& grep -q "test_dummy_encryption:" ||
6153 skip "need dummy encryption support"
# Make sure the encryption-test cleanup runs when the test exits.
6155 stack_trap cleanup_for_enc_tests EXIT
6158 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# Files _1 and _2 are created without fault injection.
6159 touch $DIR/$tdir/${tfile}_1 || error "touch ${tfile}_1 failed"
6160 dd if=/dev/zero of=$DIR/$tdir/${tfile}_2 bs=1 count=1 conv=fsync ||
6161 error "dd ${tfile}_2 failed"
6163 #define OBD_FAIL_LFSCK_NO_ENCFLAG 0x1632
# Files _3 and _4 are created while fail_loc 0x1632 is set on every node;
# fail_loc is reset right afterwards.
6164 do_nodes $(comma_list $(all_nodes)) "$LCTL set_param fail_loc=0x1632"
6165 touch $DIR/$tdir/${tfile}_3 || error "touch ${tfile}_3 failed"
6166 dd if=/dev/zero of=$DIR/$tdir/${tfile}_4 bs=1 count=1 conv=fsync ||
6167 error "dd ${tfile}_4 failed"
6168 do_nodes $(comma_list $(all_nodes)) "$LCTL set_param fail_loc=0x0"
6169 cancel_lru_locks osc
6171 echo "Trigger layout LFSCK to find out inconsistent OST-object enc flag"
6173 $START_LAYOUT -r || error "Fail to start LFSCK for layout!"
6175 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6176 mdd.${MDT_DEV}.lfsck_layout |
6177 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6179 error "unexpected lfsck status"
# Exactly two repairs are expected in the repaired_others counter.
6182 local repaired=$($SHOW_LAYOUT |
6183 awk '/^repaired_others/ { print $2 }')
6184 [ $repaired -eq 2 ] ||
6185 error "Fail to repair inconsistent enc flag: $repaired"
6187 run_test 42 "LFSCK can repair inconsistent MDT-object/OST-object encryption flags"
# End-of-script teardown. The SAVED_* values were captured near the top of
# the script, before MDSSIZE/OSTSIZE/OSTCOUNT were reduced for these tests.
6189 # restore MDS/OST size
6190 MDSSIZE=${SAVED_MDSSIZE}
6191 OSTSIZE=${SAVED_OSTSIZE}
6192 OSTCOUNT=${SAVED_OSTCOUNT}
6194 # cleanup the system at last
# Reformat with the restored sizes, then report completion and clean up.
6195 REFORMAT="yes" cleanup_and_setup_lustre
6197 complete_test $SECONDS
6198 check_and_cleanup_lustre