3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 LUSTRE=${LUSTRE:-$(dirname $0)/..}
12 . $LUSTRE/tests/test-framework.sh
16 # bug number for skipped test:
17 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT "
18 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
20 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
23 require_dsh_mds || exit 0
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
32 (( $MDS1_VERSION >= $(version_code 2.3.60) )) ||
33 skip "Need MDS version at least 2.3.60"
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # use small MDS + OST sizes to speed up formatting,
41 # but not too small: MDSSIZE/OSTSIZE affect the default journal size
43 [ "$mds1_FSTYPE" == zfs ] && MDSSIZE=300000
45 [ "$ost1_FSTYPE" == zfs ] && OSTSIZE=300000
47 # keep the OST count small to reduce the format/start/stop overhead
49 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
51 # build up a clean test environment.
52 REFORMAT="yes" check_and_setup_lustre
# Target names and pre-built command strings shared by the tests below.
# The START_*/STOP_*/SHOW_* strings are expanded unquoted at their call
# sites (e.g. "$START_NAMESPACE -r"), so callers can append extra lctl
# options after them.
#
# MDT_DEV: label returned by devicelabel for the $SINGLEMDS device
#          (presumably of the form ${FSNAME}-MDTxxxx -- TODO confirm).
# OST_DEV: fixed to the first OST target name, ${FSNAME}-OST0000.
54 MDT_DEV=$(devicelabel $SINGLEMDS $(facet_device $SINGLEMDS))
55 OST_DEV="${FSNAME}-OST0000"
# Start namespace / layout LFSCK on the MDT via "lctl lfsck_start -t <type>".
56 START_NAMESPACE="do_facet $SINGLEMDS \
57 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
58 START_LAYOUT="do_facet $SINGLEMDS \
59 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
# Same as START_LAYOUT, but issued on facet ost1 against OST_DEV.
60 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# Stop whatever LFSCK is running on the MDT.
61 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Dump the LFSCK state parameters; the tests pipe this output through awk to
# pick out single fields such as "status" or "flags".
62 SHOW_NAMESPACE="do_facet $SINGLEMDS \
63 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
64 SHOW_LAYOUT="do_facet $SINGLEMDS \
65 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
66 SHOW_LAYOUT_ON_OST="do_facet ost1 \
67 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# MDS mount option sets passed to start_facet: all add user_xattr on top of
# $MDS_MOUNT_OPTS; the NOSCRUB and SKIP_LFSCK variants additionally pass the
# "noscrub" and "skip_lfsck" mount options respectively.
68 MOUNT_OPTS_SCRUB="$MDS_MOUNT_OPTS -o user_xattr"
69 MOUNT_OPTS_NOSCRUB="$MDS_MOUNT_OPTS -o user_xattr,noscrub"
70 MOUNT_OPTS_SKIP_LFSCK="$MDS_MOUNT_OPTS -o user_xattr,skip_lfsck"
79 echo "preparing... $nfiles * $ndirs files will be created $(date)."
80 if [ ! -z $igif ]; then
81 #define OBD_FAIL_FID_IGIF 0x1504
82 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
85 cp $LUSTRE/tests/*.sh $DIR/$tdir/
86 if [ $ndirs -gt 0 ]; then
87 createmany -d $DIR/$tdir/d $ndirs
88 createmany -m $DIR/$tdir/f $ndirs
89 if [ $nfiles -gt 0 ]; then
90 for ((i = 0; i < $ndirs; i++)); do
91 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
92 /dev/null || error "createmany $nfiles"
95 createmany -d $DIR/$tdir/e $ndirs
98 if [ ! -z $igif ]; then
99 touch $DIR/$tdir/dummy
100 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
103 echo "prepared $(date)."
110 local dev=$(facet_device $facet)
112 start $facet $dev $opts > /dev/null ||
113 error "($err) Fail to start $facet!"
116 run_e2fsck_on_mds_facet() {
117 [ $mds1_FSTYPE == ldiskfs ] || return 0
121 stop $mds > /dev/null || error "(0) Fail to the stop $mds"
122 local host=$(facet_active_host $mds)
123 local dev=$(facet_device $mds)
125 run_e2fsck $host $dev "-n" |
127 run_e2fsck $host $dev "-n"
128 error "(2) Detected inconsistency on $mds"
130 start_facet $mds "$MOUNT_OPTS_NOSCRUB" 3
133 wait_all_targets_blocked() {
138 local count=$(do_facet mds1 \
139 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
140 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
141 [[ $count -eq $MDSCOUNT ]] || {
142 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
143 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
152 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
153 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
154 "$MDSCOUNT" $LTIME || {
155 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
156 error "($err) some MDTs are not in ${status}"
163 #define OBD_FAIL_LFSCK_DELAY1 0x1600
164 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
165 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
167 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
169 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
170 [ "$STATUS" == "scanning-phase1" ] ||
171 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
173 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
175 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
176 [ "$STATUS" == "stopped" ] ||
177 error "(6) Expect 'stopped', but got '$STATUS'"
179 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
181 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
182 [ "$STATUS" == "scanning-phase1" ] ||
183 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
185 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
186 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
187 mdd.${MDT_DEV}.lfsck_namespace |
188 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
190 error "(9) unexpected status"
193 local repaired=$($SHOW_NAMESPACE |
194 awk '/^updated_phase1/ { print $2 }')
195 [ $repaired -eq 0 ] ||
196 error "(10) Expect nothing to be repaired, but got: $repaired"
198 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
199 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
200 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
201 mdd.${MDT_DEV}.lfsck_namespace |
202 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
204 error "(12) unexpected status"
207 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
208 [ $((scanned1 + 1)) -eq $scanned2 ] ||
209 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
211 echo "stopall, should NOT crash LU-3649"
212 stopall || error "(14) Fail to stopall"
214 run_test 0 "Control LFSCK manually"
219 #define OBD_FAIL_FID_INDIR 0x1501
220 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
221 touch $DIR/$tdir/dummy
223 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
225 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
226 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
227 mdd.${MDT_DEV}.lfsck_namespace |
228 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
230 error "(4) unexpected status"
233 local repaired=$($SHOW_NAMESPACE |
234 awk '/^dirent_repaired/ { print $2 }')
235 # for interop with old server
236 [ -z "$repaired" ] &&
237 repaired=$($SHOW_NAMESPACE |
238 awk '/^updated_phase1/ { print $2 }')
240 [ $repaired -eq 1 ] ||
241 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
243 run_e2fsck_on_mds_facet $SINGLEMDS
245 mount_client $MOUNT || error "(6) Fail to start client!"
247 #define OBD_FAIL_FID_LOOKUP 0x1505
248 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
249 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
251 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
253 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
257 [ "$mds1_FSTYPE" != ldiskfs ] &&
258 skip "OI Scrub not implemented for ZFS"
262 #define OBD_FAIL_FID_INLMA 0x1502
263 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
264 touch $DIR/$tdir/dummy
266 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
268 #define OBD_FAIL_FID_NOLMA 0x1506
269 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
270 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
271 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
272 mdd.${MDT_DEV}.lfsck_namespace |
273 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
275 error "(4) unexpected status"
278 local repaired=$($SHOW_NAMESPACE |
279 awk '/^dirent_repaired/ { print $2 }')
280 # for interop with old server
281 [ -z "$repaired" ] &&
282 repaired=$($SHOW_NAMESPACE |
283 awk '/^updated_phase1/ { print $2 }')
285 [ $repaired -eq 1 ] ||
286 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
288 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
289 run_e2fsck_on_mds_facet $SINGLEMDS
291 mount_client $MOUNT || error "(6) Fail to start client!"
293 #define OBD_FAIL_FID_LOOKUP 0x1505
294 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
295 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
297 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
299 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
304 #define OBD_FAIL_FID_IGIF 0x1504
305 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
306 touch $DIR/$tdir/dummy
308 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
310 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
311 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
312 mdd.${MDT_DEV}.lfsck_namespace |
313 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
315 error "(4) unexpected status"
318 local repaired=$($SHOW_NAMESPACE |
319 awk '/^dirent_repaired/ { print $2 }')
320 # for interop with old server
321 [ -z "$repaired" ] &&
322 repaired=$($SHOW_NAMESPACE |
323 awk '/^updated_phase1/ { print $2 }')
325 [ $repaired -eq 1 ] ||
326 error "(5) Fail to repair lost FID-in-dirent: $repaired"
328 run_e2fsck_on_mds_facet $SINGLEMDS
330 mount_client $MOUNT || error "(6) Fail to start client!"
332 #define OBD_FAIL_FID_LOOKUP 0x1505
333 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
334 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
336 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
338 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
343 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
344 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
345 touch $DIR/$tdir/dummy
347 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
349 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
350 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
351 mdd.${MDT_DEV}.lfsck_namespace |
352 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
354 error "(4) unexpected status"
357 local repaired=$($SHOW_NAMESPACE |
358 awk '/^linkea_repaired/ { print $2 }')
359 # for interop with old server
360 [ -z "$repaired" ] &&
361 repaired=$($SHOW_NAMESPACE |
362 awk '/^updated_phase2/ { print $2 }')
364 [ $repaired -eq 1 ] ||
365 error "(5) Fail to repair crashed linkEA: $repaired"
367 run_e2fsck_on_mds_facet $SINGLEMDS
369 mount_client $MOUNT || error "(6) Fail to start client!"
371 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
372 error "(7) Fail to stat $DIR/$tdir/dummy"
374 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
375 local dummyname=$($LFS fid2path $DIR $dummyfid)
376 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
377 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
379 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
385 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
386 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
387 touch $DIR/$tdir/dummy
389 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
391 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
392 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
393 mdd.${MDT_DEV}.lfsck_namespace |
394 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
396 error "(4) unexpected status"
399 local repaired=$($SHOW_NAMESPACE |
400 awk '/^updated_phase2/ { print $2 }')
401 [ $repaired -eq 1 ] ||
402 error "(5) Fail to repair crashed linkEA: $repaired"
404 run_e2fsck_on_mds_facet $SINGLEMDS
406 mount_client $MOUNT || error "(6) Fail to start client!"
408 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
409 error "(7) Fail to stat $DIR/$tdir/dummy"
411 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
412 local dummyname=$($LFS fid2path $DIR $dummyfid)
413 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
414 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
416 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
420 (( $MDS1_VERSION > $(version_code 2.4.90) )) ||
421 skip "MDS older than 2.4.90"
425 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
426 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
427 touch $DIR/$tdir/dummy
429 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
431 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
432 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
433 mdd.${MDT_DEV}.lfsck_namespace |
434 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
436 error "(4) unexpected status"
439 local repaired=$($SHOW_NAMESPACE |
440 awk '/^updated_phase2/ { print $2 }')
441 [ $repaired -eq 1 ] ||
442 error "(5) Fail to repair crashed linkEA: $repaired"
444 run_e2fsck_on_mds_facet $SINGLEMDS
446 mount_client $MOUNT || error "(6) Fail to start client!"
448 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
449 error "(7) Fail to stat $DIR/$tdir/dummy"
451 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
452 local dummyname=$($LFS fid2path $DIR $dummyfid)
453 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
454 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
456 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
460 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
461 skip "MDS older than 2.6.50, LU-4788"
465 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
466 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
467 touch $DIR/$tdir/dummy
469 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
471 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
472 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
473 mdd.${MDT_DEV}.lfsck_namespace |
474 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
476 error "(4) unexpected status"
479 local repaired=$($SHOW_NAMESPACE |
480 awk '/^linkea_repaired/ { print $2 }')
481 [ $repaired -eq 1 ] ||
482 error "(5) Fail to repair crashed linkEA: $repaired"
484 run_e2fsck_on_mds_facet $SINGLEMDS
486 mount_client $MOUNT || error "(6) Fail to start client!"
488 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
489 error "(7) Fail to stat $DIR/$tdir/dummy"
491 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
492 local dummyname=$($LFS fid2path $DIR $dummyfid)
493 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
494 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
496 run_test 2d "LFSCK can recover the missing linkEA entry"
500 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
501 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
502 skip "MDS older than 2.6.50, LU-5511"
506 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
508 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
509 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
510 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
511 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
513 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
515 wait_all_targets_blocked namespace completed 4
517 local repaired=$($SHOW_NAMESPACE |
518 awk '/^linkea_repaired/ { print $2 }')
519 [ $repaired -eq 1 ] ||
520 error "(5) Fail to repair crashed linkEA: $repaired"
522 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
523 local name=$($LFS fid2path $DIR $fid)
524 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
525 error "(6) Fail to repair linkEA: $fid $name"
527 run_test 2e "namespace LFSCK can verify remote object linkEA"
531 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
532 skip "MDS older than 2.6.50, LU-4788"
536 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
537 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
538 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
540 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
541 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
542 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
544 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
545 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
546 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
548 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
549 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
550 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
552 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
554 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
555 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
556 mdd.${MDT_DEV}.lfsck_namespace |
557 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
559 error "(10) unexpected status"
562 local checked=$($SHOW_NAMESPACE |
563 awk '/^checked_phase2/ { print $2 }')
564 [ $checked -ge 4 ] ||
565 error "(11) Fail to check multiple-linked object: $checked"
567 local repaired=$($SHOW_NAMESPACE |
568 awk '/^multiple_linked_repaired/ { print $2 }')
569 [ $repaired -ge 2 ] ||
570 error "(12) Fail to repair multiple-linked object: $repaired"
572 run_test 3 "LFSCK can verify multiple-linked objects"
576 [ "$mds1_FSTYPE" != ldiskfs ] &&
577 skip "OI Scrub not implemented for ZFS"
580 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
581 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop $SINGLEMDS!"
583 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
584 echo "start $SINGLEMDS with disabling OI scrub"
585 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 2
587 #define OBD_FAIL_LFSCK_DELAY2 0x1601
588 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
589 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
590 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
591 mdd.${MDT_DEV}.lfsck_namespace |
592 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
594 error "(5) unexpected status"
597 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
598 [ "$STATUS" == "scanning-phase1" ] ||
599 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
601 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
602 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
603 mdd.${MDT_DEV}.lfsck_namespace |
604 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
606 error "(7) unexpected status"
609 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
610 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
612 local repaired=$($SHOW_NAMESPACE |
613 awk '/^dirent_repaired/ { print $2 }')
614 # for interop with old server
615 [ -z "$repaired" ] &&
616 repaired=$($SHOW_NAMESPACE |
617 awk '/^updated_phase1/ { print $2 }')
619 [ $repaired -ge 9 ] ||
620 error "(9) Fail to re-generate FID-in-dirent: $repaired"
622 run_e2fsck_on_mds_facet $SINGLEMDS
624 mount_client $MOUNT || error "(10) Fail to start client!"
626 #define OBD_FAIL_FID_LOOKUP 0x1505
627 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
628 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
629 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
631 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
635 [ "$mds1_FSTYPE" != ldiskfs ] &&
636 skip "OI Scrub not implemented for ZFS"
639 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
640 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop $SINGLEMDS!"
642 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
643 echo "start $SINGLEMDS with disabling OI scrub"
644 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 2
646 #define OBD_FAIL_LFSCK_DELAY2 0x1601
647 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
648 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
649 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
650 mdd.${MDT_DEV}.lfsck_namespace |
651 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
653 error "(5) unexpected status"
656 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
657 [ "$STATUS" == "scanning-phase1" ] ||
658 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
660 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
661 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
662 mdd.${MDT_DEV}.lfsck_namespace |
663 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
665 error "(7) unexpected status"
668 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
669 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
671 local repaired=$($SHOW_NAMESPACE |
672 awk '/^dirent_repaired/ { print $2 }')
673 # for interop with old server
674 [ -z "$repaired" ] &&
675 repaired=$($SHOW_NAMESPACE |
676 awk '/^updated_phase1/ { print $2 }')
678 [ $repaired -ge 2 ] ||
679 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
681 run_e2fsck_on_mds_facet $SINGLEMDS
683 mount_client $MOUNT || error "(10) Fail to start client!"
685 #define OBD_FAIL_FID_LOOKUP 0x1505
686 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
687 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
689 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
691 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
692 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
693 local dummyname=$($LFS fid2path $DIR $dummyfid)
694 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
695 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
697 run_test 5 "LFSCK can handle IGIF object upgrading"
702 #define OBD_FAIL_LFSCK_DELAY1 0x1600
703 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
704 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
706 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
707 [ "$STATUS" == "scanning-phase1" ] ||
708 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
710 # Sleep 3 sec to guarantee at least one object is processed by LFSCK
712 # Fail the LFSCK to guarantee there is at least one checkpoint
713 #define OBD_FAIL_LFSCK_FATAL1 0x1608
714 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
715 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
716 mdd.${MDT_DEV}.lfsck_namespace |
717 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
719 error "(4) unexpected status"
722 local POS0=$($SHOW_NAMESPACE |
723 awk '/^last_checkpoint_position/ { print $2 }' |
726 #define OBD_FAIL_LFSCK_DELAY1 0x1600
727 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
728 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
730 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
731 [ "$STATUS" == "scanning-phase1" ] ||
732 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
734 local POS1=$($SHOW_NAMESPACE |
735 awk '/^latest_start_position/ { print $2 }' |
737 [[ $POS0 -lt $POS1 ]] ||
738 error "(7) Expect larger than: $POS0, but got $POS1"
740 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
741 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
742 mdd.${MDT_DEV}.lfsck_namespace |
743 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
745 error "(8) unexpected status"
748 run_test 6a "LFSCK resumes from last checkpoint (1)"
753 #define OBD_FAIL_LFSCK_DELAY2 0x1601
754 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
755 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
757 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
758 [ "$STATUS" == "scanning-phase1" ] ||
759 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
761 # Sleep 5 sec to guarantee that we are in the directory scanning
763 # Fail the LFSCK to guarantee there is at least one checkpoint
764 #define OBD_FAIL_LFSCK_FATAL2 0x1609
765 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
766 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
767 mdd.${MDT_DEV}.lfsck_namespace |
768 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
770 error "(4) unexpected status"
773 local O_POS0=$($SHOW_NAMESPACE |
774 awk '/^last_checkpoint_position/ { print $2 }' |
777 local D_POS0=$($SHOW_NAMESPACE |
778 awk '/^last_checkpoint_position/ { print $4 }')
780 #define OBD_FAIL_LFSCK_DELAY2 0x1601
781 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
782 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
784 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
785 [ "$STATUS" == "scanning-phase1" ] ||
786 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
788 local O_POS1=$($SHOW_NAMESPACE |
789 awk '/^latest_start_position/ { print $2 }' |
791 local D_POS1=$($SHOW_NAMESPACE |
792 awk '/^latest_start_position/ { print $4 }')
794 echo "Additional debug for 6b"
796 if [ "$D_POS0" == "N/A" -o "$D_POS0" == "0x0" \
797 -o "$D_POS1" == "0x0" -o "$D_POS1" == "N/A" ]; then
798 [[ $O_POS0 -lt $O_POS1 ]] ||
799 error "(7.1) $O_POS1 is not larger than $O_POS0"
801 [[ $D_POS0 -lt $D_POS1 ]] ||
802 error "(7.2) $D_POS1 is not larger than $D_POS0"
805 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
806 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
807 mdd.${MDT_DEV}.lfsck_namespace |
808 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
810 error "(8) unexpected status"
813 run_test 6b "LFSCK resumes from last checkpoint (2)"
820 #define OBD_FAIL_LFSCK_DELAY2 0x1601
821 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
822 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
824 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
825 [ "$STATUS" == "scanning-phase1" ] ||
826 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
828 # Sleep 3 sec to guarantee at least one object is processed by LFSCK
830 echo "stop $SINGLEMDS"
831 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop $SINGLEMDS!"
833 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
834 echo "start $SINGLEMDS"
835 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 5
837 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
838 mdd.${MDT_DEV}.lfsck_namespace |
839 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
841 error "(6) unexpected status"
844 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
850 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
851 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
852 for ((i = 0; i < 20; i++)); do
853 touch $DIR/$tdir/dummy${i}
856 #define OBD_FAIL_LFSCK_DELAY3 0x1602
857 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
858 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
859 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
860 mdd.${MDT_DEV}.lfsck_namespace |
861 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
863 error "(4) unexpected status"
867 echo "stop $SINGLEMDS"
868 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop $SINGLEMDS!"
870 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
871 echo "start $SINGLEMDS"
872 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 6
874 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
875 mdd.${MDT_DEV}.lfsck_namespace |
876 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
878 error "(7) unexpected status"
881 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
892 formatall > /dev/null
898 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
899 [ "$STATUS" == "init" ] ||
900 namespace_error "(2) Expect 'init', but got '$STATUS'"
902 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
903 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
904 mkdir $DIR/$tdir/crashed
906 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
907 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
908 for ((i = 0; i < 5; i++)); do
909 touch $DIR/$tdir/dummy${i}
912 umount_client $MOUNT || error "(3) Fail to stop client!"
914 #define OBD_FAIL_LFSCK_DELAY2 0x1601
915 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
917 namespace_error "(4) Fail to start LFSCK for namespace!"
919 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
920 [ "$STATUS" == "scanning-phase1" ] ||
921 namespace_error "(5) Expect 'scanning-phase1', but got '$STATUS'"
923 $STOP_LFSCK || namespace_error "(6) Fail to stop LFSCK!"
925 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
926 [ "$STATUS" == "stopped" ] ||
927 namespace_error "(7) Expect 'stopped', but got '$STATUS'"
930 namespace_error "(8) Fail to start LFSCK for namespace!"
932 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
933 [ "$STATUS" == "scanning-phase1" ] ||
934 namespace_error "(9) Expect 'scanning-phase1', but got '$STATUS'"
936 #define OBD_FAIL_LFSCK_FATAL2 0x1609
937 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
938 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
939 mdd.${MDT_DEV}.lfsck_namespace |
940 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
942 namespace_error "(10) unexpected status"
945 #define OBD_FAIL_LFSCK_DELAY1 0x1600
946 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
948 namespace_error "(11) Fail to start LFSCK for namespace!"
950 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
951 [ "$STATUS" == "scanning-phase1" ] ||
952 namespace_error "(12) Expect 'scanning-phase1', but got '$STATUS'"
954 #define OBD_FAIL_LFSCK_CRASH 0x160a
955 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
958 echo "stop $SINGLEMDS"
959 stop $SINGLEMDS > /dev/null || namespace_error "(13) Fail to stop MDS!"
961 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
962 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
964 echo "start $SINGLEMDS"
965 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 14
967 local timeout=$(max_recovery_time)
970 while [ $timer -lt $timeout ]; do
971 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
972 mdt.${MDT_DEV}.recovery_status |
973 awk '/^status/ { print \\\$2 }'")
974 [ "$STATUS" != "RECOVERING" ] && break;
979 [ $timer != $timeout ] || (
980 do_facet $SINGLEMDS "$LCTL get_param -n \
981 mdt.${MDT_DEV}.recovery_status"
982 error "(14.1) recovery timeout"
985 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
986 [ "$STATUS" == "crashed" ] ||
987 namespace_error "(15) Expect 'crashed', but got '$STATUS'"
989 #define OBD_FAIL_LFSCK_DELAY2 0x1601
990 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
992 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
993 mdd.${MDT_DEV}.lfsck_namespace |
994 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
996 namespace_error "(17) unexpected status"
999 echo "stop $SINGLEMDS"
1000 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop $SINGLEMDS!"
1002 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1003 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1005 echo "start $SINGLEMDS"
1006 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 19
1009 while [ $timer -lt $timeout ]; do
1010 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1011 mdt.${MDT_DEV}.recovery_status |
1012 awk '/^status/ { print \\\$2 }'")
1013 [ "$STATUS" != "RECOVERING" ] && break;
1015 timer=$((timer + 1))
1018 [ $timer != $timeout ] || (
1019 do_facet $SINGLEMDS "$LCTL get_param -n \
1020 mdt.${MDT_DEV}.recovery_status"
1021 error "(19.1) recovery timeout"
1024 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1025 [ "$STATUS" == "paused" ] ||
1026 namespace_error "(20) Expect 'paused', but got '$STATUS'"
1028 echo "stop $SINGLEMDS"
1029 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1031 echo "start $SINGLEMDS without resume LFSCK"
1032 start_facet $SINGLEMDS "$MOUNT_OPTS_SKIP_LFSCK" 20.2
1035 while [ $timer -lt $timeout ]; do
1036 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1037 mdt.${MDT_DEV}.recovery_status |
1038 awk '/^status/ { print \\\$2 }'")
1039 [ "$STATUS" != "RECOVERING" ] && break;
1041 timer=$((timer + 1))
1044 [ $timer != $timeout ] || (
1045 do_facet $SINGLEMDS "$LCTL get_param -n \
1046 mdt.${MDT_DEV}.recovery_status"
1047 error "(20.3) recovery timeout"
1050 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1051 [ "$STATUS" == "paused" ] ||
1052 namespace_error "(20.4) Expect 'paused', but got '$STATUS'"
1054 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1055 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1058 namespace_error "(21) Fail to start LFSCK for namespace!"
1059 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1060 mdd.${MDT_DEV}.lfsck_namespace |
1061 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1063 namespace_error "(22) unexpected status"
1066 # wait until at least one inode has been processed (OBD_FAIL_LFSCK_DELAY3)
1067 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1068 mdd.${MDT_DEV}.lfsck_namespace |
1069 awk '/^flags/ { print \\\$2 }'" "scanned-once,inconsistent" 32 || {
1071 namespace_error "(23) Expect 'scanned-once,inconsistent'"
1074 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1075 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1076 mdd.${MDT_DEV}.lfsck_namespace |
1077 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1079 namespace_error "(24) unexpected status"
1082 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1084 namespace_error "(25) Expect empty flags, but got '$FLAGS'"
1086 run_test 8 "LFSCK state machine"
# test_9a: speed control of the layout LFSCK while it is in scanning-phase1.
# The reported average_speed_phase1 is compared against bounds computed from
# the configured speed limits, first with one limit and then after raising it.
#
# Skip when /proc/cpuinfo has no "processor ... : 1" entry; the skip message
# treats that as a UP system where the measured speed may be inaccurate.
1089 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1090 skip "Testing on UP system, the speed may be inaccurate."
# Populate a dedicated directory with 5000 files for the scan to walk.
1094 check_mount_and_prep
1095 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1096 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1097 createmany -o $DIR/$tdir/lfsck/f 5000
# First speed limit; it is handed to lfsck_start through -s below.
1099 local BASE_SPEED1=100
1101 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# When sampled here the throttled scan is expected to still be in phase 1.
1104 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1105 [ "$STATUS" == "scanning-phase1" ] ||
1106 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# Average phase-1 speed as reported in the lfsck_layout parameter output.
1108 local SPEED=$($SHOW_LAYOUT |
1109 awk '/^average_speed_phase1/ { print $2 }')
1111 # There may be time error, normally it should be less than 2 seconds.
1112 # We allow another 20% schedule error.
1114 # MAX_MARGIN = 1.3 = 13 / 10
# NOTE(review): the text above says 20% but the margin applied is 13/10 (30%).
# Upper bound: BASE_SPEED1 scaled by (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 and
# then by 13/10. $(( )) is integer arithmetic evaluated left to right, so each
# division truncates.
# NOTE(review): RUN_TIME1 and TIME_DIFF are assumed to be set earlier in this
# test - confirm.
1115 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) /
1116 RUN_TIME1 * 13 / 10))
1117 [ $SPEED -lt $MAX_SPEED ] || {
1119 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1120 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1123 # adjust speed limit
1124 local BASE_SPEED2=300
# Raise the limit on the running scan through the lfsck_speed_limit parameter.
1126 do_facet $SINGLEMDS \
1127 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-sample the average speed after the limit change.
1130 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1131 # MIN_MARGIN = 0.7 = 7 / 10
# Lower bound: time-weighted mix of both limits, each run time reduced by
# TIME_DIFF, divided by the total run time and scaled by 7/10.
# NOTE(review): RUN_TIME2 is assumed to be set earlier in this test - confirm.
1132 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) +
1133 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) /
1134 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
1135 [ $SPEED -gt $MIN_SPEED ] || {
# On a non-ldiskfs MDT a too-low speed is reported through error_ignore and
# tagged with LU-5624; the (5.2) message belongs to the other branch.
1136 if [ $mds1_FSTYPE != ldiskfs ]; then
1137 error_ignore LU-5624 \
1138 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1141 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1145 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound for the mixed-limit run: same weighting as MIN_SPEED but with
# TIME_DIFF added to each run time and a 13/10 margin.
1146 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) +
1147 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) /
1148 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1149 [ $SPEED -lt $MAX_SPEED ] || {
1151 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1152 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1153 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDS and OSS node (presumably 0 means
# "no limit" - confirm) so the scan can finish quickly.
1156 do_nodes $(comma_list $(mdts_nodes)) \
1157 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1158 do_nodes $(comma_list $(osts_nodes)) \
1159 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
# Wait for the layout LFSCK status to become "completed"; the trailing 30 is
# presumably the wait limit in seconds - confirm against wait_update_facet.
1161 wait_update_facet $SINGLEMDS \
1162 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1163 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1164 error "(7) Failed to get expected 'completed'"
1166 run_test 9a "LFSCK speed control (1)"
# test_9b: speed control of the namespace LFSCK while it is in scanning-phase2.
# Files are created under a fail_loc, a first scan is run with another
# fail_loc until it reports "stopped", and then a speed-limited scan is
# measured through average_speed_phase2.
#
# Skip when /proc/cpuinfo has no "processor ... : 1" entry (treated as UP).
1169 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1170 skip "Testing on UP system, the speed may be inaccurate."
# Create 50 directories, 50 files, and 50 files inside each directory while
# fail_loc 0x1604 (OBD_FAIL_LFSCK_LINKEA_MORE) is set on the MDS.
1176 echo "Preparing another 50 * 50 files (with error) at $(date)."
1177 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1178 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1179 createmany -d $DIR/$tdir/d 50
1180 createmany -m $DIR/$tdir/f 50
1181 for ((i = 0; i < 50; i++)); do
1182 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# Run a namespace scan with fail_loc 0x160c (OBD_FAIL_LFSCK_NO_DOUBLESCAN) set
# and wait until it reports "stopped".
1185 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1186 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1187 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1188 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1189 mdd.${MDT_DEV}.lfsck_namespace |
1190 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1192 error "(5) unexpected status"
# Clear the fail_loc before the measured run.
1195 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1196 echo "Prepared at $(date)."
# First speed limit; note this start does not pass -r, unlike the one above.
1198 local BASE_SPEED1=50
1200 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
# The scan is expected to be in phase 2 when sampled here.
1203 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1204 [ "$STATUS" == "scanning-phase2" ] ||
1205 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
# Average phase-2 speed as reported in the lfsck_namespace parameter output.
1207 local SPEED=$($SHOW_NAMESPACE |
1208 awk '/^average_speed_phase2/ { print $2 }')
1209 # There may be time error, normally it should be less than 2 seconds.
1210 # We allow another 20% schedule error.
1212 # MAX_MARGIN = 1.3 = 13 / 10
# NOTE(review): the text above says 20% but the margin applied is 13/10 (30%).
# Upper bound for the first limit; integer arithmetic, left to right.
# NOTE(review): RUN_TIME1 and TIME_DIFF are assumed to be set earlier in this
# test - confirm.
1213 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) /
1214 RUN_TIME1 * 13 / 10))
1215 [ $SPEED -lt $MAX_SPEED ] || {
1217 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1218 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1221 # adjust speed limit
1222 local BASE_SPEED2=150
# Raise the limit on the running scan through the lfsck_speed_limit parameter.
1224 do_facet $SINGLEMDS \
1225 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-sample the average speed after the limit change.
1228 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1229 # MIN_MARGIN = 0.7 = 7 / 10
# Lower bound: time-weighted mix of both limits with TIME_DIFF subtracted from
# each run time, scaled by 7/10.
# NOTE(review): RUN_TIME2 is assumed to be set earlier in this test - confirm.
1230 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) +
1231 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) /
1232 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
1233 [ $SPEED -gt $MIN_SPEED ] || {
# On a non-ldiskfs MDT a too-low speed is reported through error_ignore and
# tagged with LU-5624; the (9.2) message belongs to the other branch.
1234 if [ $mds1_FSTYPE != ldiskfs ]; then
1235 error_ignore LU-5624 \
1236 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1239 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1243 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound for the mixed-limit run, with TIME_DIFF added to each run time.
1244 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) +
1245 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) /
1246 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1247 [ $SPEED -lt $MAX_SPEED ] || {
1249 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1250 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1251 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDS and OSS node (presumably "no limit"
# - confirm) and wait for the namespace scan to report "completed".
1254 do_nodes $(comma_list $(mdts_nodes)) \
1255 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1256 do_nodes $(comma_list $(osts_nodes)) \
1257 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1258 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1259 mdd.${MDT_DEV}.lfsck_namespace |
1260 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1262 error "(11) unexpected status"
1265 run_test 9b "LFSCK speed control (2)"
# test_10: ordinary file operations must keep working on the client while a
# speed-limited namespace LFSCK is in scanning-phase1.
#
# Only runs when the MDT backend is ldiskfs.
1269 [[ $mds1_FSTYPE == ldiskfs ]] || skip "lookup(..)/linkea on ZFS issue"
1273 echo "Preparing more files with error at $(date)."
# Even-numbered d<i>/f<i> entries are created under fail_loc 0x1603
# (OBD_FAIL_LFSCK_LINKEA_CRASH).
1274 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1275 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1277 for ((i = 0; i < 1000; i = $((i+2)))); do
1278 mkdir -p $DIR/$tdir/d${i}
1279 touch $DIR/$tdir/f${i}
1280 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd-numbered entries are created under fail_loc 0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE).
1283 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1284 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1286 for ((i = 1; i < 1000; i = $((i+2)))); do
1287 mkdir -p $DIR/$tdir/d${i}
1288 touch $DIR/$tdir/f${i}
1289 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Clear the fail_loc once the files are in place.
1292 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1293 echo "Prepared at $(date)."
# Add a second name (hard link) for f200 inside d200.
1295 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1297 umount_client $MOUNT
1298 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the namespace scan with a speed limit of 100 so that it is still in
# phase 1 while the operations below run.
1300 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1303 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1304 [ "$STATUS" == "scanning-phase1" ] ||
1305 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Exercise listing, create, mkdir, unlink, recursive remove, rename, hard link
# and symlink on the client; each one must succeed.
1307 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1309 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1311 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1313 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1315 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1317 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1319 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1321 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1322 error "(14) Fail to softlink!"
# The scan must still be in phase 1, i.e. the operations above really did
# overlap with it.
1324 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1325 [ "$STATUS" == "scanning-phase1" ] ||
1326 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 on every MDS and OSS node (presumably "no limit"
# - confirm) and wait for the scan to report "completed".
1328 do_nodes $(comma_list $(mdts_nodes)) \
1329 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1330 do_nodes $(comma_list $(osts_nodes)) \
1331 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1332 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1333 mdd.${MDT_DEV}.lfsck_namespace |
1334 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1336 error "(16) unexpected status"
1339 run_test 10 "System is available during LFSCK scanning"
# Remove the LAST_ID file (and the d0/0 entry) under O/<idx> on one OST.
# The OST facet is mounted locally with mount_fstype, the two paths are
# removed on the OST node, and the facet is unmounted again.
# Returns 1 if the local mount fails and 2 if the unmount fails.
# NOTE(review): ${ost} and ${idx} are presumably taken from $1 and $2 (the
# caller in this file passes "1 0") - confirm.
1342 ost_remove_lastid() {
# All remote commands for this OST go through do_facet.
1345 local rcmd="do_facet ost${ost}"
1347 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1349 # step 1: local mount
1350 mount_fstype ost${ost} || return 1
1351 # step 2: remove the specified LAST_ID
# The {LAST_ID,d0/0} brace list is unquoted, so the local shell expands it
# into two paths before the command is handed to do_facet.
1352 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1354 unmount_fstype ost${ost} || return 2
# test_11a: after LAST_ID has been removed from ost1, a layout LFSCK started
# on the OST must flag the condition (crashed_lastid) and then rebuild it,
# ending with empty flags.
#
# Requires an MDS newer than 2.5.55.
1358 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1359 skip "MDS older than 2.5.55, LU-1267"
# Create 64 single-stripe files on OST index 0.
1361 check_mount_and_prep
1362 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1363 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Delete LAST_ID for O/0 on ost1, then start ost1 with the noscrub mount
# options.
1368 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1370 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1371 error "(2) Fail to start ost1"
# fail_loc 0x160e (OBD_FAIL_LFSCK_DELAY4) with fail_val=3 is set on ost1
# before the scan is started.
1373 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1374 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1376 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1377 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# The OST-side lfsck_layout flags must show crashed_lastid first.
1379 wait_update_facet ost1 "$LCTL get_param -n \
1380 obdfilter.${OST_DEV}.lfsck_layout |
1381 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1383 error "(5) unexpected status"
# Clear the fail_loc and wait for the scan to report "completed".
1386 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1388 wait_update_facet ost1 "$LCTL get_param -n \
1389 obdfilter.${OST_DEV}.lfsck_layout |
1390 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1392 error "(6) unexpected status"
# After completion the flags field must be empty again.
1395 echo "the LAST_ID(s) should have been rebuilt"
1396 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1397 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1399 run_test 11a "LFSCK can rebuild lost last_id"
# test_11b: when the on-disk LAST_ID on ost1 is behind the ID the MDT has
# already used, a layout LFSCK on the OST must bring it back to at least that
# value.
#
# Requires an MDS newer than 2.5.55.
1402 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1403 skip "MDS older than 2.5.55, LU-1267"
1405 check_mount_and_prep
1406 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1408 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1409 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1410 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than precreated_ost_obj_count reports.
1412 local count=$(precreated_ost_obj_count 0 0)
1414 createmany -o $DIR/$tdir/f $((count + 32))
# Record the sequence and the last ID the MDT-side OSP device reports for
# OST0000.
1416 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1417 local seq=$(do_facet mds1 $LCTL get_param -n \
1418 osp.${proc_path}.prealloc_last_seq)
1419 local id_used=$(do_facet mds1 $LCTL get_param -n \
1420 osp.${proc_path}.prealloc_last_id)
# Restart ost1 with fail_loc 0x215 (OBD_FAIL_OST_ENOSPC) set.
1422 umount_client $MOUNT
1423 stop ost1 || error "(1) Fail to stop ost1"
1425 #define OBD_FAIL_OST_ENOSPC 0x215
1426 do_facet ost1 $LCTL set_param fail_loc=0x215
1428 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1429 error "(2) Fail to start ost1"
# Poll (up to 60 iterations) until the OST reports a last_id for $seq. The awk
# program is double-quoted so the shell substitutes $seq into the pattern.
# NOTE(review): id_ost1 and i are not declared local in the visible lines.
1431 for ((i = 0; i < 60; i++)); do
1432 id_ost1=$(do_facet ost1 \
1433 "$LCTL get_param -n obdfilter.$ost1_svc.last_id" |
1434 awk -F: "/$seq/ { print \$2 }")
1435 [ -n "$id_ost1" ] && break
# The MDT-side ID must be ahead of what the OST has on disk.
1439 echo "the on-disk LAST_ID should be smaller than the expected one"
1440 [ $id_used -gt $id_ost1 ] ||
1441 error "(4) expect id_used '$id_used' > id_ost1 '$id_ost1'"
1443 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1444 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1446 wait_update_facet ost1 \
1447 "$LCTL get_param -n obdfilter.$ost1_svc.lfsck_layout |
1448 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1450 error "(6) unexpected status"
# Restart ost1 again before reading last_id back.
1453 stop ost1 || error "(7) Fail to stop ost1"
1455 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1456 error "(8) Fail to start ost1"
1458 echo "the on-disk LAST_ID should have been rebuilt"
1459 # last_id may be larger than $id_used if objects were created/skipped
# Wait until last_id for $seq is >= id_used; on failure dump last_id first.
1460 wait_update_facet_cond ost1 \
1461 "$LCTL get_param -n obdfilter.$ost1_svc.last_id |
1462 awk -F: '/$seq/ { print \\\$2 }'" "-ge" "$id_used" 60 || {
1463 do_facet ost1 $LCTL get_param obdfilter.$ost1_svc.last_id
1464 error "(9) expect last_id >= id_used $seq:$id_used"
# Clear the fail_loc and stop all services.
1467 do_facet ost1 $LCTL set_param fail_loc=0
1468 stopall || error "(10) Fail to stopall"
1470 run_test 11b "LFSCK can rebuild crashed last_id"
# test_12a: one lctl command issued on mds1 with -A must start and stop LFSCK
# on all targets, first for the namespace component and then for layout.
#
# Requires at least 2 MDTs and an MDS newer than 2.5.55.
1473 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1474 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1475 skip "MDS older than 2.5.55, LU-3950"
# One directory per MDT (index k-1), each with 100 files.
1477 check_mount_and_prep
1478 for k in $(seq $MDSCOUNT); do
1479 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1480 createmany -o $DIR/$tdir/${k}/f 100 ||
1481 error "(0) Fail to create 100 files."
# Namespace: start throttled (-s 1), expect scanning-phase1 everywhere.
1484 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1485 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1486 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1488 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1489 wait_all_targets namespace scanning-phase1 3
# Namespace: stop everywhere with one command, expect "stopped".
1491 echo "Stop namespace LFSCK on all targets by single lctl command."
1492 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1493 error "(4) Fail to stop LFSCK on all devices!"
1495 echo "All the LFSCK targets should be in 'stopped' status."
1496 wait_all_targets_blocked namespace stopped 5
# Namespace: restart with -s 0 and expect "completed".
1498 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1499 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1500 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1502 echo "All the LFSCK targets should be in 'completed' status."
1503 wait_all_targets_blocked namespace completed 7
# Full debug logging is enabled for the layout half of the test.
1505 start_full_debug_logging
# Layout: same start / stop / restart sequence as above.
1507 echo "Start layout LFSCK on all targets by single command (-s 1)."
1508 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1509 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1511 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1512 wait_all_targets layout scanning-phase1 9
1514 echo "Stop layout LFSCK on all targets by single lctl command."
1515 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1516 error "(10) Fail to stop LFSCK on all devices!"
1518 echo "All the LFSCK targets should be in 'stopped' status."
1519 wait_all_targets_blocked layout stopped 11
# Additionally read the layout LFSCK status directly from every OST.
1521 for k in $(seq $OSTCOUNT); do
1522 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1523 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1524 awk '/^status/ { print $2 }')
1525 [ "$STATUS" == "stopped" ] ||
1526 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1529 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1530 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1531 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1533 echo "All the LFSCK targets should be in 'completed' status."
1534 wait_all_targets_blocked layout completed 14
1536 stop_full_debug_logging
1538 run_test 12a "single command to trigger LFSCK on all devices"
# test_12b: lfsck_start must work without -M when -A is given, and must fail
# without both -M and -A on a node that hosts more than one MDT device.
#
# Requires an MDS newer than 2.5.55.
1541 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1542 skip "MDS older than 2.5.55, LU-3950"
1544 check_mount_and_prep
1546 echo "Start LFSCK without '-M' specified."
1547 do_facet mds1 $LCTL lfsck_start -A -r ||
1548 error "(0) Fail to start LFSCK without '-M'"
# No -t was given above; both components are expected to reach "completed".
1550 wait_all_targets_blocked namespace completed 1
1551 wait_all_targets_blocked layout completed 2
# Count the "lctl dl" entries on mds1 whose third column contains "mdt".
1553 local count=$(do_facet mds1 $LCTL dl |
1554 awk '{ print $3 }' | grep mdt | wc -l)
1555 if [ $count -gt 1 ]; then
1557 echo "Start layout LFSCK on the node with multipe targets,"
1558 echo "but not specify '-M'/'-A' option. Should get failure."
# Success of lfsck_start is the failure case here; "|| true" keeps the
# expected non-zero status from propagating.
1560 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1561 error "(3) Start layout LFSCK should fail" || true
1564 run_test 12b "auto detect Lustre device"
# test_13: files created while fail_loc 0x160f (OBD_FAIL_LFSCK_BAD_LMMOI) is
# set must be found by the layout LFSCK and counted in repaired_others.
#
# Requires an MDS newer than 2.5.55.
1567 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1568 skip "MDS older than 2.5.55, LU-3593"
1571 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1572 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1573 echo "MDT-object FID."
1576 check_mount_and_prep
1578 echo "Inject failure stub to simulate bad lmm_oi"
1579 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1580 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
# Two files are created under the fail_loc: one via createmany and one
# composite (PFL) file f1 via "lfs setstripe -E".
1581 createmany -o $DIR/$tdir/f 1
1582 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1583 error "(0) Fail to create PFL $DIR/$tdir/f1"
1584 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1586 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1587 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1589 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1590 mdd.${MDT_DEV}.lfsck_layout |
1591 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1593 error "(2) unexpected status"
# Exactly 2 repairs are expected, matching the two files created above.
1596 local repaired=$($SHOW_LAYOUT |
1597 awk '/^repaired_others/ { print $2 }')
1598 [ $repaired -eq 2 ] ||
1599 error "(3) Fail to repair crashed lmm_oi: $repaired"
1601 run_test 13 "LFSCK can repair crashed lmm_oi"
# test_14a: MDT-objects whose OST-objects are missing (dangling references)
# must be detected by a default layout LFSCK run and repaired by a second run
# started with -c.
#
# Requires an MDS newer than 2.5.55.
1604 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1605 skip "MDS older than 2.5.55, LU-3590"
1608 echo "The OST-object referenced by the MDT-object should be there;"
1609 echo "otherwise, the LFSCK should re-create the missing OST-object."
1610 echo "without '--delay-create-ostobj' option."
1613 check_mount_and_prep
1614 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1616 echo "Inject failure stub to simulate dangling referenced MDT-object"
1617 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1618 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 16 more plain files than precreated_ost_obj_count reports, then a
# guard file.
1619 local count=$(precreated_ost_obj_count 0 0)
1621 createmany -o $DIR/$tdir/f $((count + 16)) ||
1622 error "(0.1) Fail to create $DIR/$tdir/fx"
1623 touch $DIR/$tdir/guard0
# Create 16 composite files (two components, the first placed on OST 0 via
# "-o 0") and a second guard file.
1625 for ((i = 0; i < 16; i++)); do
1626 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1627 $DIR/$tdir/f_comp${i} ||
1628 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1630 touch $DIR/$tdir/guard1
1632 do_facet ost1 $LCTL set_param fail_loc=0
1634 start_full_debug_logging
1636 # exhaust other pre-created dangling cases
1637 count=$(precreated_ost_obj_count 0 0)
1638 createmany -o $DIR/$tdir/a $count ||
1639 error "(0.5) Fail to create $count files."
# Here success of ls is the failure case.
1641 echo "'ls' should fail because of dangling referenced MDT-object"
1642 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: default options.
1644 echo "Trigger layout LFSCK to find out dangling reference"
1645 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1647 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1648 mdd.${MDT_DEV}.lfsck_layout |
1649 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1651 error "(3) unexpected status"
# At least 32 dangling references must be counted after the first pass.
1654 local repaired=$($SHOW_LAYOUT |
1655 awk '/^repaired_dangling/ { print $2 }')
1656 [ $repaired -ge 32 ] ||
1657 error "(4) Fail to repair dangling reference: $repaired"
# The guard files must still be unusable after the default pass.
1659 echo "'stat' should fail because of not repair dangling by default"
1660 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1661 error "(5.1) stat should fail"
1662 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1663 error "(5.2) stat should fail"
# Second pass adds -c (presumably "create the missing OST-objects" - confirm
# against the lfsck_start usage).
1665 echo "Trigger layout LFSCK to repair dangling reference"
1666 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1668 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1669 mdd.${MDT_DEV}.lfsck_layout |
1670 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1672 error "(7) unexpected status"
1675 # There may be some async LFSCK updates in processing, wait for
1676 # a while until the target reparation has been done. LU-4970.
# Poll stat on each guard file until the Size field reads 0; on timeout print
# the stat output before failing.
1678 echo "'stat' should success after layout LFSCK repairing"
1679 wait_update_facet client "stat $DIR/$tdir/guard0 |
1680 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1681 stat $DIR/$tdir/guard0
1683 error "(8.1) unexpected size"
1686 wait_update_facet client "stat $DIR/$tdir/guard1 |
1687 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1688 stat $DIR/$tdir/guard1
1690 error "(8.2) unexpected size"
# The second pass must again report at least 32 repaired references.
1693 repaired=$($SHOW_LAYOUT |
1694 awk '/^repaired_dangling/ { print $2 }')
1695 [ $repaired -ge 32 ] ||
1696 error "(9) Fail to repair dangling reference: $repaired"
1698 stop_full_debug_logging
# Bring the filesystem back up for the following tests.
1700 echo "stopall to cleanup object cache"
1703 setupall > /dev/null
1705 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# test_14b: same dangling-reference scenario as test 14a, but both layout
# LFSCK runs are started with the additional -o and -d options.
#
# Requires an MDS newer than 2.5.55.
1708 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1709 skip "MDS older than 2.5.55, LU-3590"
1712 echo "The OST-object referenced by the MDT-object should be there;"
1713 echo "otherwise, the LFSCK should re-create the missing OST-object."
1714 echo "with '--delay-create-ostobj' option."
1717 check_mount_and_prep
1718 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1720 echo "Inject failure stub to simulate dangling referenced MDT-object"
1721 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1722 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 31 more files than precreated_ost_obj_count reports, plus one guard
# file, while the fail_loc is active.
1723 local count=$(precreated_ost_obj_count 0 0)
1725 createmany -o $DIR/$tdir/f $((count + 31))
1726 touch $DIR/$tdir/guard
1727 do_facet ost1 $LCTL set_param fail_loc=0
1729 start_full_debug_logging
1731 # exhaust other pre-created dangling cases
1732 count=$(precreated_ost_obj_count 0 0)
1733 createmany -o $DIR/$tdir/a $count ||
1734 error "(0) Fail to create $count files."
# Here success of ls is the failure case.
1736 echo "'ls' should fail because of dangling referenced MDT-object"
1737 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: -o -d but no -c. Presumably -d is the short form of the
# '--delay-create-ostobj' option named in the banner above - confirm.
1739 echo "Trigger layout LFSCK to find out dangling reference"
1740 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1742 wait_all_targets_blocked layout completed 3
# At least 32 dangling references must be counted after the first pass.
1744 local repaired=$($SHOW_LAYOUT |
1745 awk '/^repaired_dangling/ { print $2 }')
1746 [ $repaired -ge 32 ] ||
1747 error "(4) Fail to repair dangling reference: $repaired"
# The guard file must still be unusable after the first pass.
1749 echo "'stat' should fail because of not repair dangling by default"
1750 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second pass adds -c.
1752 echo "Trigger layout LFSCK to repair dangling reference"
1753 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1755 wait_all_targets_blocked layout completed 7
1757 # There may be some async LFSCK updates in processing, wait for
1758 # a while until the target reparation has been done. LU-4970.
# Poll stat on the guard file until the Size field reads 0; on timeout print
# the stat output before failing.
1760 echo "'stat' should success after layout LFSCK repairing"
1761 wait_update_facet client "stat $DIR/$tdir/guard |
1762 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1763 stat $DIR/$tdir/guard
1765 error "(8) unexpected size"
# The second pass must again report at least 32 repaired references.
1768 repaired=$($SHOW_LAYOUT |
1769 awk '/^repaired_dangling/ { print $2 }')
1770 [ $repaired -ge 32 ] ||
1771 error "(9) Fail to repair dangling reference: $repaired"
1773 stop_full_debug_logging
# Bring the filesystem back up for the following tests.
1775 echo "stopall to cleanup object cache"
1778 setupall > /dev/null
1780 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# test_15a: OST-objects written while fail_loc 0x1611
# (OBD_FAIL_LFSCK_UNMATCHED_PAIR1) is set on the OSS nodes must be counted as
# repaired_unmatched_pair by the layout LFSCK.
#
# Requires an MDS newer than 2.5.55.
1783 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1784 skip "MDS older than 2.5.55, LU-3591"
1787 echo "If the OST-object referenced by the MDT-object back points"
1788 echo "to some non-exist MDT-object, then the LFSCK should repair"
1789 echo "the OST-object to back point to the right MDT-object."
1792 check_mount_and_prep
1793 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1795 echo "Inject failure stub to make the OST-object to back point to"
1796 echo "non-exist MDT-object."
1797 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
# The fail_loc is set on every OSS node, then a plain file f0 and a composite
# (PFL) file are written.
1799 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1800 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1801 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1803 error "(0) Fail to create PFL $DIR/$tdir/f1"
1804 # 'dd' will trigger punch RPC firstly on every OST-objects.
1805 # So even though some OST-object will not be write by 'dd',
1806 # as long as it is allocated (may be NOT allocated in pfl_3b)
1807 # its layout information will be set also.
1808 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
# Drop the client's osc locks, then clear the fail_loc.
1809 cancel_lru_locks osc
1810 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1812 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1813 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1815 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1816 mdd.${MDT_DEV}.lfsck_layout |
1817 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1819 error "(2) unexpected status"
# At least 3 repaired pairs are required.
1822 local repaired=$($SHOW_LAYOUT |
1823 awk '/^repaired_unmatched_pair/ { print $2 }')
1824 [ $repaired -ge 3 ] ||
1825 error "(3) Fail to repair unmatched pair: $repaired"
1827 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# test_15b: OST-objects written while fail_loc 0x1612
# (OBD_FAIL_LFSCK_UNMATCHED_PAIR2) is set on the OSS nodes must be counted as
# repaired_unmatched_pair by the layout LFSCK.
#
# Requires an MDS newer than 2.5.55.
1830 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1831 skip "MDS older than 2.5.55, LU-3591"
1834 echo "If the OST-object referenced by the MDT-object back points"
1835 echo "to other MDT-object that doesn't recognize the OST-object,"
1836 echo "then the LFSCK should repair it to back point to the right"
1837 echo "MDT-object (the first one)."
# A guard file is written before any fault is injected.
1840 check_mount_and_prep
1841 mkdir -p $DIR/$tdir/0
1842 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1843 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1844 cancel_lru_locks osc
1846 echo "Inject failure stub to make the OST-object to back point to"
1847 echo "other MDT-object"
# Use 2 stripes when at least 2 OSTs exist.
# NOTE(review): the default value of $stripes is assumed to be set earlier in
# this test - confirm.
1850 [ $OSTCOUNT -ge 2 ] && stripes=2
1852 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
# The fail_loc is set on every OSS node, then a plain file and a composite
# (PFL) file are written.
1853 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1854 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1855 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1857 error "(0) Fail to create PFL $DIR/$tdir/f1"
1858 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
# Drop the client's osc locks, then clear the fail_loc.
1859 cancel_lru_locks osc
1860 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1862 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1863 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1865 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1866 mdd.${MDT_DEV}.lfsck_layout |
1867 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1869 error "(2) unexpected status"
# Exactly 4 repaired pairs are expected.
1872 local repaired=$($SHOW_LAYOUT |
1873 awk '/^repaired_unmatched_pair/ { print $2 }')
1874 [ $repaired -eq 4 ] ||
1875 error "(3) Fail to repair unmatched pair: $repaired"
1877 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# test_15c: a layout LFSCK racing with a delayed directory migration must not
# treat the shared layout as a multiple-reference case.
#
# Requires at least 2 MDTs. The two version checks together restrict the test
# to MDS versions strictly between 2.5.55 and 2.7.55.
1880 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1881 (( $MDS1_VERSION < $(version_code 2.7.55) )) ||
1882 skip "MDS newer than 2.7.55, LU-6475"
1883 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1884 skip "MDS older than 2.5.55, LU-3591"
1887 echo "According to current metadata migration implementation,"
1888 echo "before the old MDT-object is removed, both the new MDT-object"
1889 echo "and old MDT-object will reference the same LOV layout. Then if"
1890 echo "the layout LFSCK finds the new MDT-object by race, it will"
1891 echo "regard related OST-object(s) as multiple referenced case, and"
1892 echo "will try to create new OST-object(s) for the new MDT-object."
1893 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1894 echo "MDT-object before confirm the multiple referenced case."
# Directory a1 lives on MDT index 1 and holds one 1 MiB single-stripe file.
1897 check_mount_and_prep
1898 $LFS mkdir -i 1 $DIR/$tdir/a1
1899 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1900 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1901 cancel_lru_locks osc
1903 echo "Inject failure stub on MDT1 to delay the migration"
1905 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1906 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# The migration is started in the background so the scan below can overlap
# with it.
1907 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1908 $LFS migrate -m 0 $DIR/$tdir/a1 &
1911 echo "Trigger layout LFSCK to race with the migration"
1912 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1914 wait_all_targets_blocked layout completed 2
# Clear the fail_loc on mds2 before checking the counters.
1916 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Exactly one unmatched pair is expected to have been repaired ...
1917 local repaired=$($SHOW_LAYOUT |
1918 awk '/^repaired_unmatched_pair/ { print $2 }')
1919 [ $repaired -eq 1 ] ||
1920 error "(3) Fail to repair unmatched pair: $repaired"
# ... and no multiple-reference repair may have happened.
1922 repaired=$($SHOW_LAYOUT |
1923 awk '/^repaired_multiple_referenced/ { print $2 }')
1924 [ $repaired -eq 0 ] ||
1925 error "(4) Unexpectedly repaird multiple references: $repaired"
1927 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# test_15d: a namespace LFSCK run after a directory migration that was
# disturbed by injected errors must complete; the test only checks that
# nothing crashes, not that every file is repaired.
#
# Requires at least 2 MDTs.
1930 (( $MDSCOUNT > 1 )) || skip "needs >= 2 MDTs"
1932 check_mount_and_prep
# Striped directory ("-c -1") whose default layout for new subdirectories is
# "-i -1 -c 1", filled with 100 files and 100 subdirectories.
1934 $LFS mkdir -c -1 $DIR/$tdir || error "create $tdir failed"
1935 $LFS setdirstripe -D -i -1 -c 1 $DIR/$tdir ||
1936 error "setdirstripe failed"
1938 createmany -o $DIR/$tdir/f 100 || error "create sub files failed"
1939 createmany -d $DIR/$tdir/s 100 || error "create sub dirs failed"
# The migration runs in the background so the fault injection overlaps it.
1941 echo "Migrate $DIR/$tdir to MDT1"
1942 $LFS migrate -m 1 $DIR/$tdir &
1946 # fail sub transactions on random MDTs, which may cause some file
1948 #define OBD_FAIL_OUT_EIO 0x1709
# MDS facets are numbered from 1 (mds1 .. mds$MDSCOUNT) everywhere else in
# this file; iterating from 0 addressed a non-existent "mds0" facet and never
# reached the last MDS.
1949 for ((i = 1; i <= $MDSCOUNT; i++)); do
1950 do_facet mds$i $LCTL set_param fail_loc=0x1709
1952 do_facet mds$i $LCTL set_param fail_loc=0
1957 # LFSCK can't fully fix migrating directories, and may leave some
1958 # files inaccessible, but it shouldn't cause crash
1959 $START_NAMESPACE -A -r ||
1960 error "Fail to start LFSCK for namespace"
1962 wait_all_targets_blocked namespace completed 1
1964 # resume migration may fail because some file may be inaccessible, but
1965 # it shouldn't cause crash
1966 $LFS migrate -m 1 $DIR/$tdir
1968 # rm $tdir to avoid cleanup failure in the end
1970 $LFS rm_entry $DIR/$tdir/*
# Reformat so that later tests start from a clean filesystem.
1972 REFORMAT="yes" cleanup_and_setup_lustre
1974 run_test 15d "LFSCK don't crash upon dir migration failure"
# test_16: when an OST-object's owner differs from the owner recorded on the
# MDT-object, the layout LFSCK must repair it and report exactly one
# repaired_inconsistent_owner.
#
# Requires an MDS newer than 2.5.55.
1977 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1978 skip "MDS older than 2.5.55, LU-3594"
1981 echo "If the OST-object's owner information does not match the owner"
1982 echo "information stored in the MDT-object, then the LFSCK trust the"
1983 echo "MDT-object and update the OST-object's owner information."
# f0 is a 1 MiB single-stripe file on OST index 0.
1986 check_mount_and_prep
1987 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1988 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1989 cancel_lru_locks osc
1991 # created but no setattr or write to the file.
# 100 files are created as the RUNAS user inside d1; they are never written
# to or chown'ed afterwards.
1993 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
1994 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
1996 echo "Inject failure stub to skip OST-object owner changing"
1997 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1998 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
# Use the POSIX "owner:group" separator; the "owner.group" form is an
# obsolescent GNU extension.
1999 chown 1:1 $DIR/$tdir/f0
2000 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2002 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
2005 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
2007 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2008 mdd.${MDT_DEV}.lfsck_layout |
2009 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2011 error "(2) unexpected status"
# Only f0 is expected to be counted; the RUNAS-created files must not be.
2014 local repaired=$($SHOW_LAYOUT |
2015 awk '/^repaired_inconsistent_owner/ { print $2 }')
2016 [ $repaired -eq 1 ] ||
2017 error "(3) Fail to repair inconsistent owner: $repaired"
2019 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# test_17: files created while fail_loc 0x1614 (OBD_FAIL_LFSCK_MULTIPLE_REF)
# is set share OST-objects with a guard file; after the layout LFSCK repairs
# them, writing to those files must no longer change the guard file's size.
#
# Requires an MDS newer than 2.5.55.
2022 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2023 skip "MDS older than 2.5.55, LU-3594"
2026 echo "If more than one MDT-objects reference the same OST-object,"
2027 echo "and the OST-object only recognizes one MDT-object, then the"
2028 echo "LFSCK should create new OST-objects for such non-recognized"
2032 check_mount_and_prep
2033 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2035 echo "Inject failure stub to make two MDT-objects to refernce"
2036 echo "the OST-object"
2038 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
2039 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# The guard file is written first (1 MiB); client locks are dropped after
# every creation step.
2040 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
2041 cancel_lru_locks mdc
2042 cancel_lru_locks osc
# f0: plain file created via createmany.
2044 createmany -o $DIR/$tdir/f 1
2045 cancel_lru_locks mdc
2046 cancel_lru_locks osc
# f1: composite (PFL) file with both components placed on OST 0.
2048 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
2050 error "(0) Fail to create PFL $DIR/$tdir/f1"
2051 cancel_lru_locks mdc
2052 cancel_lru_locks osc
2053 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Before the repair both files report the guard's size (1048576 bytes, taken
# from column 5 of "ls -l") although nothing was written to them.
2055 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
2056 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
2057 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
2058 [ $size -eq 1048576 ] ||
2059 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
2061 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
2062 [ $size -eq 1048576 ] ||
2063 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
2065 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
2068 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
2070 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2071 mdd.${MDT_DEV}.lfsck_layout |
2072 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2074 error "(3) unexpected status"
# Exactly 2 repairs are expected, one per file created under the fail_loc.
2077 local repaired=$($SHOW_LAYOUT |
2078 awk '/^repaired_multiple_referenced/ { print $2 }')
2079 [ $repaired -eq 2 ] ||
2080 error "(4) Fail to repair multiple references: $repaired"
# Writing 2 MiB to f0 must leave the guard file at 1 MiB.
2082 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
2083 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
2084 error "(5) Fail to write f0."
2085 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2086 [ $size -eq 1048576 ] ||
2087 error "(6) guard size should be 1048576, but got $size"
# Same check for f1.
2089 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
2090 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
2091 error "(7) Fail to write f1."
2092 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2093 [ $size -eq 1048576 ] ||
2094 error "(8) guard size should be 1048576, but got $size"
2096 run_test 17 "LFSCK can repair multiple references"
# Apply debug=+cache through $LCTL directly (not via do_facet), discarding
# the command's stdout.
2098 $LCTL set_param debug=+cache > /dev/null
# Body of the test registered below as "18a" (its function header, closing
# brace and the fi/done lines are not part of the visible text).
# Scenario, as stated by the echo messages: the MDT-objects are still there
# but their stripe information is lost; layout LFSCK must regenerate the
# missing layout EA entries so the file sizes become correct again.
# Skip unless the MDS version is newer than 2.5.55.
2101 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2102 skip "MDS older than 2.5.55, LU-3336"
2105 echo "The target MDT-object is there, but related stripe information"
2106 echo "is lost or partly lost. The LFSCK should regenerate the missing"
2107 echo "layout EA entries."
# File 1: a1/f1, directory created with -i 0, one stripe on OST index 0,
# 2 MiB of data.
2110 check_mount_and_prep
2111 $LFS mkdir -i 0 $DIR/$tdir/a1
2112 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2113 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With "ls -il" the inode number is printed first, so the size is field 6.
2115 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2117 $LFS path2fid $DIR/$tdir/a1/f1
2118 $LFS getstripe $DIR/$tdir/a1/f1
# File 2 (only with two or more MDTs): a2/f2, directory created with -i 1,
# two stripes, also 2 MiB of data.
2120 if [ $MDSCOUNT -ge 2 ]; then
2121 $LFS mkdir -i 1 $DIR/$tdir/a2
2122 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2123 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2124 $LFS path2fid $DIR/$tdir/a2/f2
2125 $LFS getstripe $DIR/$tdir/a2/f2
# File 3: f3 with a two-component PFL layout, 2 MiB of data.
2128 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2129 error "(0) Fail to create PFL $DIR/$tdir/f3"
2131 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2133 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2135 $LFS path2fid $DIR/$tdir/f3
2136 $LFS getstripe $DIR/$tdir/f3
2138 cancel_lru_locks osc
2140 echo "Inject failure, to make the MDT-object lost its layout EA"
# The chown calls below run while fail_loc 0x1615 is armed on the MDS(s);
# the fail_loc is reset to 0 afterwards.
2141 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2142 do_facet mds1 $LCTL set_param fail_loc=0x1615
2143 chown 1.1 $DIR/$tdir/a1/f1
2145 if [ $MDSCOUNT -ge 2 ]; then
2146 do_facet mds2 $LCTL set_param fail_loc=0x1615
2147 chown 1.1 $DIR/$tdir/a2/f2
2150 chown 1.1 $DIR/$tdir/f3
2155 do_facet mds1 $LCTL set_param fail_loc=0
2156 if [ $MDSCOUNT -ge 2 ]; then
2157 do_facet mds2 $LCTL set_param fail_loc=0
2160 cancel_lru_locks mdc
2161 cancel_lru_locks osc
2163 echo "The file size should be incorrect since layout EA is lost"
2164 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2165 [ "$cur_size" != "$saved_size1" ] ||
2166 error "(1) Expect incorrect file1 size"
# f2 is checked against saved_size1 as well: it was written with the same
# "bs=1M count=2" dd as f1.
2168 if [ $MDSCOUNT -ge 2 ]; then
2169 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2170 [ "$cur_size" != "$saved_size1" ] ||
2171 error "(2) Expect incorrect file2 size"
2174 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2175 [ "$cur_size" != "$saved_size2" ] ||
2176 error "(1.2) Expect incorrect file3 size"
2178 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2179 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
# Wait on every MDT until its lfsck_layout status reads "completed"
# ($LTIME is passed as the last argument, presumably the wait limit).
2181 for k in $(seq $MDSCOUNT); do
2182 # The LFSCK status query interval is 30 seconds. For the case
2183 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2184 # time to guarantee the status sync up.
2185 wait_update_facet mds${k} "$LCTL get_param -n \
2186 mdd.$(facet_svc mds${k}).lfsck_layout |
2187 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2188 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" too; this is a single read, not a wait.
2191 for k in $(seq $OSTCOUNT); do
2192 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2193 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2194 awk '/^status/ { print $2 }')
2195 [ "$cur_status" == "completed" ] ||
2196 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counters: 3 on mds1 and, when present, 2 on mds2.
2199 local repaired=$(do_facet mds1 $LCTL get_param -n \
2200 mdd.$(facet_svc mds1).lfsck_layout |
2201 awk '/^repaired_orphan/ { print $2 }')
2202 [ $repaired -eq 3 ] ||
2203 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2205 if [ $MDSCOUNT -ge 2 ]; then
2206 repaired=$(do_facet mds2 $LCTL get_param -n \
2207 mdd.$(facet_svc mds2).lfsck_layout |
2208 awk '/^repaired_orphan/ { print $2 }')
2209 [ $repaired -eq 2 ] ||
2210 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print FID and layout of each file again after the repair.
2213 $LFS path2fid $DIR/$tdir/a1/f1
2214 $LFS getstripe $DIR/$tdir/a1/f1
2216 if [ $MDSCOUNT -ge 2 ]; then
2217 $LFS path2fid $DIR/$tdir/a2/f2
2218 $LFS getstripe $DIR/$tdir/a2/f2
2221 $LFS path2fid $DIR/$tdir/f3
2222 $LFS getstripe $DIR/$tdir/f3
2224 echo "The file size should be correct after layout LFSCK scanning"
2225 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2226 [ "$cur_size" == "$saved_size1" ] ||
2227 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2229 if [ $MDSCOUNT -ge 2 ]; then
2230 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2231 [ "$cur_size" == "$saved_size1" ] ||
2232 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This last check is on f3, so the failure message names file3.
2235 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2236 [ "$cur_size" == "$saved_size2" ] ||
2237 error "(9) Expect file3 size $saved_size2, but got $cur_size"
# Register the test under number 18a with its description.
2239 run_test 18a "Find out orphan OST-object and repair it (1)"
# Body of the test registered below as "18b" (its function header, closing
# brace and the fi/done lines are not part of the visible text).
# Scenario, as stated by the echo messages: the MDT-objects are lost; layout
# LFSCK must re-create them under .lustre/lost+found/MDTxxxx, from where
# they can be moved back into the namespace by hand.
# Skipped when $FILESET is non-empty or the MDS is not newer than 2.5.55.
2242 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2243 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2244 skip "MDS older than 2.5.55, LU-3336"
2247 echo "The target MDT-object is lost. The LFSCK should re-create the"
2248 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2249 echo "can move it back to normal namespace manually."
# File 1: a1/f1, one stripe on OST index 0, 2 MiB. Its size (field 6 of
# "ls -il") and FID are saved; the FID is used for the lost+found name below.
2252 check_mount_and_prep
2253 $LFS mkdir -i 0 $DIR/$tdir/a1
2254 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2255 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2256 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2257 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2259 $LFS getstripe $DIR/$tdir/a1/f1
# File 2 (only with two or more MDTs): a2/f2, two stripes, 2 MiB.
# fid2 is declared local like fid1 and fid3 so it does not leak out of the
# test.
2261 if [ $MDSCOUNT -ge 2 ]; then
2262 $LFS mkdir -i 1 $DIR/$tdir/a2
2263 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2264 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2265 local fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2267 $LFS getstripe $DIR/$tdir/a2/f2
# File 3: f3 with a two-component PFL layout, 2 MiB.
2270 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2271 error "(0) Fail to create PFL $DIR/$tdir/f3"
2273 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2275 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2276 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2278 $LFS getstripe $DIR/$tdir/f3
2280 cancel_lru_locks osc
2282 echo "Inject failure, to simulate the case of missing the MDT-object"
# The rm calls below run while fail_loc 0x1616 is armed on the MDS(s); the
# fail_loc is reset to 0 afterwards. (Some lines of this step are not
# visible here.)
2283 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2284 do_facet mds1 $LCTL set_param fail_loc=0x1616
2285 rm -f $DIR/$tdir/a1/f1
2287 if [ $MDSCOUNT -ge 2 ]; then
2288 do_facet mds2 $LCTL set_param fail_loc=0x1616
2289 rm -f $DIR/$tdir/a2/f2
2297 do_facet mds1 $LCTL set_param fail_loc=0
2298 if [ $MDSCOUNT -ge 2 ]; then
2299 do_facet mds2 $LCTL set_param fail_loc=0
2302 cancel_lru_locks mdc
2303 cancel_lru_locks osc
2305 # dryrun mode only checks orphans, it does not repair them
2306 echo "Trigger layout LFSCK --dryrun to find out orphan OST-object"
2307 $START_LAYOUT --dryrun -o -r ||
2308 error "Fail to start layout LFSCK in dryrun mode"
2309 wait_all_targets_blocked layout completed 2
# The "param" line must list exactly the options given above.
2311 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
2312 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
2313 error "Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# The dry run must report 3 inconsistent orphans on mds1.
2315 local orphans=$(do_facet mds1 $LCTL get_param -n \
2316 mdd.$(facet_svc mds1).lfsck_layout |
2317 awk '/^inconsistent_orphan/ { print $2 }')
2318 [ $orphans -eq 3 ] ||
2319 error "Expect 3 found on mds1, but got: $orphans"
2321 # orphan parents should not be created
2323 for subdir in $MOUNT/.lustre/lost+found/*; do
2324 [ ! "$(ls -A $subdir)" ] || error "$subdir not empty"
2327 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2328 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait on every MDT until its lfsck_layout status reads "completed"
# ($LTIME is passed as the last argument, presumably the wait limit).
2330 for k in $(seq $MDSCOUNT); do
2331 # The LFSCK status query interval is 30 seconds. For the case
2332 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2333 # time to guarantee the status sync up.
2334 wait_update_facet mds${k} "$LCTL get_param -n \
2335 mdd.$(facet_svc mds${k}).lfsck_layout |
2336 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2337 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" too; this is a single read, not a wait.
2340 for k in $(seq $OSTCOUNT); do
2341 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2342 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2343 awk '/^status/ { print $2 }')
2344 [ "$cur_status" == "completed" ] ||
2345 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counters: 3 on mds1 and, when present, 2 on mds2.
2348 local repaired=$(do_facet mds1 $LCTL get_param -n \
2349 mdd.$(facet_svc mds1).lfsck_layout |
2350 awk '/^repaired_orphan/ { print $2 }')
2351 [ $repaired -eq 3 ] ||
2352 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2354 if [ $MDSCOUNT -ge 2 ]; then
2355 repaired=$(do_facet mds2 $LCTL get_param -n \
2356 mdd.$(facet_svc mds2).lfsck_layout |
2357 awk '/^repaired_orphan/ { print $2 }')
2358 [ $repaired -eq 2 ] ||
2359 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# The re-created objects are looked up as "<fid>-R-0" under the lost+found
# directory of the MDT and moved back to their original paths.
2362 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2363 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2364 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2366 if [ $MDSCOUNT -ge 2 ]; then
2367 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2368 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Distinct failure tag for f3 so it cannot be confused with the f1 move.
2371 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2372 error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
# Print FID and layout of each restored file.
2374 $LFS path2fid $DIR/$tdir/a1/f1
2375 $LFS getstripe $DIR/$tdir/a1/f1
2377 if [ $MDSCOUNT -ge 2 ]; then
2378 $LFS path2fid $DIR/$tdir/a2/f2
2379 $LFS getstripe $DIR/$tdir/a2/f2
2382 $LFS path2fid $DIR/$tdir/f3
2383 $LFS getstripe $DIR/$tdir/f3
2385 echo "The file size should be correct after layout LFSCK scanning"
2386 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2387 [ "$cur_size" == "$saved_size1" ] ||
2388 error "(7) Expect file1 size $saved_size1, but got $cur_size"
# f2 was written with the same "bs=1M count=2" dd as f1, hence saved_size1.
2390 if [ $MDSCOUNT -ge 2 ]; then
2391 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2392 [ "$cur_size" == "$saved_size1" ] ||
2393 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This last check is on f3, so the failure message names file3.
2396 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2397 [ "$cur_size" == "$saved_size2" ] ||
2398 error "(9) Expect file3 size $saved_size2, but got $cur_size"
# Register the test under number 18b with its description.
2400 run_test 18b "Find out orphan OST-object and repair it (2)"
# Body of the test registered below as "18c" (its function header, closing
# brace and the fi/done lines are not part of the visible text).
# Scenario, as stated by the echo messages: the MDT-object is lost and the
# OST-object's parent FID is missing; layout LFSCK must re-create the
# MDT-object with a new FID under .lustre/lost+found/MDTxxxx.
# Skipped when $FILESET is non-empty or the MDS is not newer than 2.5.55.
2403 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2404 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2405 skip "MDS older than 2.5.55, LU-3336"
2408 echo "The target MDT-object is lost, and the OST-object FID is missing."
2409 echo "The LFSCK should re-create the MDT-object with new FID under the "
2410 echo "directory .lustre/lost+found/MDTxxxx."
2413 check_mount_and_prep
2414 $LFS mkdir -i 0 $DIR/$tdir/a1
2415 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2417 echo "Inject failure, to simulate the case of missing parent FID"
# fail_loc 0x1617 is armed on all OST nodes while the file data is written
# and reset to 0 once the three files exist.
2418 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2419 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2421 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2422 $LFS getstripe $DIR/$tdir/a1/f1
# Second file only with two or more MDTs: directory created with -i 1, one
# stripe on OST index 0.
2424 if [ $MDSCOUNT -ge 2 ]; then
2425 $LFS mkdir -i 1 $DIR/$tdir/a2
2426 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2427 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2428 $LFS getstripe $DIR/$tdir/a2/f2
# Third file: f3 with a two-component PFL layout.
2431 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2432 error "(0) Fail to create PFL $DIR/$tdir/f3"
2434 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2435 $LFS getstripe $DIR/$tdir/f3
2437 cancel_lru_locks osc
2438 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2440 echo "Inject failure, to simulate the case of missing the MDT-object"
# The rm calls below run while fail_loc 0x1616 is armed on the MDS(s); the
# fail_loc is reset to 0 afterwards. (Some lines of this step are not
# visible here.)
2441 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2442 do_facet mds1 $LCTL set_param fail_loc=0x1616
2443 rm -f $DIR/$tdir/a1/f1
2445 if [ $MDSCOUNT -ge 2 ]; then
2446 do_facet mds2 $LCTL set_param fail_loc=0x1616
2447 rm -f $DIR/$tdir/a2/f2
2455 do_facet mds1 $LCTL set_param fail_loc=0
2456 if [ $MDSCOUNT -ge 2 ]; then
2457 do_facet mds2 $LCTL set_param fail_loc=0
2460 cancel_lru_locks mdc
2461 cancel_lru_locks osc
2463 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2464 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait on every MDT until its lfsck_layout status reads "completed"
# ($LTIME is passed as the last argument, presumably the wait limit).
2466 for k in $(seq $MDSCOUNT); do
2467 # The LFSCK status query interval is 30 seconds. For the case
2468 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2469 # time to guarantee the status sync up.
2470 wait_update_facet mds${k} "$LCTL get_param -n \
2471 mdd.$(facet_svc mds${k}).lfsck_layout |
2472 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2473 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" too; this is a single read, not a wait.
2476 for k in $(seq $OSTCOUNT); do
2477 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2478 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2479 awk '/^status/ { print $2 }')
2480 [ "$cur_status" == "completed" ] ||
2481 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# The branches of this conditional are not visible here; presumably they
# set $expected, which the mds1 repaired_orphan counter is compared with.
2484 if [ $MDSCOUNT -ge 2 ]; then
2490 local repaired=$(do_facet mds1 $LCTL get_param -n \
2491 mdd.$(facet_svc mds1).lfsck_layout |
2492 awk '/^repaired_orphan/ { print $2 }')
2493 [ $repaired -eq $expected ] ||
2494 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# mds2, when present, must not have repaired any orphan.
2496 if [ $MDSCOUNT -ge 2 ]; then
2497 repaired=$(do_facet mds2 $LCTL get_param -n \
2498 mdd.$(facet_svc mds2).lfsck_layout |
2499 awk '/^repaired_orphan/ { print $2 }')
2500 [ $repaired -eq 0 ] ||
2501 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2504 ls -ail $MOUNT/.lustre/lost+found/
2506 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
# The -name pattern is quoted so the shell cannot expand it against the
# current directory before find receives it; cname is kept local to the test.
2507 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2508 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-N-*")
2510 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2513 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2514 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2515 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# At least one "*-N-*" entry must exist under MDT0000.
2517 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-N-*")
2518 [ ! -z "$cname" ] ||
2519 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
# Register the test under number 18c with its description.
2521 run_test 18c "Find out orphan OST-object and repair it (3)"
# Body of the test registered below as "18d" (its function header, closing
# brace, the done lines and a few short commands are not part of the visible
# text).
# Scenario, as stated by the echo messages: a file's layout EA is corrupted
# while its real OST-object survives as an orphan; layout LFSCK must reuse
# that orphan instead of creating a new OST-object for the slot.
# Skip unless the MDS version is newer than 2.5.55.
2524 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2525 skip "MDS older than 2.5.55, LU-3336"
2528 echo "The target MDT-object layout EA is corrupted, but the right"
2529 echo "OST-object is still alive as orphan. The layout LFSCK will"
2530 echo "not create new OST-object to occupy such slot."
2533 check_mount_and_prep
# a1 gets a default layout of one stripe on OST index 0. f1/f3 hold "guard",
# f2 and the PFL file f4 hold "foo".
2535 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2536 echo "guard" > $DIR/$tdir/a1/f1
2537 echo "foo" > $DIR/$tdir/a1/f2
2539 echo "guard" > $DIR/$tdir/a1/f3
2540 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2541 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2542 echo "foo" > $DIR/$tdir/a1/f4
# Remember the sizes of f2 and f4 (field 6 of "ls -il", after the inode
# number) for the comparison made after the repair.
2544 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2545 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2546 $LFS path2fid $DIR/$tdir/a1/f1
2547 $LFS getstripe $DIR/$tdir/a1/f1
2548 $LFS path2fid $DIR/$tdir/a1/f2
2549 $LFS getstripe $DIR/$tdir/a1/f2
2550 $LFS path2fid $DIR/$tdir/a1/f3
2551 $LFS getstripe $DIR/$tdir/a1/f3
2552 $LFS path2fid $DIR/$tdir/a1/f4
2553 $LFS getstripe $DIR/$tdir/a1/f4
2554 cancel_lru_locks osc
2556 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2557 echo "to reference the same OST-object (which is f1's OST-obejct)."
2558 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2559 echo "dangling reference case, but f2's old OST-object is there."
2561 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2562 echo "to reference the same OST-object (which is f3's OST-obejct)."
2563 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2564 echo "dangling reference case, but f4's old OST-object is there."
# The chown and rm calls below run while fail_loc 0x1618 is armed on
# $SINGLEMDS; the fail_loc is reset to 0 afterwards.
2567 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2568 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2569 chown 1.1 $DIR/$tdir/a1/f2
2570 chown 1.1 $DIR/$tdir/a1/f4
2571 rm -f $DIR/$tdir/a1/f1
2572 rm -f $DIR/$tdir/a1/f3
2575 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): the stop step announced by this message is not among the
# visible lines; only the following setupall is.
2577 echo "stopall to cleanup object cache"
2580 setupall > /dev/null
2582 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2583 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
# Wait on every MDT until its lfsck_layout status reads "completed"
# ($LTIME is passed as the last argument, presumably the wait limit).
2585 for k in $(seq $MDSCOUNT); do
2586 # The LFSCK status query interval is 30 seconds. For the case
2587 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2588 # time to guarantee the status sync up.
2589 wait_update_facet mds${k} "$LCTL get_param -n \
2590 mdd.$(facet_svc mds${k}).lfsck_layout |
2591 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2592 error "(3) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" too; this is a single read, not a wait.
2595 for k in $(seq $OSTCOUNT); do
2596 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2597 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2598 awk '/^status/ { print $2 }')
2599 [ "$cur_status" == "completed" ] ||
2600 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Expected counters on $SINGLEMDS: 2 repaired orphans (f2 and f4) and 0
# repaired dangling references.
2603 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2604 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2605 awk '/^repaired_orphan/ { print $2 }')
2606 [ $repaired -eq 2 ] ||
2607 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
2609 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2610 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2611 awk '/^repaired_dangling/ { print $2 }')
2612 [ $repaired -eq 0 ] ||
2613 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
2615 echo "The file size should be correct after layout LFSCK scanning"
2616 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2617 [ "$cur_size" == "$saved_size1" ] ||
2618 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2620 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2621 [ "$cur_size" == "$saved_size2" ] ||
2622 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# Dump content, FID and layout of f2 and f4 for the log; nothing is asserted
# on this output.
2624 echo "The LFSCK should find back the original data."
2625 cat $DIR/$tdir/a1/f2
2626 $LFS path2fid $DIR/$tdir/a1/f2
2627 $LFS getstripe $DIR/$tdir/a1/f2
2628 cat $DIR/$tdir/a1/f4
2629 $LFS path2fid $DIR/$tdir/a1/f4
2630 $LFS getstripe $DIR/$tdir/a1/f4
# Register the test under number 18d with its description.
2632 run_test 18d "Find out orphan OST-object and repair it (4)"
# Body of the test registered below as "18e" (its function header, closing
# brace, the fi/done lines and a few short commands are not part of the
# visible text).
# Scenario, as stated by the echo messages: while LFSCK repairs a dangling
# reference, the newly created OST-object is modified by a client; to keep
# the new data, LFSCK must attach the old orphan OST-object to a new stub
# file instead.
# Skipped when $FILESET is non-empty or the MDS is not newer than 2.5.55.
2635 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2636 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2637 skip "MDS older than 2.5.55, LU-3336"
2640 echo "The target MDT-object layout EA slot is occpuied by some new"
2641 echo "created OST-object when repair dangling reference case. Such"
2642 echo "conflict OST-object has been modified by others. To keep the"
2643 echo "new data, the LFSCK will create a new file to refernece this"
2644 echo "old orphan OST-object."
2647 check_mount_and_prep
# a1 gets a default layout of one stripe on OST index 0. f1/f3 hold "guard",
# f2 and the PFL file f4 hold "foo".
2649 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2650 echo "guard" > $DIR/$tdir/a1/f1
2651 echo "foo" > $DIR/$tdir/a1/f2
2653 echo "guard" > $DIR/$tdir/a1/f3
2654 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2655 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2656 echo "foo" > $DIR/$tdir/a1/f4
# Remember the original sizes of f2 and f4 (field 6 of "ls -il").
2658 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2659 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2661 $LFS path2fid $DIR/$tdir/a1/f1
2662 $LFS getstripe $DIR/$tdir/a1/f1
2663 $LFS path2fid $DIR/$tdir/a1/f2
2664 $LFS getstripe $DIR/$tdir/a1/f2
2665 $LFS path2fid $DIR/$tdir/a1/f3
2666 $LFS getstripe $DIR/$tdir/a1/f3
2667 $LFS path2fid $DIR/$tdir/a1/f4
2668 $LFS getstripe $DIR/$tdir/a1/f4
2669 cancel_lru_locks osc
2671 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2672 echo "to reference the same OST-object (which is f1's OST-obejct)."
2673 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2674 echo "dangling reference case, but f2's old OST-object is there."
2676 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2677 echo "to reference the same OST-object (which is f3's OST-obejct)."
2678 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2679 echo "dangling reference case, but f4's old OST-object is there."
# The chown and rm calls below run while fail_loc 0x1618 is armed on
# $SINGLEMDS; the fail_loc is reset to 0 afterwards.
2682 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2683 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2684 chown 1.1 $DIR/$tdir/a1/f2
2685 chown 1.1 $DIR/$tdir/a1/f4
2686 rm -f $DIR/$tdir/a1/f1
2687 rm -f $DIR/$tdir/a1/f3
2690 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): the stop step announced by this message is not among the
# visible lines; only the following setupall is.
2692 echo "stopall to cleanup object cache"
2695 setupall > /dev/null
# fail_loc 0x1602 with fail_val=10 stays armed until the writes to f2/f4
# below are done, so they happen while LFSCK is in "scanning-phase2".
2697 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2698 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2700 start_full_debug_logging
2702 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2703 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2705 wait_update_facet mds1 "$LCTL get_param -n \
2706 mdd.$(facet_svc mds1).lfsck_layout |
2707 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2708 error "(3) MDS1 is not the expected 'scanning-phase2'"
2710 # to guarantee all updates are synced.
2714 echo "Write new data to f2/f4 to modify the new created OST-object."
2715 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2716 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
2718 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Wait on every MDT until its lfsck_layout status reads "completed"
# ($LTIME is passed as the last argument, presumably the wait limit).
2720 for k in $(seq $MDSCOUNT); do
2721 # The LFSCK status query interval is 30 seconds. For the case
2722 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2723 # time to guarantee the status sync up.
2724 wait_update_facet mds${k} "$LCTL get_param -n \
2725 mdd.$(facet_svc mds${k}).lfsck_layout |
2726 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2727 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" too; this is a single read, not a wait.
2730 for k in $(seq $OSTCOUNT); do
2731 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2732 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2733 awk '/^status/ { print $2 }')
2734 [ "$cur_status" == "completed" ] ||
2735 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2738 stop_full_debug_logging
# Two orphans (the old objects of f2 and f4) must have been repaired.
2740 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2741 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2742 awk '/^repaired_orphan/ { print $2 }')
2743 [ $repaired -eq 2 ] ||
2744 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2746 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2747 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2748 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Exactly two "*-C-*" stubs are expected; they are listed before failing.
2750 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2751 if [ $count -ne 2 ]; then
2752 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2753 error "(8) Expect 2 stubs under lost+found, but got $count"
2756 echo "The stub file should keep the original f2 or f4 data"
# The -name pattern is quoted so the shell cannot expand it against the
# current directory before find receives it; cname is kept local to the test.
# A stub is accepted when its size equals either saved size.
2757 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*" | head -n 1)
2758 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2759 [ "$cur_size" != "$saved_size1" ] && [ "$cur_size" != "$saved_size2" ] &&
2760 error "(9) Got unexpected $cur_size"
2763 $LFS path2fid $cname
2764 $LFS getstripe $cname
# Same check for the other stub (last entry of the find output).
2766 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*" | tail -n 1)
2767 cur_size=$(ls -il $cname | awk '{ print $6 }')
2768 [ "$cur_size" != "$saved_size1" ] && [ "$cur_size" != "$saved_size2" ] &&
2769 error "(10) Got unexpected $cur_size"
2772 $LFS path2fid $cname
2773 $LFS getstripe $cname
# Dump content, FID and layout of f2 and f4 for the log; nothing is asserted
# on this output.
2775 echo "The f2/f4 should contains new data."
2776 cat $DIR/$tdir/a1/f2
2777 $LFS path2fid $DIR/$tdir/a1/f2
2778 $LFS getstripe $DIR/$tdir/a1/f2
2779 cat $DIR/$tdir/a1/f4
2780 $LFS path2fid $DIR/$tdir/a1/f4
2781 $LFS getstripe $DIR/$tdir/a1/f4
# Register the test under number 18e with its description.
2783 run_test 18e "Find out orphan OST-object and repair it (5)"
# Body of the test registered below as "18f" (its function header, closing
# brace, the fi/done lines and a few short commands are not part of the
# visible text).
# Scenario, as stated by the echo messages: MDT-objects are lost and one OST
# fails to verify objects during first-stage scanning; LFSCK must skip the
# orphans of that OST only, and a second run must repair the rest.
# Needs at least two OSTs.
2786 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2789 echo "The target MDT-object is lost. The LFSCK should re-create the"
2790 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2791 echo "to verify some OST-object(s) during the first stage-scanning,"
2792 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2793 echo "should not be affected."
# Directories created with -i 0: a1 (one stripe, files guard and f1) and a2
# (two stripes, file f2); every file gets 2 MiB.
2796 check_mount_and_prep
2797 $LFS mkdir -i 0 $DIR/$tdir/a1
2798 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2799 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2800 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2801 $LFS mkdir -i 0 $DIR/$tdir/a2
2802 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2803 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2804 $LFS getstripe $DIR/$tdir/a1/f1
2805 $LFS getstripe $DIR/$tdir/a2/f2
# With two or more MDTs the same pair of directories (a3, a4) is created
# with -i 1.
2807 if [ $MDSCOUNT -ge 2 ]; then
2808 $LFS mkdir -i 1 $DIR/$tdir/a3
2809 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2810 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2811 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2812 $LFS mkdir -i 1 $DIR/$tdir/a4
2813 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2814 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2815 $LFS getstripe $DIR/$tdir/a3/f3
2816 $LFS getstripe $DIR/$tdir/a4/f4
2819 cancel_lru_locks osc
2821 echo "Inject failure, to simulate the case of missing the MDT-object"
# The rm calls below run while fail_loc 0x1616 is armed on the MDS(s); the
# fail_loc is reset to 0 afterwards. The guard files are not removed.
2822 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2823 do_facet mds1 $LCTL set_param fail_loc=0x1616
2824 rm -f $DIR/$tdir/a1/f1
2825 rm -f $DIR/$tdir/a2/f2
2827 if [ $MDSCOUNT -ge 2 ]; then
2828 do_facet mds2 $LCTL set_param fail_loc=0x1616
2829 rm -f $DIR/$tdir/a3/f3
2830 rm -f $DIR/$tdir/a4/f4
2836 do_facet mds1 $LCTL set_param fail_loc=0
2837 if [ $MDSCOUNT -ge 2 ]; then
2838 do_facet mds2 $LCTL set_param fail_loc=0
2841 cancel_lru_locks mdc
2842 cancel_lru_locks osc
2844 echo "Inject failure, to simulate the OST0 fail to handle"
2845 echo "MDT0 LFSCK request during the first-stage scanning."
# fail_loc 0x161c stays armed on mds1 for the whole first LFSCK run and is
# cleared once the statuses have been checked.
2846 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2847 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2849 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2850 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# First run: every MDT and ost1 are expected to end as "partial", ost2 as
# "completed".
2852 for k in $(seq $MDSCOUNT); do
2853 # The LFSCK status query interval is 30 seconds. For the case
2854 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2855 # time to guarantee the status sync up.
2856 wait_update_facet mds${k} "$LCTL get_param -n \
2857 mdd.$(facet_svc mds${k}).lfsck_layout |
2858 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2859 error "(2) MDS${k} is not the expected 'partial'"
2862 wait_update_facet ost1 "$LCTL get_param -n \
2863 obdfilter.$(facet_svc ost1).lfsck_layout |
2864 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2865 error "(3) OST1 is not the expected 'partial'"
2868 wait_update_facet ost2 "$LCTL get_param -n \
2869 obdfilter.$(facet_svc ost2).lfsck_layout |
2870 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2871 error "(4) OST2 is not the expected 'completed'"
2874 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the partial run one orphan is expected as repaired on each MDT.
2876 local repaired=$(do_facet mds1 $LCTL get_param -n \
2877 mdd.$(facet_svc mds1).lfsck_layout |
2878 awk '/^repaired_orphan/ { print $2 }')
2879 [ $repaired -eq 1 ] ||
2880 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2882 if [ $MDSCOUNT -ge 2 ]; then
2883 repaired=$(do_facet mds2 $LCTL get_param -n \
2884 mdd.$(facet_svc mds2).lfsck_layout |
2885 awk '/^repaired_orphan/ { print $2 }')
2886 [ $repaired -eq 1 ] ||
2887 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
2890 echo "Trigger layout LFSCK on all devices again to cleanup"
2891 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
# Second run, without the injected failure: every MDT must reach "completed".
2893 for k in $(seq $MDSCOUNT); do
2894 # The LFSCK status query interval is 30 seconds. For the case
2895 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2896 # time to guarantee the status sync up.
2897 wait_update_facet mds${k} "$LCTL get_param -n \
2898 mdd.$(facet_svc mds${k}).lfsck_layout |
2899 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2900 error "(8) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" too (single read, not a wait).
# cur_status is declared local so it does not leak out of the test.
2903 for k in $(seq $OSTCOUNT); do
2904 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2905 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2906 awk '/^status/ { print $2 }')
2907 [ "$cur_status" == "completed" ] ||
2908 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# After the second run two orphans are expected as repaired on each MDT.
2912 local repaired=$(do_facet mds1 $LCTL get_param -n \
2913 mdd.$(facet_svc mds1).lfsck_layout |
2914 awk '/^repaired_orphan/ { print $2 }')
2915 [ $repaired -eq 2 ] ||
2916 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2918 if [ $MDSCOUNT -ge 2 ]; then
2919 repaired=$(do_facet mds2 $LCTL get_param -n \
2920 mdd.$(facet_svc mds2).lfsck_layout |
2921 awk '/^repaired_orphan/ { print $2 }')
2922 [ $repaired -eq 2 ] ||
2923 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
# Register the test under number 18f with its description.
2926 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Body of the test registered below as "18g" (its function header, closing
# brace and the done lines are not part of the visible text).
# Scenario, as stated by the echo messages: the MDT-object is lost while its
# OI mapping remains; layout LFSCK must re-create the object without being
# misled by the stale mapping.
# Skipped when $FILESET is non-empty.
2929 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2932 echo "The target MDT-object is lost, but related OI mapping is there"
2933 echo "The LFSCK should recreate the lost MDT-object without affected"
2934 echo "by the stale OI mapping."
# a1/f1 is created with "-c -1" and written with $OSTCOUNT blocks of 1 MiB;
# its FID is saved for the lost+found lookup below.
2937 check_mount_and_prep
2938 $LFS mkdir -i 0 $DIR/$tdir/a1
2939 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2940 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
2941 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2943 $LFS getstripe $DIR/$tdir/a1/f1
2944 cancel_lru_locks osc
2946 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
# The rm below runs while fail_loc 0x162e is armed on mds1; the fail_loc is
# reset to 0 right afterwards.
2947 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2948 do_facet mds1 $LCTL set_param fail_loc=0x162e
2949 rm -f $DIR/$tdir/a1/f1
2951 do_facet mds1 $LCTL set_param fail_loc=0
2952 cancel_lru_locks mdc
2953 cancel_lru_locks osc
2955 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2956 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait on every MDT until its lfsck_layout status reads "completed"
# ($LTIME is passed as the last argument, presumably the wait limit).
2958 for k in $(seq $MDSCOUNT); do
2959 # The LFSCK status query interval is 30 seconds. For the case
2960 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2961 # time to guarantee the status sync up.
2962 wait_update_facet mds${k} "$LCTL get_param -n \
2963 mdd.$(facet_svc mds${k}).lfsck_layout |
2964 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2965 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" too; this is a single read, not a wait.
2968 for k in $(seq $OSTCOUNT); do
2969 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2970 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2971 awk '/^status/ { print $2 }')
2972 [ "$cur_status" == "completed" ] ||
2973 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# The repaired_orphan counter on mds1 must equal $OSTCOUNT.
2976 local repaired=$(do_facet mds1 $LCTL get_param -n \
2977 mdd.$(facet_svc mds1).lfsck_layout |
2978 awk '/^repaired_orphan/ { print $2 }')
2979 [ $repaired -eq $OSTCOUNT ] ||
2980 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
# The re-created object is looked up as "<fid1>-R-0" under MDT0000's
# lost+found directory and moved back to its original path.
2982 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2983 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2984 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2986 $LFS path2fid $DIR/$tdir/a1/f1
2987 $LFS getstripe $DIR/$tdir/a1/f1
# Register the test under number 18g with its description.
2989 run_test 18g "Find out orphan OST-object and repair it (7)"
# test_18h: verify that layout LFSCK repairs a PFL file whose component
# extent range was damaged on the MDT.
#
# Flow, as driven by the commands below:
#   1. Create a two-component PFL file f0 (first component ends at 2M),
#      write test-framework.sh at offset 0 and again at offset 2M, and copy
#      the result to "guard".
#   2. Set fail_loc=0x162f (OBD_FAIL_LFSCK_BAD_PFL_RANGE) on $SINGLEMDS,
#      chown f0, drop the client locks and clear fail_loc again.
#   3. A write to f0 is now expected to fail.
#   4. Run "$START_LAYOUT -r -o", wait until every MDT reports "completed",
#      check that every OST reports "completed" and require
#      repaired_orphan == 2.
#   5. f0 must still match "guard" and must be writable again.
#
# NOTE(review): the function header, the loop terminators and the closing
# brace were not visible where this block was taken from; the name is
# inferred from the "run_test 18h" registration -- confirm.
test_18h() {
	# Declared up front so that nothing leaks into the global scope; the
	# other tests in this file declare cur_status with `local` as well.
	local cur_status
	local repaired
	local k

	echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
	echo "the layout LFSCK will keep the bad PFL file(s) there without"
	echo "scanning its OST-object(s). Then in the second stage scanning,"
	echo "the OST will return related OST-object(s) to the MDT as orphan."
	echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
	echo "the 'orphan(s)' stripe information."

	check_mount_and_prep

	$LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
		error "(0) Fail to create PFL $DIR/$tdir/f0"

	# Put data into both components: offset 0 and offset 2M.
	cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
		error "(1.1) Fail to write $DIR/$tdir/f0"

	dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
		error "(1.2) Fail to write $DIR/$tdir/f0"

	# Reference copy used after the repair to prove the data survived.
	cp $DIR/$tdir/f0 $DIR/$tdir/guard

	echo "Inject failure stub to simulate bad PFL extent range"
	#define OBD_FAIL_LFSCK_BAD_PFL_RANGE	0x162f
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f

	# The chown is the operation issued while the failure stub is active.
	chown 1.1 $DIR/$tdir/f0

	cancel_lru_locks mdc
	cancel_lru_locks osc
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
		error "(2) Write to bad PFL file should fail"

	echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
	$START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(4.1) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
	done

	repaired=$($SHOW_LAYOUT |
			awk '/^repaired_orphan/ { print $2 }')
	[ "$repaired" -eq 2 ] ||
		error "(5) Fail to repair crashed PFL range: $repaired"

	echo "Data in $DIR/$tdir/f0 should not be broken"
	diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
		error "(6) Data in $DIR/$tdir/f0 is broken"

	echo "Write should succeed after LFSCK repairing the bad PFL range"
	dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
		error "(7) Write should succeed after LFSCK"
}
# Register test 18h together with its one-line description.
# NOTE(review): run_test is not defined in this chunk; presumably it comes
# from the sourced test-framework.sh -- confirm.
3062 run_test 18h "LFSCK can repair crashed PFL extent range"
# Adjust the local "debug" parameter through lctl and discard the output.
# NOTE(review): the leading "-" presumably removes the "cache" flag from the
# debug mask for the tests that follow -- confirm.
3064 $LCTL set_param debug=-cache > /dev/null
# test_19a: with parent-FID verification enabled on OST0000 and
# fail_loc=0x1619 (OBD_FAIL_LFSCK_INVALID_PFID) set on the client, reads of a
# plain file (a0) and of a PFL file (a1) are expected to be refused.
#
# NOTE(review): the function header and closing brace were not visible where
# this block was taken from; the name is inferred from the "run_test 19a"
# registration -- confirm.
test_19a() {
	if ! (( $MDS1_VERSION > $(version_code 2.5.55) )); then
		skip "MDS older than 2.5.55, LU-3951"
	fi

	check_mount_and_prep
	$LFS setstripe -c 1 -i 0 $DIR/$tdir

	# Files are created while verification is switched off.
	do_nodes $(comma_list $(osts_nodes)) \
		$LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0

	echo "foo1" > $DIR/$tdir/a0
	if ! $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1
	then
		error "(0) Fail to create PFL $DIR/$tdir/a1"
	fi
	echo "foo2" > $DIR/$tdir/a1
	echo "guard" > $DIR/$tdir/a2
	cancel_lru_locks osc

	echo "Inject failure, then client will offer wrong parent FID when read"
	do_nodes $(comma_list $(osts_nodes)) \
		$LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1

	#define OBD_FAIL_LFSCK_INVALID_PFID	0x1619
	$LCTL set_param fail_loc=0x1619

	echo "Read RPC with wrong parent FID should be denied"
	if cat $DIR/$tdir/a0; then
		error "(3.1) Read a0 should be denied!"
	fi
	if cat $DIR/$tdir/a1; then
		error "(3.2) Read a1 should be denied!"
	fi

	# Always clear the client-side failure injection before returning.
	$LCTL set_param fail_loc=0
}
run_test 19a "OST-object inconsistency self detect"
# test_19b: OST-objects created under fail_loc=0x1611
# (OBD_FAIL_LFSCK_UNMATCHED_PAIR1) are expected to be repaired by the OST
# itself once parent-FID verification is enabled and the files are read.
#
# Checks performed:
#   - with lfsck_verify_pfid set to 0 the "repaired" counter must be 0;
#   - after enabling verification and reading f0 (plain) and f1 (PFL) the
#     counter must be 2, one per file.
#
# NOTE(review): the function header and closing brace were not visible where
# this block was taken from; the name is inferred from the "run_test 19b"
# registration -- confirm.
test_19b() {
	local repaired

	(( $MDS1_VERSION > $(version_code 2.5.55) )) ||
		skip "MDS older than 2.5.55, LU-3951"

	check_mount_and_prep
	$LFS setstripe -c 1 -i 0 $DIR/$tdir

	echo "Inject failure stub to make the OST-object to back point to"
	echo "non-exist MDT-object"

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0

	#define OBD_FAIL_LFSCK_UNMATCHED_PAIR1	0x1611
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
	echo "foo1" > $DIR/$tdir/f0
	$LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
		error "(0) Fail to create PFL $DIR/$tdir/f1"
	echo "foo2" > $DIR/$tdir/f1
	cancel_lru_locks osc
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0

	do_facet ost1 $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
	echo "Nothing should be fixed since self detect and repair is disabled"
	repaired=$(do_facet ost1 $LCTL get_param -n \
			obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
			awk '/^repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(1) Expected 0 repaired, but got $repaired"

	echo "Read RPC with right parent FID should be accepted,"
	echo "and cause parent FID on OST to be fixed"

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1

	cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
	cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"

	repaired=$(do_facet ost1 $LCTL get_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
		awk '/^repaired/ { print $2 }')
	# Two files were read, so two repairs are required; the message now
	# states the same number the condition checks.
	[ "$repaired" -eq 2 ] ||
		error "(3) Expected 2 repaired, but got $repaired"
}
# Register test 19b together with its one-line description.
# NOTE(review): run_test is not defined in this chunk; presumably it comes
# from the sourced test-framework.sh -- confirm.
3143 run_test 19b "OST-object inconsistency self repair"
# Strings the tests below compare against the output of "lfs getstripe -L":
# PATTERN_WITH_HOLE is expected for files rebuilt with a missing stripe,
# PATTERN_WITHOUT_HOLE for files whose stripes are all present.
3145 PATTERN_WITH_HOLE="40000001"
3146 PATTERN_WITHOUT_HOLE="raid0"
# Body of test 20a (registered by the run_test line at the end): orphan
# handling for plain striped files whose MDT-object -- and, for f1..f3, one
# OST-object -- has been lost. After the layout LFSCK run the files are
# looked up as ${fid}-R-0 under .lustre/lost+found/MDT0000 and checked for
# layout pattern, stripe count, size and read/write behaviour.
# NOTE(review): this listing shows no function header, no else/fi/done lines
# and no assignment to $bcount, although the code below reads $bcount and
# contains paired if-branches -- confirm against the complete file before
# editing any code here.
3149 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3150 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
3151 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3152 skip "MDS older than 2.5.55, LU-4887"
3155 echo "The target MDT-object and some of its OST-object are lost."
3156 echo "The LFSCK should find out the left OST-objects and re-create"
3157 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3158 echo "with the partial OST-objects (LOV EA hole)."
3160 echo "New client can access the file with LOV EA hole via normal"
3161 echo "system tools or commands without crash the system."
3163 echo "For old client, even though it cannot access the file with"
3164 echo "LOV EA hole, it should not cause the system crash."
# Create a1 on MDT index 0 and set its default layout: 3 stripes when there
# are more than 2 OSTs, 2 stripes in the other branch (1M stripe size,
# starting at OST index 0).
3167 check_mount_and_prep
3168 $LFS mkdir -i 0 $DIR/$tdir/a1
3169 if [ $OSTCOUNT -gt 2 ]; then
3170 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
3173 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
3177 # 256 blocks on the stripe0.
3178 # 1 block on the stripe1 for 2 OSTs case.
3179 # 256 blocks on the stripe1 for other cases.
3180 # 1 block on the stripe2 if OSTs > 2
3181 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
3182 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
3183 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# Remember the FIDs: the lost+found entries checked later are named after
# them.
3185 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
3186 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
3187 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
3190 $LFS getstripe $DIR/$tdir/a1/f0
3192 $LFS getstripe $DIR/$tdir/a1/f1
3194 $LFS getstripe $DIR/$tdir/a1/f2
# A fourth file is only used when a third stripe exists.
# NOTE(review): fid3 is assigned without `local`, unlike fid0..fid2.
3196 if [ $OSTCOUNT -gt 2 ]; then
3197 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
3198 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
3200 $LFS getstripe $DIR/$tdir/a1/f3
3203 cancel_lru_locks osc
# Remove each file while a different fail_loc/fail_val is active on mds1;
# the echo lines state which loss each removal is meant to simulate.
3205 echo "Inject failure..."
3206 echo "To simulate f0 lost MDT-object"
3207 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3208 do_facet mds1 $LCTL set_param fail_loc=0x1616
3209 rm -f $DIR/$tdir/a1/f0
3211 echo "To simulate f1 lost MDT-object and OST-object0"
3212 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3213 do_facet mds1 $LCTL set_param fail_loc=0x161a
3214 rm -f $DIR/$tdir/a1/f1
3216 echo "To simulate f2 lost MDT-object and OST-object1"
3217 do_facet mds1 $LCTL set_param fail_val=1
3218 rm -f $DIR/$tdir/a1/f2
3220 if [ $OSTCOUNT -gt 2 ]; then
3221 echo "To simulate f3 lost MDT-object and OST-object2"
3222 do_facet mds1 $LCTL set_param fail_val=2
3223 rm -f $DIR/$tdir/a1/f3
# The client is unmounted for the LFSCK run and mounted again before the
# lost+found checks below.
3226 umount_client $MOUNT
3229 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
3231 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3232 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3234 for k in $(seq $MDSCOUNT); do
3235 # The LFSCK status query interval is 30 seconds. For the case
3236 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3237 # time to guarantee the status sync up.
3238 wait_update_facet mds${k} "$LCTL get_param -n \
3239 mdd.$(facet_svc mds${k}).lfsck_layout |
3240 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3241 error "(2) MDS${k} is not the expected 'completed'"
# OSTs are queried once each, without waiting.
3244 for k in $(seq $OSTCOUNT); do
3245 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3246 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3247 awk '/^status/ { print $2 }')
3248 [ "$cur_status" == "completed" ] ||
3249 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan on mds1: 9 with more than 2 OSTs, 4 in the other
# branch.
3252 local repaired=$(do_facet mds1 $LCTL get_param -n \
3253 mdd.$(facet_svc mds1).lfsck_layout |
3254 awk '/^repaired_orphan/ { print $2 }')
3255 if [ $OSTCOUNT -gt 2 ]; then
3256 [ $repaired -eq 9 ] ||
3257 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3259 [ $repaired -eq 4 ] ||
3260 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
3263 mount_client $MOUNT || error "(5.0) Fail to start client!"
# NOTE(review): assigned without `local` and not read anywhere in the
# visible part of this test.
3265 LOV_PATTERN_F_HOLE=0x40000000
3268 # ${fid0}-R-0 is the old f0
# f0 lost only its MDT-object: no hole flag, full stripe count, full size,
# and every ordinary operation must succeed.
3270 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3271 echo "Check $name, which is the old f0"
3273 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
3275 local pattern=$($LFS getstripe -L $name)
3276 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3277 error "(5.2) NOT expect pattern flag hole, but got $pattern"
3279 local stripes=$($LFS getstripe -c $name)
3280 if [ $OSTCOUNT -gt 2 ]; then
3281 [ $stripes -eq 3 ] ||
3282 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3284 [ $stripes -eq 2 ] ||
3285 error "(5.3.2) expect the stripe count is 2, but got $stripes"
3288 local size=$(stat $name | awk '/Size:/ { print $2 }')
3289 [ $size -eq $((4096 * $bcount)) ] ||
3290 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
3292 cat $name > /dev/null || error "(5.5) cannot read $name"
3294 echo "dummy" >> $name || error "(5.6) cannot write $name"
3296 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3298 touch $name || error "(5.8) cannot touch $name"
3300 rm -f $name || error "(5.9) cannot unlink $name"
3303 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3305 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3306 if [ $OSTCOUNT -gt 2 ]; then
3307 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3309 echo "Check $name, it contains the old f1's stripe1"
3312 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3314 pattern=$($LFS getstripe -L $name)
3315 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3316 error "(6.2) expect pattern flag hole, but got $pattern"
3318 stripes=$($LFS getstripe -c $name)
3319 if [ $OSTCOUNT -gt 2 ]; then
3320 [ $stripes -eq 3 ] ||
3321 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3323 [ $stripes -eq 2 ] ||
3324 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3327 size=$(stat $name | awk '/Size:/ { print $2 }')
3328 [ $size -eq $((4096 * $bcount)) ] ||
3329 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
# A plain read must fail; dd with conv=sync,noerror keeps going, and the
# number of "Input/output error" lines it prints is counted.
3331 cat $name > /dev/null && error "(6.5) normal read $name should fail"
3333 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3334 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3337 [ $failures -eq 256 ] ||
3338 error "(6.6) expect 256 IO failures, but get $failures"
3340 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3341 [ $size -eq $((4096 * $bcount)) ] ||
3342 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Block 0 is expected to be unwritable, block 300 writable.
3344 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3345 error "(6.8) write to the LOV EA hole should fail"
3347 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3348 error "(6.9) write to normal stripe should NOT fail"
3350 echo "foo" >> $name && error "(6.10) append write $name should fail"
3352 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3354 touch $name || error "(6.12) cannot touch $name"
3356 rm -f $name || error "(6.13) cannot unlink $name"
3359 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3361 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3362 if [ $OSTCOUNT -gt 2 ]; then
3363 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3365 echo "Check $name, it contains the old f2's stripe0"
3368 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3370 pattern=$($LFS getstripe -L $name)
3371 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3372 error "(7.2) expect pattern flag hole, but got $pattern"
3374 stripes=$($LFS getstripe -c $name)
3375 size=$(stat $name | awk '/Size:/ { print $2 }')
# Checks for more than 2 OSTs (messages 7.3.1 .. 7.10.1): here block 300 is
# expected to be unwritable and block 0 writable.
3376 if [ $OSTCOUNT -gt 2 ]; then
3377 [ $stripes -eq 3 ] ||
3378 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3380 [ $size -eq $((4096 * $bcount)) ] ||
3381 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3383 cat $name > /dev/null &&
3384 error "(7.5.1) normal read $name should fail"
3386 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3387 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3389 [ $failures -eq 256 ] ||
3390 error "(7.6) expect 256 IO failures, but get $failures"
3392 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3393 [ $size -eq $((4096 * $bcount)) ] ||
3394 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
3396 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3397 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3399 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3400 error "(7.8.1) write to normal stripe should NOT fail"
3402 echo "foo" >> $name &&
3403 error "(7.8.3) append write $name should fail"
3405 chown $RUNAS_ID:$RUNAS_GID $name ||
3406 error "(7.9.1) cannot chown on $name"
3408 touch $name || error "(7.10.1) cannot touch $name"
# Checks for the 2-OST case (messages 7.3.2 .. 7.10.2): the file size is
# expected to be 256 blocks only, and block 256 must be unwritable.
3410 [ $stripes -eq 2 ] ||
3411 error "(7.3.2) expect the stripe count is 2, but got $stripes"
3414 [ $size -eq $((4096 * (256 + 0))) ] ||
3415 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3417 cat $name > /dev/null &&
3418 error "(7.5.2) normal read $name should fail"
3420 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3421 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3422 [ $failures -eq 256 ] ||
3423 error "(7.6.2) expect 256 IO failures, but get $failures"
3426 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3427 [ $size -eq $((4096 * $bcount)) ] ||
3428 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
3430 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3431 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3433 chown $RUNAS_ID:$RUNAS_GID $name ||
3434 error "(7.9.2) cannot chown on $name"
3436 touch $name || error "(7.10.2) cannot touch $name"
3439 rm -f $name || error "(7.11) cannot unlink $name"
# f3 only exists with more than 2 OSTs, so the test ends here otherwise.
3441 [ $OSTCOUNT -le 2 ] && return
3444 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
3446 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3447 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3449 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3451 pattern=$($LFS getstripe -L $name)
3452 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3453 error "(8.2) expect pattern flag hole, but got $pattern"
3455 stripes=$($LFS getstripe -c $name)
3456 [ $stripes -eq 3 ] ||
3457 error "(8.3) expect the stripe count is 3, but got $stripes"
3459 size=$(stat $name | awk '/Size:/ { print $2 }')
# The expected size is 512 blocks here, i.e. without a contribution from the
# third stripe.
3461 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3462 error "(8.4) expect the size $((4096 * 512)), but got $size"
3464 cat $name > /dev/null &&
3465 error "(8.5) normal read $name should fail"
3467 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3468 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3470 [ $failures -eq 256 ] ||
3471 error "(8.6) expect 256 IO failures, but get $failures"
3474 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3475 [ $size -eq $((4096 * $bcount)) ] ||
3476 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
3478 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3479 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3481 chown $RUNAS_ID:$RUNAS_GID $name ||
3482 error "(8.9) cannot chown on $name"
3484 touch $name || error "(8.10) cannot touch $name"
3486 rm -f $name || error "(8.11) cannot unlink $name"
# Register test 20a together with its one-line description.
3488 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# Body of test 20b (registered by the run_test line at the end): the PFL
# variant of the orphan test. Three two-component PFL files (component 1
# ends at 2M, 2 stripes per component) are written, failure injections are
# armed on $SINGLEMDS, a layout LFSCK run is started, and the resulting
# ${fid}-R-0 entries under .lustre/lost+found/MDT0000 are checked per
# component (-I1 / -I2).
# NOTE(review): this listing shows no function header and no done/fi lines,
# and no command between the fail_loc/fail_val settings below that would
# trigger the injected failure -- confirm against the complete file before
# editing any code here.
3491 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3492 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
3493 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3494 skip "MDS older than 2.5.55, LU-4887"
3497 echo "The target MDT-object and some of its OST-object are lost."
3498 echo "The LFSCK should find out the left OST-objects and re-create"
3499 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3500 echo "with the partial OST-objects (LOV EA hole)."
3502 echo "New client can access the file with LOV EA hole via normal"
3503 echo "system tools or commands without crash the system - PFL case."
3506 check_mount_and_prep
3508 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3509 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3510 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3511 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3512 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3513 error "(2) Fail to create PFL file $DIR/$tdir/f2"
# 769 blocks of 4096 bytes: 3M plus one block, so the data reaches into the
# second component.
3515 local bcount=$((256 * 3 + 1))
3517 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3518 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3519 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
# Remember the FIDs: the lost+found entries checked later are named after
# them.
3521 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3522 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3523 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3526 $LFS getstripe $DIR/$tdir/f0
3528 $LFS getstripe $DIR/$tdir/f1
3530 $LFS getstripe $DIR/$tdir/f2
3532 cancel_lru_locks mdc
3533 cancel_lru_locks osc
3535 echo "Inject failure..."
3536 echo "To simulate f0 lost MDT-object"
3537 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3538 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3541 echo "To simulate the case of f1 lost MDT-object and "
3542 echo "the first OST-object in each PFL component"
3543 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3544 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3547 echo "To simulate the case of f2 lost MDT-object and "
3548 echo "the second OST-object in each PFL component"
3549 do_facet $SINGLEMDS $LCTL set_param fail_val=1
3554 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
3556 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3557 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3559 for k in $(seq $MDSCOUNT); do
3560 # The LFSCK status query interval is 30 seconds. For the case
3561 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3562 # time to guarantee the status sync up.
3563 wait_update_facet mds${k} "$LCTL get_param -n \
3564 mdd.$(facet_svc mds${k}).lfsck_layout |
3565 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3566 error "(4) MDS${k} is not the expected 'completed'"
# OSTs are queried once each, without waiting.
3569 for k in $(seq $OSTCOUNT); do
3570 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3571 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3572 awk '/^status/ { print $2 }')
3573 [ "$cur_status" == "completed" ] ||
3574 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
3577 local repaired=$(do_facet mds1 $LCTL get_param -n \
3578 mdd.$(facet_svc mds1).lfsck_layout |
3579 awk '/^repaired_orphan/ { print $2 }')
3580 [ $repaired -eq 8 ] ||
3581 error "(6) Expect 8 fixed on mds1, but got: $repaired"
3584 # ${fid0}-R-0 is the old f0
# f0: neither component may carry the hole flag; both have 2 stripes; the
# extents must be [0, 2097152) and [2097152, EOF); every ordinary operation
# must succeed.
3586 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3587 echo "Check $name, which is the old f0"
3589 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3591 local pattern=$($LFS getstripe -L -I1 $name)
3592 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3593 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3595 pattern=$($LFS getstripe -L -I2 $name)
3596 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3597 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3599 local stripes=$($LFS getstripe -c -I1 $name)
3600 [ $stripes -eq 2 ] ||
3601 error "(7.3.1) expect 2 stripes, but got $stripes"
3603 stripes=$($LFS getstripe -c -I2 $name)
3604 [ $stripes -eq 2 ] ||
3605 error "(7.3.2) expect 2 stripes, but got $stripes"
3607 local e_start=$($LFS getstripe -I1 $name |
3608 awk '/lcme_extent.e_start:/ { print $2 }')
3609 [ $e_start -eq 0 ] ||
3610 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3612 local e_end=$($LFS getstripe -I1 $name |
3613 awk '/lcme_extent.e_end:/ { print $2 }')
3614 [ $e_end -eq 2097152 ] ||
3615 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3617 e_start=$($LFS getstripe -I2 $name |
3618 awk '/lcme_extent.e_start:/ { print $2 }')
3619 [ $e_start -eq 2097152 ] ||
3620 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
3622 e_end=$($LFS getstripe -I2 $name |
3623 awk '/lcme_extent.e_end:/ { print $2 }')
3624 [ "$e_end" = "EOF" ] ||
3625 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3627 local size=$(stat $name | awk '/Size:/ { print $2 }')
3628 [ $size -eq $((4096 * $bcount)) ] ||
3629 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
3631 cat $name > /dev/null || error "(7.7) cannot read $name"
3633 echo "dummy" >> $name || error "(7.8) cannot write $name"
3635 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3637 touch $name || error "(7.10) cannot touch $name"
3639 rm -f $name || error "(7.11) cannot unlink $name"
3642 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3644 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3645 echo "Check $name, it contains f1's second OST-object in each COMP"
3647 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3649 pattern=$($LFS getstripe -L -I1 $name)
3650 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3651 error "(8.2.1) expect pattern flag hole, but got $pattern"
3653 pattern=$($LFS getstripe -L -I2 $name)
3654 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3655 error "(8.2.2) expect pattern flag hole, but got $pattern"
# NOTE(review): this -I1 check reports itself as "(8.3.2)", the same label as
# the -I2 check that follows; "(8.3.1)" was probably intended.
3657 stripes=$($LFS getstripe -c -I1 $name)
3658 [ $stripes -eq 2 ] ||
3659 error "(8.3.2) expect 2 stripes, but got $stripes"
3661 stripes=$($LFS getstripe -c -I2 $name)
3662 [ $stripes -eq 2 ] ||
3663 error "(8.3.2) expect 2 stripes, but got $stripes"
3665 e_start=$($LFS getstripe -I1 $name |
3666 awk '/lcme_extent.e_start:/ { print $2 }')
3667 [ $e_start -eq 0 ] ||
3668 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3670 e_end=$($LFS getstripe -I1 $name |
3671 awk '/lcme_extent.e_end:/ { print $2 }')
3672 [ $e_end -eq 2097152 ] ||
3673 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3675 e_start=$($LFS getstripe -I2 $name |
3676 awk '/lcme_extent.e_start:/ { print $2 }')
3677 [ $e_start -eq 2097152 ] ||
3678 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3680 e_end=$($LFS getstripe -I2 $name |
3681 awk '/lcme_extent.e_end:/ { print $2 }')
3682 [ "$e_end" = "EOF" ] ||
3683 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3685 size=$(stat $name | awk '/Size:/ { print $2 }')
3686 [ $size -eq $((4096 * $bcount)) ] ||
3687 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
# A plain read must fail; dd with conv=sync,noerror keeps going, and the
# number of "Input/output error" lines it prints is counted.
3689 cat $name > /dev/null && error "(8.7) normal read $name should fail"
3691 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3692 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3694 # The first stripe in each COMP was lost
3695 [ $failures -eq 512 ] ||
3696 error "(8.8) expect 512 IO failures, but get $failures"
3698 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3699 [ $size -eq $((4096 * $bcount)) ] ||
3700 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
# Block 0 is expected to be unwritable, block 300 writable.
3702 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3703 error "(8.10) write to the LOV EA hole should fail"
3705 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3706 error "(8.11) write to normal stripe should NOT fail"
3708 echo "foo" >> $name && error "(8.12) append write $name should fail"
3710 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3712 touch $name || error "(8.14) cannot touch $name"
3714 rm -f $name || error "(8.15) cannot unlink $name"
3717 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3719 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3720 echo "Check $name, it contains f2's first stripe in each COMP"
3722 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3724 pattern=$($LFS getstripe -L -I1 $name)
3725 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3726 error "(9.2.1) expect pattern flag hole, but got $pattern"
3728 pattern=$($LFS getstripe -L -I2 $name)
3729 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3730 error "(9.2.2) expect pattern flag hole, but got $pattern"
# NOTE(review): this -I1 check reports itself as "(9.3.2)", the same label as
# the -I2 check that follows; "(9.3.1)" was probably intended.
3732 stripes=$($LFS getstripe -c -I1 $name)
3733 [ $stripes -eq 2 ] ||
3734 error "(9.3.2) expect 2 stripes, but got $stripes"
3736 stripes=$($LFS getstripe -c -I2 $name)
3737 [ $stripes -eq 2 ] ||
3738 error "(9.3.2) expect 2 stripes, but got $stripes"
3740 e_start=$($LFS getstripe -I1 $name |
3741 awk '/lcme_extent.e_start:/ { print $2 }')
3742 [ $e_start -eq 0 ] ||
3743 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3745 e_end=$($LFS getstripe -I1 $name |
3746 awk '/lcme_extent.e_end:/ { print $2 }')
3747 [ $e_end -eq 2097152 ] ||
3748 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3750 e_start=$($LFS getstripe -I2 $name |
3751 awk '/lcme_extent.e_start:/ { print $2 }')
3752 [ $e_start -eq 2097152 ] ||
3753 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3755 e_end=$($LFS getstripe -I2 $name |
3756 awk '/lcme_extent.e_end:/ { print $2 }')
3757 [ "$e_end" = "EOF" ] ||
3758 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3760 size=$(stat $name | awk '/Size:/ { print $2 }')
3761 # The second stripe in COMP was lost, so we do not know there
3762 # have ever been some data before. 'stat' will regard it as
3763 # no data on the lost stripe.
3765 [ $size -eq $((4096 * $bcount)) ] ||
3766 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3768 cat $name > /dev/null &&
3769 error "(9.7) normal read $name should fail"
# NOTE(review): the condition requires 512 failures while the message text
# says 256.
3771 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3772 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3773 [ $failures -eq 512 ] ||
3774 error "(9.8) expect 256 IO failures, but get $failures"
3776 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3777 # The second stripe in COMP was lost, so we do not know there
3778 # have ever been some data before. Since 'dd' skip failure,
3779 # it will regard the lost stripe contains data.
3781 [ $size -eq $((4096 * $bcount)) ] ||
3782 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
# For f2 the roles are swapped: block 300 is expected to be unwritable,
# block 0 writable.
3784 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3785 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3787 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3788 error "(9.11) write to normal stripe should NOT fail"
3790 echo "foo" >> $name &&
3791 error "(9.12) append write $name should fail"
3793 chown $RUNAS_ID:$RUNAS_GID $name ||
3794 error "(9.13) cannot chown on $name"
3796 touch $name || error "(9.14) cannot touch $name"
# NOTE(review): labelled "(7.15)" although it belongs to the "(9.x)" series.
3798 rm -f $name || error "(7.15) cannot unlink $name"
# Register test 20b together with its one-line description.
3800 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# test_21: start LFSCK on MDT0000 without naming a component, check that
# both the namespace and the layout component report "scanning-phase1",
# then stop LFSCK again.
#
# NOTE(review): the function header and closing brace were not visible where
# this block was taken from; the name is inferred from the "run_test 21"
# registration -- confirm.
test_21() {
	local phase

	if ! (( $MDS1_VERSION > $(version_code 2.5.59) )); then
		skip "MDS older than 2.5.59, LU-4887"
	fi

	check_mount_and_prep
	if ! createmany -o $DIR/$tdir/f 100; then
		error "(0) Fail to create 100 files"
	fi

	echo "Start all LFSCK components by default (-s 1)"
	if ! do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r
	then
		error "Fail to start LFSCK"
	fi

	echo "namespace LFSCK should be in 'scanning-phase1' status"
	phase=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
	if [[ "$phase" != "scanning-phase1" ]]; then
		error "Expect namespace 'scanning-phase1', but got '$phase'"
	fi

	echo "layout LFSCK should be in 'scanning-phase1' status"
	phase=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
	if [[ "$phase" != "scanning-phase1" ]]; then
		error "Expect layout 'scanning-phase1', but got '$phase'"
	fi

	echo "Stop all LFSCK components by default"
	if ! do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000; then
		error "Fail to stop LFSCK"
	fi
}
run_test 21 "run all LFSCK components by default"
# test_22a: namespace LFSCK must repair a directory whose ".." entry points
# at a parent that no longer exists.
#
# Flow: create "guard" and "foo" on MDT1, create foo/dummy on MDT0 while
# fail_loc=0x161e (OBD_FAIL_LFSCK_BAD_PARENT) is set on $SINGLEMDS, remove
# "guard", run "$START_NAMESPACE -A -r", then require
# unmatched_pairs_repaired == 1 and a working "ls" on foo/dummy.
#
# NOTE(review): the function header and closing brace were not visible where
# this block was taken from; the name is inferred from the "run_test 22a"
# registration -- confirm.
test_22a() {
	local repaired

	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5511"

	# The quotes around .. are escaped so that they are printed; written
	# as "".." the shell drops them.
	echo "The parent_A references the child directory via some name entry,"
	echo "but the child directory back references another parent_B via its"
	echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
	echo "LFSCK will repair the child directory's \"..\" name entry."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "The dummy's dotdot name entry references the guard."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Removing guard leaves dummy's ".." pointing at a missing parent.
	rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	repaired=$($SHOW_NAMESPACE |
			awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "'ls' should success after namespace LFSCK repairing"
	ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
		error "(8) ls should success."
}
# Register test 22a together with its one-line description.
# NOTE(review): run_test is not defined in this chunk; presumably it comes
# from the sourced test-framework.sh -- confirm.
3871 run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b: namespace LFSCK must repair a directory whose ".." entry and
# linkEA disagree with the parent that actually references it.
#
# Flow: create "guard" and "foo" on MDT1, create foo/dummy on MDT0 while
# fail_loc=0x161e (OBD_FAIL_LFSCK_BAD_PARENT) is set on $SINGLEMDS, check
# that fid2path does not resolve dummy's FID to its real path, run
# "$START_NAMESPACE -A -r", then require unmatched_pairs_repaired == 1 and
# a fid2path result equal to the real path.
#
# NOTE(review): the function header and closing brace were not visible where
# this block was taken from; the name is inferred from the "run_test 22b"
# registration -- confirm.
test_22b() {
	local dummyfid
	local dummyname
	local repaired

	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5511"

	# The quotes around .. are escaped so that they are printed; written
	# as "".." the shell drops them.
	echo "The parent_A references the child directory via the name entry_B,"
	echo "but the child directory back references another parent_C via its"
	echo "\"..\" name entry. The parent_C exists, but there is no the name"
	echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
	echo "the child directory's \"..\" name entry and its linkEA."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "and bad linkEA. The dummy's dotdot name entry references the"
	echo "guard. The dummy's linkEA references n non-exist name entry."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
	echo "fid2path should NOT work on the dummy's FID $dummyfid"
	dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
		error "(4) fid2path works unexpectedly."

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	repaired=$($SHOW_NAMESPACE |
			awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
	dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
		error "(8) fid2path does not work"
}
3922 run_test 22b "LFSCK can repair unmatched pairs (2)"
# test_23a: a name entry whose MDT-object no longer exists (dangling entry).
# The first LFSCK run (-A -r) counts the entry as repaired but 'ls' still
# fails; the second run adds -C and 'ls' is then expected to succeed.
# Skipped with fewer than 2 MDTs or when the MDS is not newer than 2.6.50.
# NOTE(review): the function's opening and closing lines are not part of this
# listing; only body statements and the run_test registration are visible.
3925 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3926 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3927 skip "MDS older than 2.6.50, LU-5512"
3930 echo "The name entry is there, but the MDT-object for such name "
3931 echo "entry does not exist. The namespace LFSCK should find out "
3932 echo "and repair the inconsistency as required."
3935 check_mount_and_prep
# Parent d0 lives on MDT0, child d1 on MDT1.
3937 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3938 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3940 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3941 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
# The fail_loc is set on mds2 only for the duration of the rmdir.
3942 do_facet mds2 $LCTL set_param fail_loc=0x1620
3943 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3944 do_facet mds2 $LCTL set_param fail_loc=0
3946 echo "'ls' should fail because of dangling name entry"
3947 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3949 echo "Trigger namespace LFSCK to find out dangling name entry"
3950 $START_NAMESPACE -A -r ||
3951 error "(5) Fail to start LFSCK for namespace"
# Helper is defined elsewhere; the trailing number is presumably the error
# tag it reports on failure -- confirm.
3953 wait_all_targets_blocked namespace completed 6
3955 local repaired=$($SHOW_NAMESPACE |
3956 awk '/^dangling_repaired/ { print $2 }')
3957 [ $repaired -eq 1 ] ||
3958 error "(7) Fail to repair dangling name entry: $repaired"
3960 echo "'ls' should fail because not re-create MDT-object by default"
3961 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
# Second pass: same start options plus -C.
3963 echo "Trigger namespace LFSCK again to repair dangling name entry"
3964 $START_NAMESPACE -A -r -C ||
3965 error "(9) Fail to start LFSCK for namespace"
3967 wait_all_targets_blocked namespace completed 10
3969 repaired=$($SHOW_NAMESPACE |
3970 awk '/^dangling_repaired/ { print $2 }')
3971 [ $repaired -eq 1 ] ||
3972 error "(11) Fail to repair dangling name entry: $repaired"
3974 echo "'ls' should success after namespace LFSCK repairing"
3975 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3977 run_test 23a "LFSCK can repair dangling name entry (1)"
# test_23b: dangling name entry on a hard link. A hard link "foo" to f0 is
# created under a fail_loc that takes f1's OID as fail_val, then f1 is
# removed. After LFSCK (-r -C) the test expects dangling_repaired == 1,
# multiple_linked_repaired == 1, and foo to read back "dummy" (f0's content).
# Skipped when the MDS is not newer than 2.6.50.
# NOTE(review): the function's opening/closing lines and other short lines
# (e.g. the 'fi' closing the 'if' below) are not part of this listing.
3980 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3981 skip "MDS older than 2.6.50, LU-5512"
3984 echo "The objectA has multiple hard links, one of them corresponding"
3985 echo "to the name entry_B. But there is something wrong for the name"
3986 echo "entry_B and cause entry_B to references non-exist object_C."
3987 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3988 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3989 echo "comes to the second-stage scanning, it will find that the"
3990 echo "former re-creating object_C is not proper, and will try to"
3991 echo "replace the object_C with the real object_A."
3994 check_mount_and_prep
# The bare path2fid calls only print the FID into the test log.
3996 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3997 $LFS path2fid $DIR/$tdir/d0
# Filler files t0..t9; they are unlinked again before f1 is removed (see the
# LU-7429 comment further down).
3999 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
4001 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
4002 $LFS path2fid $DIR/$tdir/d0/f0
4004 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
4005 $LFS path2fid $DIR/$tdir/d0/f1
# First ':'-separated field of each FID, compared as the FID sequence.
4007 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
4008 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
4010 if [ "$SEQ0" != "$SEQ1" ]; then
4011 # To guarantee that the f0 and f1 are in the same FID seq
4012 rm -f $DIR/$tdir/d0/f0 ||
4013 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
4014 echo "dummy" > $DIR/$tdir/d0/f0 ||
4015 error "(3.2) Fail to touch on MDT0"
4016 $LFS path2fid $DIR/$tdir/d0/f0
# Second ':'-separated field of f1's FID, normalised to decimal by printf %d
# before being passed as fail_val.
4019 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
4020 OID=$(printf %d $OID)
4022 echo "Inject failure stub on MDT0 to simulate dangling name entry"
4023 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
4024 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
4025 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4026 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4028 # If there is creation after the dangling injection, it may re-use
4029 # the just released local object (inode) that is referenced by the
4030 # dangling name entry. It will fail the dangling injection.
4031 # So before deleting the target object for the dangling name entry,
4032 # remove some other objects to avoid the target object being reused
4033 # by some potential creations. LU-7429
4034 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4036 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
4038 echo "'ls' should fail because of dangling name entry"
4039 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4040 error "(6) ls should fail."
4042 echo "Trigger namespace LFSCK to find out dangling name entry"
4043 $START_NAMESPACE -r -C ||
4044 error "(7) Fail to start LFSCK for namespace"
# Poll the lfsck_namespace status on the MDS until it reads "completed";
# 32 is passed to the helper as its limit (presumably seconds -- confirm).
4046 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4047 mdd.${MDT_DEV}.lfsck_namespace |
4048 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4050 error "(8) unexpected status"
4053 local repaired=$($SHOW_NAMESPACE |
4054 awk '/^dangling_repaired/ { print $2 }')
4055 [ $repaired -eq 1 ] ||
4056 error "(9) Fail to repair dangling name entry: $repaired"
4058 repaired=$($SHOW_NAMESPACE |
4059 awk '/^multiple_linked_repaired/ { print $2 }')
4060 [ $repaired -eq 1 ] ||
4061 error "(10) Fail to drop the former created object: $repaired"
# foo must read back f0's content once the repair is done.
4063 local data=$(cat $DIR/$tdir/d0/foo)
4064 [ "$data" == "dummy" ] ||
4065 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
4067 run_test 23b "LFSCK can repair dangling name entry (2)"
# Helper whose header is not part of this listing (presumably the cleanup
# routine for test 23c, which follows -- confirm). It clears fail_val and
# fail_loc on the MDS, waits for the namespace LFSCK status to read
# "completed", reports error (10) otherwise, and stops full debug logging.
4070 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4071 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4072 mdd.${MDT_DEV}.lfsck_namespace |
4073 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4075 error "(10) unexpected status"
4078 stop_full_debug_logging
# test_23c: like test 23b, but the object re-created by LFSCK for the
# dangling entry is modified (data appended) while LFSCK is delayed, so the
# test expects dangling_repaired == 1 and foo NOT to read back "dummy".
# Skipped when the MDS is not newer than 2.6.50.
# NOTE(review): the function's opening/closing lines and other short lines
# (e.g. 'fi', 'return', '}') are not part of this listing.
4082 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4083 skip "MDS older than 2.6.50, LU-5512"
4086 echo "The objectA has multiple hard links, one of them corresponding"
4087 echo "to the name entry_B. But there is something wrong for the name"
4088 echo "entry_B and cause entry_B to references non-exist object_C."
4089 echo "In the first-stage scanning, the LFSCK will think the entry_B"
4090 echo "as dangling, and re-create the lost object_C. And then others"
4091 echo "modified the re-created object_C. When the LFSCK comes to the"
4092 echo "second-stage scanning, it will find that the former re-creating"
4093 echo "object_C maybe wrong and try to replace the object_C with the"
4094 echo "real object_A. But because object_C has been modified, so the"
4095 echo "LFSCK cannot replace it."
4098 start_full_debug_logging
4100 check_mount_and_prep
4102 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
4103 parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
4104 echo "parent_fid=$parent_fid"
# Filler files t0..t9; they are unlinked again before f1 is removed (see the
# LU-7429 comment further down).
4106 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
4108 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
4109 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4110 echo "f0_fid=$f0_fid"
4112 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
4113 f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
4114 echo "f1_fid=$f1_fid"
# Compare the part of each FID before the first ':' (the sequence). This
# previously read the unset variables fid_f0/fid_f1 with a regex-style
# pattern, so both sides were always empty and f0 was never re-created.
4116 if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
4117 # To guarantee that the f0 and f1 are in the same FID seq
4118 rm -f $DIR/$tdir/d0/f0 ||
4119 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
4120 echo "dummy" > $DIR/$tdir/d0/f0 ||
4121 error "(3.2) Fail to touch on MDT0"
4122 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4123 echo "f0_fid=$f0_fid (replaced)"
# Second ':'-separated field of f1's FID.
# NOTE(review): unlike test 23b, the value is not converted with printf %d
# here -- confirm fail_val accepts the form path2fid prints.
4126 local oid=$(awk -F':' '{ printf $2 }' <<< $f1_fid)
4128 echo "Inject failure stub on MDT0 to simulate dangling name entry"
4129 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
4130 do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
4131 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4132 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4134 # If there is creation after the dangling injection, it may re-use
4135 # the just released local object (inode) that is referenced by the
4136 # dangling name entry. It will fail the dangling injection.
4137 # So before deleting the target object for the dangling name entry,
4138 # remove some other objects to avoid the target object being reused
4139 # by some potential creations. LU-7429
4140 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4142 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
4144 echo "'ls' should fail because of dangling name entry"
4145 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4146 error "(6) ls should fail."
4148 #define OBD_FAIL_LFSCK_DELAY3 0x1602
# Delay injection (fail_val=10) is armed before LFSCK starts so the test can
# write to foo while the scan is still in progress.
4149 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
4151 echo "Trigger namespace LFSCK to find out dangling name entry"
4152 $START_NAMESPACE -r -C ||
4153 error "(7) Fail to start LFSCK for namespace"
# Wait until foo stats with size 0, i.e. the re-created empty object exists.
4155 wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
4156 # While unexpected by the test, it is valid for LFSCK to repair
4157 # the link to the original object before any data is written.
4158 local size=$(stat -c %s $DIR/$tdir/d0/foo)
4160 if [ "$size" = "6" -a "$(<$DIR/$tdir/d0/foo)" = "dummy" ]; then
4161 log "LFSCK repaired file prematurely"
4166 stat $DIR/$tdir/d0/foo
4168 error "(8) unexpected size"
# Modify the re-created object so that it no longer matches f0.
4171 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
4172 cancel_lru_locks osc
4176 local repaired=$($SHOW_NAMESPACE |
4177 awk '/^dangling_repaired/ { print $2 }')
4178 [ $repaired -eq 1 ] ||
4179 error "(11) Fail to repair dangling name entry: $repaired"
4181 local data=$(cat $DIR/$tdir/d0/foo)
4182 [ "$data" != "dummy" ] ||
4183 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
4185 run_test 23c "LFSCK can repair dangling name entry (3)"
# test_24: two MDT-objects carry the same linkEA entry while the name entry
# references only one of them. After LFSCK (-A -r) the test expects
# multiple_referenced_repaired == 1 and an orphan named
# "<cfid>-<pfid>-D-0" under .lustre/lost+found/MDT0000/.
# Skipped with fewer than 2 MDTs, when FILESET is set, or when the MDS is not
# newer than 2.6.50.
# NOTE(review): the function's opening/closing lines and other short lines
# (e.g. 'else'/'fi' of the 'if' below) are not part of this listing.
4188 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4189 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4190 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4191 skip "MDS older than 2.6.50, LU-5513"
4194 echo "Two MDT-objects back reference the same name entry via their"
4195 echo "each own linkEA entry, but the name entry only references one"
4196 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
4197 echo "for the MDT-object that is not recognized. If such MDT-object"
4198 echo "has no other linkEA entry after the removing, then the LFSCK"
4199 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
4202 check_mount_and_prep
4204 mkdir_on_mdt -i1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
# NOTE(review): error tag (1) is used by both this mkdir and the one above.
4206 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
4207 $LFS path2fid $DIR/$tdir/d0/guard
4209 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
4210 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid (the parent FID used in the expected orphan name) is guard's FID on
# non-ldiskfs backends; the second assignment uses dummy's FID and is
# presumably the 'else' branch -- confirm against the full file.
4213 if [ $mds1_FSTYPE != ldiskfs ]; then
4214 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
4216 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
4219 touch $DIR/$tdir/d0/guard/foo ||
4220 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
4222 echo "Inject failure stub on MDT0 to simulate the case that"
4223 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
4224 echo "that references $DIR/$tdir/d0/guard/foo."
4225 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
4226 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
4227 echo "there with the same linkEA entry as another MDT-object"
4228 echo "$DIR/$tdir/d0/guard/foo has"
4230 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
# The fail_loc stays active across both the mkdir and the rmdir of dummy/foo.
4231 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
4232 mkdir_on_mdt -i0 $DIR/$tdir/d0/dummy/foo ||
4233 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
4234 $LFS path2fid $DIR/$tdir/d0/dummy/foo
# cfid is captured before the rmdir; it forms the expected orphan name.
4235 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
4236 rmdir $DIR/$tdir/d0/dummy/foo ||
4237 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
4238 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4240 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4241 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4242 error "(6) stat successfully unexpectedly"
4244 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4245 $START_NAMESPACE -A -r ||
4246 error "(7) Fail to start LFSCK for namespace"
4248 wait_all_targets_blocked namespace completed 8
4250 local repaired=$($SHOW_NAMESPACE |
4251 awk '/^multiple_referenced_repaired/ { print $2 }')
4252 [ $repaired -eq 1 ] ||
4253 error "(9) Fail to repair multiple referenced name entry: $repaired"
4255 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4256 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4257 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4259 local cname="$cfid-$pfid-D-0"
4260 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4261 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4263 run_test 24 "LFSCK can repair multiple-referenced name entry"
# test_25: a name entry is created with a wrong file type under fail_loc
# 0x1623; after LFSCK (-r) the test expects bad_file_type_repaired == 1 and a
# successful 'ls' of the directory.
# Skipped unless the MDS backend is ldiskfs and the MDS is newer than 2.6.50.
# NOTE(review): the function's opening/closing lines and other short lines
# are not part of this listing.
4266 [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs fixes dirent type"
4267 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4268 skip "MDS older than 2.6.50, LU-5515"
4271 echo "The file type in the name entry does not match the file type"
4272 echo "claimed by the referenced object. Then the LFSCK will update"
4273 echo "the file type in the name entry."
4276 check_mount_and_prep
4278 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4280 echo "Inject failure stub on MDT0 to simulate the case that"
4281 echo "the file type stored in the name entry is wrong."
4283 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
# The fail_loc is active only while foo is created.
4284 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4285 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4286 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4288 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4289 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Poll the lfsck_namespace status on the MDS until it reads "completed".
4291 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4292 mdd.${MDT_DEV}.lfsck_namespace |
4293 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4295 error "(4) unexpected status"
4298 local repaired=$($SHOW_NAMESPACE |
4299 awk '/^bad_file_type_repaired/ { print $2 }')
4300 [ $repaired -eq 1 ] ||
4301 error "(5) Fail to repair bad file type in name entry: $repaired"
4303 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4305 run_test 25 "LFSCK can repair bad file type in the name entry"
# test_26a: foo's name entry is removed under fail_loc 0x1624 while a second
# hard link (dummy) remains. After LFSCK (-r -A) the test expects
# lost_dirent_repaired == 1, foo to be listable again, and its FID unchanged.
# Skipped when the MDS is not newer than 2.6.50.
# NOTE(review): the function's opening and closing lines are not part of this
# listing; only body statements and the run_test registration are visible.
4308 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4309 skip "MDS older than 2.6.50, LU-5516"
4312 echo "The local name entry back referenced by the MDT-object is lost."
4313 echo "The namespace LFSCK will add the missing local name entry back"
4314 echo "to the normal namespace."
4317 check_mount_and_prep
4319 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4320 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# Remember foo's FID so it can be compared after the repair.
4321 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4323 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4324 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4326 echo "Inject failure stub on MDT0 to simulate the case that"
4327 echo "foo's name entry will be removed, but the foo's object"
4328 echo "and its linkEA are kept in the system."
4330 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4331 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4332 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4333 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4335 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4336 error "(5) 'ls' should fail"
# NOTE(review): the message says "remote" although this test's banner is
# about the local name entry.
4338 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4339 $START_NAMESPACE -r -A ||
4340 error "(6) Fail to start LFSCK for namespace"
4342 wait_all_targets_blocked namespace completed 7
4344 local repaired=$($SHOW_NAMESPACE |
4345 awk '/^lost_dirent_repaired/ { print $2 }')
4346 [ $repaired -eq 1 ] ||
4347 error "(8) Fail to repair lost dirent: $repaired"
4349 ls -ail $DIR/$tdir/d0/foo ||
4350 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored entry must reference the same object as before.
4352 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4353 [ "$foofid" == "$foofid2" ] ||
4354 error "(10) foo's FID changed: $foofid, $foofid2"
4356 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# test_26b: remote variant of test 26a. d0 is created on MDT1 and d0/foo on
# MDT0; foo's name entry is removed under fail_loc 0x1624. After LFSCK
# (-r -A) the test expects lost_dirent_repaired == 1, foo to be listable
# again, and its FID unchanged.
# Skipped with fewer than 2 MDTs or when the MDS is not newer than 2.6.50.
# NOTE(review): the function's opening and closing lines are not part of this
# listing; only body statements and the run_test registration are visible.
4359 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4360 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4361 skip "MDS older than 2.6.50, LU-5516"
4364 echo "The remote name entry back referenced by the MDT-object is lost."
4365 echo "The namespace LFSCK will add the missing remote name entry back"
4366 echo "to the normal namespace."
4369 check_mount_and_prep
4371 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4372 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Remember foo's FID so it can be compared after the repair.
4373 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4375 echo "Inject failure stub on MDT0 to simulate the case that"
4376 echo "foo's name entry will be removed, but the foo's object"
4377 echo "and its linkEA are kept in the system."
4379 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4380 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4381 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4382 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4384 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4385 error "(4) 'ls' should fail"
4387 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4388 $START_NAMESPACE -r -A ||
4389 error "(5) Fail to start LFSCK for namespace"
4391 wait_all_targets_blocked namespace completed 6
4393 local repaired=$($SHOW_NAMESPACE |
4394 awk '/^lost_dirent_repaired/ { print $2 }')
4395 [ $repaired -eq 1 ] ||
4396 error "(7) Fail to repair lost dirent: $repaired"
4398 ls -ail $DIR/$tdir/d0/foo ||
4399 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored entry must reference the same object as before.
4401 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4402 [ "$foofid" == "$foofid2" ] ||
4403 error "(9) foo's FID changed: $foofid, $foofid2"
4405 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# test_27a: foo and its hard link dummy are unlinked under fail_loc 0x1624,
# then the parent d0 is removed. After LFSCK (-r -A) the test expects
# lost_dirent_repaired == 1 and an entry matching *-P-* under
# .lustre/lost+found/MDT0000/.
# Skipped when FILESET is set or when the MDS is not newer than 2.6.50.
# NOTE(review): the function's opening and closing lines are not part of this
# listing; only body statements and the run_test registration are visible.
4408 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4409 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4410 skip "MDS older than 2.6.50, LU-5516"
4413 echo "The local parent referenced by the MDT-object linkEA is lost."
4414 echo "The namespace LFSCK will re-create the lost parent as orphan."
4417 check_mount_and_prep
4419 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4420 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4421 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4422 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4424 echo "Inject failure stub on MDT0 to simulate the case that"
4425 echo "foo's name entry will be removed, but the foo's object"
4426 echo "and its linkEA are kept in the system. And then remove"
4427 echo "another hard link and the parent directory."
4429 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# Both unlinks happen while the fail_loc is active.
4430 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4431 rm -f $DIR/$tdir/d0/foo ||
4432 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4433 rm -f $DIR/$tdir/d0/dummy ||
4434 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4435 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): error tags (5) and (6) are each used twice in this test.
4437 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
4438 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
4440 echo "Trigger namespace LFSCK to repair the lost parent"
4441 $START_NAMESPACE -r -A ||
4442 error "(6) Fail to start LFSCK for namespace"
4444 wait_all_targets_blocked namespace completed 7
4446 local repaired=$($SHOW_NAMESPACE |
4447 awk '/^lost_dirent_repaired/ { print $2 }')
4448 [ $repaired -eq 1 ] ||
4449 error "(8) Fail to repair lost dirent: $repaired"
4451 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4452 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4453 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4455 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so find receives it verbatim; unquoted, the shell
# could expand it against files in the current working directory first.
4457 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
4458 [ ! -z "$cname" ] ||
4459 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4461 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# test_27b: remote variant of test 27a. d0 is created on MDT1 and d0/foo on
# MDT0; foo's name entry is removed under fail_loc 0x1624 and d0 is removed.
# After LFSCK (-r -A) the test expects lost_dirent_repaired == 1 and an entry
# matching *-P-* under .lustre/lost+found/MDT0001/.
# Skipped with fewer than 2 MDTs, when FILESET is set, or when the MDS is not
# newer than 2.6.50.
# NOTE(review): the function's opening and closing lines are not part of this
# listing; only body statements and the run_test registration are visible.
4464 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4465 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4466 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4467 skip "MDS older than 2.6.50, LU-5516"
4470 echo "The remote parent referenced by the MDT-object linkEA is lost."
4471 echo "The namespace LFSCK will re-create the lost parent as orphan."
4474 check_mount_and_prep
4476 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4477 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Printed into the test log only; the value is not captured.
4479 $LFS path2fid $DIR/$tdir/d0
4481 echo "Inject failure stub on MDT0 to simulate the case that"
4482 echo "foo's name entry will be removed, but the foo's object"
4483 echo "and its linkEA are kept in the system. And then remove"
4484 echo "the parent directory."
4486 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4487 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4488 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4489 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4491 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4492 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
4494 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4495 $START_NAMESPACE -r -A ||
4496 error "(6) Fail to start LFSCK for namespace"
4498 wait_all_targets_blocked namespace completed 7
4500 local repaired=$($SHOW_NAMESPACE |
4501 awk '/^lost_dirent_repaired/ { print $2 }')
4502 [ $repaired -eq 1 ] ||
4503 error "(8) Fail to repair lost dirent: $repaired"
4505 ls -ail $MOUNT/.lustre/lost+found/
4507 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4508 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4509 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4511 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The pattern is quoted so find receives it verbatim; unquoted, the shell
# could expand it against files in the current working directory first.
4513 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
4514 [ ! -z "$cname" ] ||
4515 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4517 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# test_28: orphan handling when one MDT fails during the first LFSCK stage.
# Name entries d1/a1 and d2/a2 are removed under fail_loc 0x1624, then
# fail_loc 0x161c is set on mds2 and LFSCK is started on all devices. The
# first run expects status "partial" on mds1 and "completed" on mds2, with
# lost_dirent_repaired 0 on mds1 and 1 on mds2; the second run (fail_loc
# cleared) expects 1 on mds1 and 0 on mds2.
# Skipped with fewer than 2 MDTs or when the MDS is not newer than 2.6.50.
# NOTE(review): the function's opening/closing lines and other short lines
# (e.g. '}' and the last banner line) are not part of this listing.
4520 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4521 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4522 skip "MDS older than 2.6.50, LU-5506"
4525 echo "The target name entry is lost. The LFSCK should insert the"
4526 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4527 echo "the MDT (on which the orphan MDT-object resides) has ever"
4528 echo "failed to respond some name entry verification during the"
4529 echo "first stage-scanning, then the LFSCK should skip to handle"
4530 echo "orphan MDT-object on this MDT. But other MDTs should not"
4534 check_mount_and_prep
# d1 on MDT0 with children on MDT1; d2 on MDT1 with children on MDT0.
4535 $LFS mkdir -i 0 $DIR/$tdir/d1
4536 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4537 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
4539 $LFS mkdir -i 1 $DIR/$tdir/d2
4540 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4541 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4543 echo "Inject failure stub on MDT0 to simulate the case that"
4544 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4545 echo "and its linkEA are kept in the system. And the case that"
4546 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4547 echo "and its linkEA are kept in the system."
4549 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is set on both MDSes for the duration of the two rmdirs.
4550 do_facet mds1 $LCTL set_param fail_loc=0x1624
4551 do_facet mds2 $LCTL set_param fail_loc=0x1624
4552 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4553 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4554 do_facet mds1 $LCTL set_param fail_loc=0
4555 do_facet mds2 $LCTL set_param fail_loc=0
4557 cancel_lru_locks mdc
4558 cancel_lru_locks osc
# NOTE(review): the message names MDT0 as the failing side, while the
# fail_loc below is set on mds2 -- confirm which wording is intended.
4560 echo "Inject failure, to simulate the MDT0 fail to handle"
4561 echo "MDT1 LFSCK request during the first-stage scanning."
4562 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
4563 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4565 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4566 $START_NAMESPACE -r -A ||
4567 error "(3) Fail to start LFSCK for namespace"
# Per-MDS status polling: mds1 must end up "partial", mds2 "completed".
4569 wait_update_facet mds1 "$LCTL get_param -n \
4570 mdd.$(facet_svc mds1).lfsck_namespace |
4571 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4572 error "(4) mds1 is not the expected 'partial'"
4575 wait_update_facet mds2 "$LCTL get_param -n \
4576 mdd.$(facet_svc mds2).lfsck_namespace |
4577 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4578 error "(5) mds2 is not the expected 'completed'"
4581 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
4583 local repaired=$(do_facet mds1 $LCTL get_param -n \
4584 mdd.$(facet_svc mds1).lfsck_namespace |
4585 awk '/^lost_dirent_repaired/ { print $2 }')
4586 [ $repaired -eq 0 ] ||
4587 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4589 repaired=$(do_facet mds2 $LCTL get_param -n \
4590 mdd.$(facet_svc mds2).lfsck_namespace |
4591 awk '/^lost_dirent_repaired/ { print $2 }')
4592 [ $repaired -eq 1 ] ||
4593 error "(7) Expect 1 fixed on mds2, but got: $repaired"
# Second run without fault injection; the expected counts swap.
4595 echo "Trigger namespace LFSCK on all devices again to cleanup"
4596 $START_NAMESPACE -r -A ||
4597 error "(8) Fail to start LFSCK for namespace"
4599 wait_all_targets_blocked namespace completed 9
4601 local repaired=$(do_facet mds1 $LCTL get_param -n \
4602 mdd.$(facet_svc mds1).lfsck_namespace |
4603 awk '/^lost_dirent_repaired/ { print $2 }')
4604 [ $repaired -eq 1 ] ||
4605 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4607 repaired=$(do_facet mds2 $LCTL get_param -n \
4608 mdd.$(facet_svc mds2).lfsck_namespace |
4609 awk '/^lost_dirent_repaired/ { print $2 }')
4610 [ $repaired -eq 0 ] ||
4611 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4613 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# test_29a: a hard link is created under fail_loc 0x1625 so that foo's nlink
# reads 3; after LFSCK (-r -A) the test expects nlinks_repaired == 1 and
# nlink back at 2. The run_test registration at the end is commented out, so
# this test is not registered.
# Skipped when the MDS is not newer than 2.6.50.
# NOTE(review): the function's opening and closing lines are not part of this
# listing; only body statements are visible.
4616 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4617 skip "MDS older than 2.6.50, LU-5517"
4620 echo "The object's nlink attribute is larger than the object's known"
4621 echo "name entries count. The LFSCK will repair the object's nlink"
4622 echo "attribute to match the known name entries count"
4625 check_mount_and_prep
4627 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4628 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4630 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4631 echo "nlink attribute is larger than its name entries count."
4633 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
# The fail_loc is active only while the hard link is created.
4634 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4635 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4636 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4637 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# MDC locks are cancelled before stat reads the link count; the injection is
# verified by checking that nlink reads 3.
4639 cancel_lru_locks mdc
4640 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4641 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4643 echo "Trigger namespace LFSCK to repair the nlink count"
4644 $START_NAMESPACE -r -A ||
4645 error "(5) Fail to start LFSCK for namespace"
4647 wait_all_targets_blocked namespace completed 6
4649 local repaired=$($SHOW_NAMESPACE |
4650 awk '/^nlinks_repaired/ { print $2 }')
4651 [ $repaired -eq 1 ] ||
4652 error "(7) Fail to repair nlink count: $repaired"
4654 cancel_lru_locks mdc
4655 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4656 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4658 # Disable 29a, we only allow nlink to be updated if the known linkEA
4659 # entries is larger than nlink count.
4661 #run_test 29a "LFSCK can repair bad nlink count (1)"
# test_29b: a hard link is created under fail_loc 0x1626 so that foo's nlink
# reads 1; after LFSCK (-r -A) the test expects nlinks_repaired == 1 and
# nlink at 2.
# Skipped when the MDS is not newer than 2.6.50.
# NOTE(review): the function's opening and closing lines are not part of this
# listing; only body statements and the run_test registration are visible.
4664 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4665 skip "MDS older than 2.6.50, LU-5517"
4668 echo "The object's nlink attribute is smaller than the object's known"
4669 echo "name entries count. The LFSCK will repair the object's nlink"
4670 echo "attribute to match the known name entries count"
4673 check_mount_and_prep
4675 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4676 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4678 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4679 echo "nlink attribute is smaller than its name entries count."
4681 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
# The fail_loc is active only while the hard link is created.
4682 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4683 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4684 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4685 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# MDC locks are cancelled before stat reads the link count; the injection is
# verified by checking that nlink reads 1.
4687 cancel_lru_locks mdc
4688 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4689 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4691 echo "Trigger namespace LFSCK to repair the nlink count"
4692 $START_NAMESPACE -r -A ||
4693 error "(5) Fail to start LFSCK for namespace"
4695 wait_all_targets_blocked namespace completed 6
4697 local repaired=$($SHOW_NAMESPACE |
4698 awk '/^nlinks_repaired/ { print $2 }')
4699 [ $repaired -eq 1 ] ||
4700 error "(7) Fail to repair nlink count: $repaired"
4702 cancel_lru_locks mdc
4703 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4704 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4706 run_test 29b "LFSCK can repair bad nlink count (2)"
# test_29c: linkEA overflow. 150 hard links to guard/f0 are created, then 100
# are removed. With >= 2 MDTs, migrating guard is expected to fail both
# before and after the removal (f0's FID unchanged). After LFSCK (-r -A) the
# test expects linkea_overflow_cleared == 1 and nlinks_repaired == 0, and,
# with >= 2 MDTs, the migration to succeed with a changed FID.
# Skipped when the MDS is not newer than 2.6.50.
# NOTE(review): the function's opening/closing lines and other short lines
# (e.g. 'fi' and the command after the "sleep 3 seconds" comment) are not
# part of this listing.
4710 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4711 skip "MDS older than 2.6.50, LU-5517"
4714 echo "The namespace LFSCK will create many hard links to the target"
4715 echo "file as to exceed the linkEA size limitation. Under such case"
4716 echo "the linkEA will be marked as overflow that will prevent the"
4717 echo "target file to be migrated. Then remove some hard links to"
4718 echo "make the left hard links to be held within the linkEA size"
4719 echo "limitation. But before the namespace LFSCK adding all the"
4720 echo "missed linkEA entries back, the overflow mark (timestamp)"
4721 echo "will not be cleared."
4724 check_mount_and_prep
# foo is placed on the last MDT (index MDSCOUNT - 1).
4726 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
4727 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4728 error "(0.2) Fail to mkdir"
4729 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# f0's FID before any migration attempt; compared against after each attempt.
4730 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4732 # define MAX_LINKEA_SIZE 4096
4733 # sizeof(link_ea_header) = 24
4734 # sizeof(link_ea_entry) = 18
4735 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4736 # (sizeof(link_ea_entry) + name_length))
4737 # If the average name length is 12 bytes, then 150 hard links
4738 # is totally enough to overflow the linkEA
4739 echo "Create 150 hard links should succeed although the linkEA overflow"
4740 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4741 error "(2) Fail to hard link"
4743 cancel_lru_locks mdc
4744 if [ $MDSCOUNT -ge 2 ]; then
4745 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4746 error "(3.1) Migrate should fail"
4748 echo "The object with linkEA overflow should NOT be migrated"
4749 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4750 [ "$newfid" == "$oldfid" ] ||
4751 error "(3.2) Migrate should fail: $newfid != $oldfid"
4754 # Remove 100 hard links, then the linkEA should have space
4755 # to hold the missed linkEA entries.
4756 echo "Remove 100 hard links to save space for the missed linkEA entries"
4757 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
4759 if [ $MDSCOUNT -ge 2 ]; then
4760 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4761 error "(5.1) Migrate should fail"
4763 # The overflow timestamp is still there, so migration will fail.
4764 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4765 [ "$newfid" == "$oldfid" ] ||
4766 error "(5.2) Migrate should fail: $newfid != $oldfid"
4769 # sleep 3 seconds to guarantee that the overflow is recognized
4772 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4773 $START_NAMESPACE -r -A ||
4774 error "(6) Fail to start LFSCK for namespace"
4776 wait_all_targets_blocked namespace completed 7
4778 local repaired=$($SHOW_NAMESPACE |
4779 awk '/^linkea_overflow_cleared/ { print $2 }')
4780 [ $repaired -eq 1 ] ||
4781 error "(8) Fail to clear linkea overflow: $repaired"
4783 repaired=$($SHOW_NAMESPACE |
4784 awk '/^nlinks_repaired/ { print $2 }')
4785 [ $repaired -eq 0 ] ||
4786 error "(9) Unexpected nlink repaired: $repaired"
4788 if [ $MDSCOUNT -ge 2 ]; then
4789 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4790 error "(10.1) Migrate failure"
4792 # Migration should succeed after clear the overflow timestamp.
4793 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4794 [ "$newfid" != "$oldfid" ] ||
4795 error "(10.2) Migrate should succeed"
4797 ls -l $DIR/$tdir/foo > /dev/null ||
4798 error "(11) 'ls' failed after migration"
4801 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4802 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4804 run_test 29c "verify linkEA size limitation"
# Test 30 (labelled by the run_test line that ends this block): objects whose
# name entries were dropped are turned into backend /lost+found orphans by
# e2fsck, and the namespace LFSCK must bring them back into the namespace.
# NOTE(review): the embedded listing numbers are not contiguous here, so some
# lines of the test (header, closers, short statements) are presumably missing
# from this view -- confirm against the complete script.
# Skip unless MDS1 is ldiskfs, FILESET is empty and MDS1 is newer than 2.6.50.
4807 [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs has lost+found"
4808 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4809 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4810 skip "MDS older than 2.6.50, LU-5518"
4813 echo "The namespace LFSCK will move the orphans from backend"
4814 echo "/lost+found directory to normal client visible namespace"
4815 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
4818 check_mount_and_prep
# foo is created with -i 0 and receives a regular file f0.
4820 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
4821 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
4823 echo "Inject failure stub on MDT0 to simulate the case that"
4824 echo "directory d0 has no linkEA entry, then the LFSCK will"
4825 echo "move it into .lustre/lost+found/MDTxxxx/ later."
# fail_loc 0x161d is active on $SINGLEMDS only while d0 is created.
4827 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4828 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4829 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4830 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Parent and child FIDs are saved because they form the expected orphan name
# checked at the end of the test.
4832 local pfid=$($LFS path2fid $DIR/$tdir/foo)
4833 local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)
4835 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4836 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4838 echo "Inject failure stub on MDT0 to simulate the case that the"
4839 echo "object's name entry will be removed, but not destroy the"
4840 echo "object. Then backend e2fsck will handle it as orphan and"
4841 echo "add them into the backend /lost+found directory."
# fail_loc 0x1624 is active on $SINGLEMDS while the four names are removed.
4843 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4844 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4845 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4846 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4847 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4848 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4849 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Unmount the client and stop the MDS so e2fsck -y can run on its device,
# then restart the MDS with the noscrub mount options.
4851 umount_client $MOUNT || error "(10) Fail to stop client!"
4853 stop $SINGLEMDS || error "(11) Fail to stop $SINGLEMDS"
4855 local dev=$(facet_device $SINGLEMDS)
4857 echo "run e2fsck on $SINGLEMDS"
4858 run_e2fsck $(facet_active_host $SINGLEMDS) $dev "-y" ||
4859 error "(12) Fail to run e2fsck"
4861 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 13
# Run the namespace LFSCK with -r -A, then call wait_all_targets_blocked for
# the 'completed' status.
4863 echo "Trigger namespace LFSCK to recover backend orphans"
4864 $START_NAMESPACE -r -A ||
4865 error "(14) Fail to start LFSCK for namespace"
4867 wait_all_targets_blocked namespace completed 15
# At least four objects (f0, d0, f1, d1 were unlinked above) must be counted
# in local_lost_found_moved.
4869 local repaired=$($SHOW_NAMESPACE |
4870 awk '/^local_lost_found_moved/ { print $2 }')
4871 [ $repaired -ge 4 ] ||
4872 error "(16) Fail to recover backend orphans: $repaired"
4874 mount_client $MOUNT || error "(17) Fail to start client!"
# f0 is expected back under its original path.
4876 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
4878 ls -ail $MOUNT/.lustre/lost+found/
4880 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4881 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4882 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4884 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The orphan name is built from the saved FIDs, so the string itself is never
# empty; check that the directory really exists so that (20) can fire.
4886 local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
4887 [ -d "$cname" ] || error "(20) d0 is not recovered"
# d0's former children must be reachable below the recovered directory.
4889 stat ${cname}/d1 || error "(21) d1 is not recovered"
4890 stat ${cname}/f1 || error "(22) f1 is not recovered"
4892 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# Test 31a (labelled by the run_test line that ends this block): name entries
# created under fail_loc 0x1628 / fail_val=0 in a striped directory must be
# repaired by the namespace LFSCK.
# NOTE(review): the embedded listing numbers are not contiguous here, so some
# lines of the test (header, closers, short statements) are presumably missing
# from this view -- confirm against the complete script.
# Skip with fewer than 2 MDTs or when MDS1 is not newer than 2.6.50.
4895 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4896 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4897 skip "MDS older than 2.6.50, LU-5519"
4900 echo "For the name entry under a striped directory, if the name"
4901 echo "hash does not match the shard, then the LFSCK will repair"
4902 echo "the bad name entry"
4905 check_mount_and_prep
# The directory is striped with -i 0 -c $MDSCOUNT.
4907 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4908 error "(1) Fail to create striped directory"
4910 echo "Inject failure stub on client to simulate the case that"
4911 echo "some name entry should be inserted into other non-first"
4912 echo "shard, but inserted into the first shard by wrong"
# set_param is run without do_facet, i.e. on the node executing this script;
# the fail_loc is active only while $MDSCOUNT subdirectories are created.
4914 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4915 $LCTL set_param fail_loc=0x1628 fail_val=0
4916 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4917 error "(2) Fail to create file under striped directory"
4918 $LCTL set_param fail_loc=0 fail_val=0
# Run the namespace LFSCK with -r -A, then call wait_all_targets_blocked for
# the 'completed' status.
4920 echo "Trigger namespace LFSCK to repair bad name hash"
4921 $START_NAMESPACE -r -A ||
4922 error "(3) Fail to start LFSCK for namespace"
4924 wait_all_targets_blocked namespace completed 4
# At least one name_hash_repaired is expected on $SINGLEMDS.
4926 local repaired=$($SHOW_NAMESPACE |
4927 awk '/^name_hash_repaired/ { print $2 }')
4928 [ $repaired -ge 1 ] ||
4929 error "(5) Fail to repair bad name hash: $repaired"
# rc is the number of lines 'lfs find -H badtype' prints for the directory.
# NOTE(review): the condition guarding the following error call is not
# visible in this listing (number 4932 is skipped) -- confirm.
4931 local rc=$($LFS find -H badtype $DIR/$tdir/striped_dir | wc -l)
4933 error "Fail to find flag bad type: $rc"
# Remount the client, then stat and remove every created subdirectory.
4935 umount_client $MOUNT || error "(6) umount failed"
4936 mount_client $MOUNT || error "(7) mount failed"
4938 for ((i = 0; i < $MDSCOUNT; i++)); do
4939 stat $DIR/$tdir/striped_dir/d$i ||
4940 error "(8) Fail to stat d$i after LFSCK"
4941 rmdir $DIR/$tdir/striped_dir/d$i ||
4942 error "(9) Fail to unlink d$i after LFSCK"
4945 rmdir $DIR/$tdir/striped_dir ||
4946 error "(10) Fail to remove the striped directory after LFSCK"
4948 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# Test 31b (labelled by the run_test line that ends this block): same
# scenario as the fail_val=0 variant, but with fail_val=1, MDSCOUNT * 5
# subdirectories, and the repair counter read from mds2.
# NOTE(review): the embedded listing numbers are not contiguous here, so some
# lines of the test (header, closers, short statements) are presumably missing
# from this view -- confirm against the complete script.
# Skip with fewer than 2 MDTs or when MDS1 is not newer than 2.6.50.
4951 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4952 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4953 skip "MDS older than 2.6.50, LU-5519"
4956 echo "For the name entry under a striped directory, if the name"
4957 echo "hash does not match the shard, then the LFSCK will repair"
4958 echo "the bad name entry"
4961 check_mount_and_prep
# The directory is striped with -i 0 -c $MDSCOUNT.
4963 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4964 error "(1) Fail to create striped directory"
4966 echo "Inject failure stub on client to simulate the case that"
4967 echo "some name entry should be inserted into other non-second"
4968 echo "shard, but inserted into the secod shard by wrong"
# set_param is run without do_facet, i.e. on the node executing this script;
# the fail_loc is active only while the subdirectories are created.
4970 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4971 $LCTL set_param fail_loc=0x1628 fail_val=1
4972 createmany -d $DIR/$tdir/striped_dir/d $((MDSCOUNT * 5)) ||
4973 error "(2) Fail to create file under striped directory"
4974 $LCTL set_param fail_loc=0 fail_val=0
# Run the namespace LFSCK with -r -A, then call wait_all_targets_blocked for
# the 'completed' status.
4976 echo "Trigger namespace LFSCK to repair bad name hash"
4977 $START_NAMESPACE -r -A ||
4978 error "(3) Fail to start LFSCK for namespace"
4980 wait_all_targets_blocked namespace completed 4
# The counter is read from mds2's lfsck_namespace; at least one
# name_hash_repaired is expected there.
4982 local repaired=$(do_facet mds2 $LCTL get_param -n \
4983 mdd.$(facet_svc mds2).lfsck_namespace |
4984 awk '/^name_hash_repaired/ { print $2 }')
4985 echo "repaired $repaired name entries with bad hash"
4986 [ $repaired -ge 1 ] ||
4987 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client, then stat and remove every created subdirectory.
4989 umount_client $MOUNT || error "(6) umount failed"
4990 mount_client $MOUNT || error "(7) mount failed"
4992 for ((i = 0; i < $((MDSCOUNT * 5)); i++)); do
4993 stat $DIR/$tdir/striped_dir/d$i ||
4994 error "(8) Fail to stat d$i after LFSCK"
4995 rmdir $DIR/$tdir/striped_dir/d$i ||
4996 error "(9) Fail to unlink d$i after LFSCK"
4999 rmdir $DIR/$tdir/striped_dir ||
5000 error "(10) Fail to remove the striped directory after LFSCK"
5002 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Test 31c (labelled by the run_test line that ends this block): a striped
# directory created under fail_loc 0x1629 must get its master LMV EA
# re-generated by the namespace LFSCK.
# NOTE(review): the embedded listing numbers are not contiguous here, so some
# lines of the test (header, closers, short statements) are presumably missing
# from this view -- confirm against the complete script.
# Skip with fewer than 2 MDTs or when MDS1 is not newer than 2.6.50.
5005 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5006 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5007 skip "MDS older than 2.6.50, LU-5519"
5010 echo "For some reason, the master MDT-object of the striped directory"
5011 echo "may lost its master LMV EA. If nobody created files under the"
5012 echo "master directly after the master LMV EA lost, then the LFSCK"
5013 echo "should re-generate the master LMV EA."
5016 check_mount_and_prep
5018 echo "Inject failure stub on MDT0 to simulate the case that the"
5019 echo "master MDT-object of the striped directory lost the LMV EA."
# The fail_loc is active on $SINGLEMDS only while the striped directory is
# created.
5021 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
5022 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
5023 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5024 error "(1) Fail to create striped directory"
5025 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run the namespace LFSCK with -r -A, then call wait_all_targets_blocked for
# the 'completed' status.
5027 echo "Trigger namespace LFSCK to re-generate master LMV EA"
5028 $START_NAMESPACE -r -A ||
5029 error "(2) Fail to start LFSCK for namespace"
5031 wait_all_targets_blocked namespace completed 3
# Exactly one striped_dirs_repaired is expected.
5033 local repaired=$($SHOW_NAMESPACE |
5034 awk '/^striped_dirs_repaired/ { print $2 }')
5035 [ $repaired -eq 1 ] ||
5036 error "(4) Fail to re-generate master LMV EA: $repaired"
# 'lfs find -H lostlmv' must print exactly one line for the directory.
5038 local rc=$($LFS find -H lostlmv $DIR/$tdir/striped_dir | wc -l)
5039 [ $rc -eq 1 ] || error "Fail to find flag lost LMV: $rc"
# After a client remount the directory must list as empty and be removable.
5041 umount_client $MOUNT || error "(5) umount failed"
5042 mount_client $MOUNT || error "(6) mount failed"
5044 local empty=$(ls $DIR/$tdir/striped_dir/)
5045 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
5047 rmdir $DIR/$tdir/striped_dir ||
5048 error "(8) Fail to remove the striped directory after LFSCK"
5050 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Test 31d (labelled by the run_test line that ends this block): a striped
# directory created under fail_loc 0x1629 and then modified must not be
# repaired by the first namespace LFSCK run; new creations in it must fail
# until 'chattr -i' is applied.
# NOTE(review): the embedded listing numbers are not contiguous here, so some
# lines of the test (header, closers, short statements) are presumably missing
# from this view -- confirm against the complete script.
# Skip with fewer than 2 MDTs or when MDS1 is not newer than 2.6.50.
5053 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5054 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5055 skip "MDS older than 2.6.50, LU-5519"
5058 echo "For some reason, the master MDT-object of the striped directory"
5059 echo "may lost its master LMV EA. If somebody created files under the"
5060 echo "master directly after the master LMV EA lost, then the LFSCK"
5061 echo "should NOT re-generate the master LMV EA, instead, it should"
5062 echo "change the broken striped dirctory as read-only to prevent"
5063 echo "further damage"
5066 check_mount_and_prep
5068 echo "Inject failure stub on MDT0 to simulate the case that the"
5069 echo "master MDT-object of the striped directory lost the LMV EA."
# The fail_loc is active on $SINGLEMDS only while the striped directory is
# created.
5071 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
5072 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
5073 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5074 error "(1) Fail to create striped directory"
5075 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# Client is unmounted, mds1 is started, client is mounted again.
# NOTE(review): no stop of mds1 is visible before this start; it is
# presumably on a line omitted from this listing (5078) -- confirm.
5077 umount_client $MOUNT || error "(2) umount failed"
5079 start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS
5080 mount_client $MOUNT || error "(3) mount failed"
# Creating dummy is the modification that the scenario text refers to.
5082 touch $DIR/$tdir/striped_dir/dummy ||
5083 error "(4) Fail to touch under broken striped directory"
# Run the namespace LFSCK with -r -A, then call wait_all_targets_blocked for
# the 'completed' status.
5085 echo "Trigger namespace LFSCK to find out the inconsistency"
5086 $START_NAMESPACE -r -A ||
5087 error "(5) Fail to start LFSCK for namespace"
5089 wait_all_targets_blocked namespace completed 6
# No striped directory repair is expected from this run.
5091 local repaired=$($SHOW_NAMESPACE |
5092 awk '/^striped_dirs_repaired/ { print $2 }')
5093 [ $repaired -eq 0 ] ||
5094 error "(7) Re-generate master LMV EA unexpected: $repaired"
# dummy must still be visible, while creating a new file must fail.
5096 stat $DIR/$tdir/striped_dir/dummy ||
5097 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
5099 touch $DIR/$tdir/striped_dir/foo &&
5100 error "(9) The broken striped directory should be read-only"
# Clear the immutable attribute so that dummy can be removed.
5102 chattr -i $DIR/$tdir/striped_dir ||
5103 error "(10) Fail to chattr on the broken striped directory"
5105 rm -f $DIR/$tdir/striped_dir/dummy || error "(11) Fail to remove dummy"
5107 # LFSCK again to regenerate master LMV
5108 echo "Trigger namespace LFSCK to find out the inconsistency"
5109 $START_NAMESPACE -r -A ||
5110 error "(12) Fail to start LFSCK for namespace"
5112 wait_all_targets_blocked namespace completed 6
5114 # reload striped_dir to parse newly generated LMV
# NOTE(review): as above, the stop matching this start is not visible (5115).
5116 start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS
5118 rmdir $DIR/$tdir/striped_dir ||
5119 error "(13) Fail to remove the striped directory after LFSCK"
5121 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Test 31e (labelled by the run_test line that ends this block): a striped
# directory created under fail_loc 0x162a / fail_val=0 must have one shard
# repaired by the namespace LFSCK, as counted on $SINGLEMDS.
# NOTE(review): the embedded listing numbers are not contiguous here, so some
# lines of the test (header, closers, short statements) are presumably missing
# from this view -- confirm against the complete script.
# Skip with fewer than 2 MDTs or when MDS1 is not newer than 2.6.50.
5124 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5125 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5126 skip "MDS older than 2.6.50, LU-5519"
5129 echo "For some reason, the slave MDT-object of the striped directory"
5130 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5131 echo "slave LMV EA."
5134 check_mount_and_prep
5136 echo "Inject failure stub on MDT0 to simulate the case that the"
5137 echo "slave MDT-object (that resides on the same MDT as the master"
5138 echo "MDT-object resides on) lost the LMV EA."
# The fail_loc is active on $SINGLEMDS only while the striped directory is
# created.
5140 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5141 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
5142 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5143 error "(1) Fail to create striped directory"
5144 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
# Run the namespace LFSCK with -r -A, then call wait_all_targets_blocked for
# the 'completed' status.
5146 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5147 $START_NAMESPACE -r -A ||
5148 error "(2) Fail to start LFSCK for namespace"
5150 wait_all_targets_blocked namespace completed 3
# Exactly one striped_shards_repaired is expected.
5152 local repaired=$($SHOW_NAMESPACE |
5153 awk '/^striped_shards_repaired/ { print $2 }')
5154 [ $repaired -eq 1 ] ||
5155 error "(4) Fail to re-generate slave LMV EA: $repaired"
5157 rmdir $DIR/$tdir/striped_dir ||
5158 error "(5) Fail to remove the striped directory after LFSCK"
5160 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Test 31f (labelled by the run_test line that ends this block): a striped
# directory created under fail_loc 0x162a / fail_val=1 must have one shard
# repaired by the namespace LFSCK, as counted on mds2.
# NOTE(review): the embedded listing numbers are not contiguous here, so some
# lines of the test (header, closers, short statements) are presumably missing
# from this view -- confirm against the complete script.
# Skip with fewer than 2 MDTs or when MDS1 is not newer than 2.6.50.
5163 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5164 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5165 skip "MDS older than 2.6.50, LU-5519"
5168 echo "For some reason, the slave MDT-object of the striped directory"
5169 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5170 echo "slave LMV EA."
5173 check_mount_and_prep
5175 echo "Inject failure stub on MDT0 to simulate the case that the"
5176 echo "slave MDT-object (that resides on different MDT as the master"
5177 echo "MDT-object resides on) lost the LMV EA."
# The fail_loc is active on $SINGLEMDS only while the striped directory is
# created.
5179 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5180 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
5181 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5182 error "(1) Fail to create striped directory"
5183 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
# Run the namespace LFSCK with -r -A, then call wait_all_targets_blocked for
# the 'completed' status.
5185 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5186 $START_NAMESPACE -r -A ||
5187 error "(2) Fail to start LFSCK for namespace"
5189 wait_all_targets_blocked namespace completed 3
# The counter is read from mds2's lfsck_namespace; exactly one
# striped_shards_repaired is expected there.
5191 local repaired=$(do_facet mds2 $LCTL get_param -n \
5192 mdd.$(facet_svc mds2).lfsck_namespace |
5193 awk '/^striped_shards_repaired/ { print $2 }')
5194 [ $repaired -eq 1 ] ||
5195 error "(4) Fail to re-generate slave LMV EA: $repaired"
5197 rmdir $DIR/$tdir/striped_dir ||
5198 error "(5) Fail to remove the striped directory after LFSCK"
5200 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Test 31g (labelled by the run_test line that ends this block): a striped
# directory created under fail_loc 0x162b / fail_val=0 must have one shard
# repaired by the namespace LFSCK and be usable afterwards.
# NOTE(review): the embedded listing numbers are not contiguous here, so some
# lines of the test (header, closers, short statements) are presumably missing
# from this view -- confirm against the complete script.
# Skip with fewer than 2 MDTs or when MDS1 is not newer than 2.6.50.
5203 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5204 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5205 skip "MDS older than 2.6.50, LU-5519"
5208 echo "For some reason, the stripe index in the slave LMV EA is"
5209 echo "corrupted. The LFSCK should repair the slave LMV EA."
5212 check_mount_and_prep
5214 echo "Inject failure stub on MDT0 to simulate the case that the"
5215 echo "slave LMV EA on the first shard of the striped directory"
5216 echo "claims the same index as the second shard claims"
# The fail_loc is active on $SINGLEMDS only while the striped directory is
# created.
5218 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
5219 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
5220 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5221 error "(1) Fail to create striped directory"
5222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
# Run the namespace LFSCK with -r -A, then call wait_all_targets_blocked for
# the 'completed' status.
5224 echo "Trigger namespace LFSCK to repair the slave LMV EA"
5225 $START_NAMESPACE -r -A ||
5226 error "(2) Fail to start LFSCK for namespace"
5228 wait_all_targets_blocked namespace completed 3
# Exactly one striped_shards_repaired is expected.
5230 local repaired=$($SHOW_NAMESPACE |
5231 awk '/^striped_shards_repaired/ { print $2 }')
5232 [ $repaired -eq 1 ] ||
5233 error "(4) Fail to repair slave LMV EA: $repaired"
# After a client remount, create/unlink inside the directory and its removal
# must all succeed.
5235 umount_client $MOUNT || error "(5) umount failed"
5236 mount_client $MOUNT || error "(6) mount failed"
5238 touch $DIR/$tdir/striped_dir/foo ||
5239 error "(7) Fail to touch file after the LFSCK"
5241 rm -f $DIR/$tdir/striped_dir/foo ||
5242 error "(8) Fail to unlink file after the LFSCK"
5244 rmdir $DIR/$tdir/striped_dir ||
5245 error "(9) Fail to remove the striped directory after LFSCK"
5247 run_test 31g "Repair the corrupted slave LMV EA"
# Test 31h (labelled by the run_test line that ends this block): a striped
# directory created under fail_loc 0x162c / fail_val=0 must have one dirent
# repaired by the namespace LFSCK and be usable afterwards.
# NOTE(review): the embedded listing numbers are not contiguous here, so some
# lines of the test (header, closers, short statements) are presumably missing
# from this view -- confirm against the complete script.
# Skip with fewer than 2 MDTs or when MDS1 is not newer than 2.6.50.
5250 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5251 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5252 skip "MDS older than 2.6.50, LU-5519"
5255 echo "For some reason, the shard's name entry in the striped"
5256 echo "directory may be corrupted. The LFSCK should repair the"
5257 echo "bad shard's name entry."
5260 check_mount_and_prep
5262 echo "Inject failure stub on MDT0 to simulate the case that the"
5263 echo "first shard's name entry in the striped directory claims"
5264 echo "the same index as the second shard's name entry claims."
# The fail_loc is active on $SINGLEMDS only while the striped directory is
# created.
5266 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
5267 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5268 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5269 error "(1) Fail to create striped directory"
5270 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
# Run the namespace LFSCK with -r -A, then call wait_all_targets_blocked for
# the 'completed' status.
5272 echo "Trigger namespace LFSCK to repair the shard's name entry"
5273 $START_NAMESPACE -r -A ||
5274 error "(2) Fail to start LFSCK for namespace"
5276 wait_all_targets_blocked namespace completed 3
# Exactly one dirent_repaired is expected.
5278 local repaired=$($SHOW_NAMESPACE |
5279 awk '/^dirent_repaired/ { print $2 }')
5280 [ $repaired -eq 1 ] ||
5281 error "(4) Fail to repair shard's name entry: $repaired"
# After a client remount, create/unlink inside the directory and its removal
# must all succeed.
5283 umount_client $MOUNT || error "(5) umount failed"
5284 mount_client $MOUNT || error "(6) mount failed"
5286 touch $DIR/$tdir/striped_dir/foo ||
5287 error "(7) Fail to touch file after the LFSCK"
5289 rm -f $DIR/$tdir/striped_dir/foo ||
5290 error "(8) Fail to unlink file after the LFSCK"
5292 rmdir $DIR/$tdir/striped_dir ||
5293 error "(9) Fail to remove the striped directory after LFSCK"
5295 run_test 31h "Repair the corrupted shard's name entry"
# Test 32a (labelled by the run_test line that ends this block): the layout
# LFSCK must be stoppable after ost1 has been stopped while the scan is still
# in scanning-phase1.
# NOTE(review): the embedded listing numbers are not contiguous here, so some
# lines of the test (header, closers, short statements) are presumably missing
# from this view -- confirm against the complete script.
5300 umount_client $MOUNT
# fail_loc 0x162d with fail_val=3 is set on $SINGLEMDS before the layout
# LFSCK is started with -r.
5302 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5303 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5304 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
# The status line of lfsck_layout must read scanning-phase1 at this point.
5306 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5307 [ "$STATUS" == "scanning-phase1" ] ||
5308 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
# Stop ost1 while the LFSCK is running, then clear the fail_loc.
5311 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
5313 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# lfsck_stop must succeed although ost1 is down; ost1 is restarted afterwards
# with the MOUNT_OPTS_NOSCRUB options.
5317 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
5319 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5320 error "(5) Fail to start ost1"
5322 run_test 32a "stop LFSCK when some OST failed"
# Test 32b (labelled by the run_test line that ends this block): the
# namespace LFSCK must be stoppable after mds2 has been stopped while the scan
# is still in scanning-phase1.
# NOTE(review): the embedded listing numbers are not contiguous here, so some
# lines of the test (header, closers, short statements) are presumably missing
# from this view -- confirm against the complete script.
# Skip with fewer than 2 MDTs.
5326 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# dp is created with -i 1; dc1 and dc2 below it are striped with
# -i 0 -c $MDSCOUNT.
5329 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5330 error "(1) Fail to create $DIR/$tdir/dp"
5331 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5332 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5333 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5334 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5335 umount_client $MOUNT
# fail_loc 0x162d with fail_val=3 is set on $SINGLEMDS before the namespace
# LFSCK is started with -r -A.
5337 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5338 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5339 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
# wait_update_facet is asked for status "scanning-phase1" on $SINGLEMDS
# (32 is presumably the wait limit in seconds -- confirm in its definition).
5341 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5342 mdd.${MDT_DEV}.lfsck_namespace |
5343 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5345 error "(5) unexpected status"
# Stop mds2 while the LFSCK is running, then clear the fail_loc.
5349 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
5351 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# lfsck_stop must succeed although mds2 is down; mds2 is restarted afterwards
# with the MOUNT_OPTS_NOSCRUB options.
5355 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
5357 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5358 error "(8) Fail to start MDT2"
5360 run_test 32b "stop LFSCK when some MDT failed"
# Test 33 (labelled by the run_test line that ends this block): checks the
# "param" line reported by the layout and namespace LFSCK after they were
# started with specific command-line options.
# NOTE(review): the embedded listing numbers are not contiguous here, so some
# lines of the test (header, closers, short statements) are presumably missing
# from this view -- confirm against the complete script.
# Layout LFSCK started with --dryrun -o -r must report
# "dryrun,all_targets,orphan".
5366 $START_LAYOUT --dryrun -o -r ||
5367 error "(1) Fail to start layout LFSCK"
5368 wait_all_targets_blocked layout completed 2
5370 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5371 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5372 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# Namespace LFSCK started with -e abort -A -r must report
# "failout,all_targets".
5374 $START_NAMESPACE -e abort -A -r ||
5375 error "(4) Fail to start namespace LFSCK"
5376 wait_all_targets_blocked namespace completed 5
5378 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5379 [ "$PARAMS" == "failout,all_targets" ] ||
5380 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5382 run_test 33 "check LFSCK paramters"
# Test 34 (labelled by the run_test line that ends this block): a directory
# created under fail_loc 0x1630 must lead to exactly one dirent repair in the
# first namespace LFSCK run and none in the second.
# NOTE(review): the embedded listing numbers are not contiguous here, so some
# lines of the test (header, closers, short statements) are presumably missing
# from this view -- confirm against the complete script.
# Skip with fewer than 2 MDTs or when MDS1's backend is not zfs.
5386 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5387 [ "$mds1_FSTYPE" != zfs ] && skip "Only valid for ZFS backend"
# The fail_loc is active on $SINGLEMDS only while dummy is created with -i 1.
5391 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
5392 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5393 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5394 error "(1) Fail to create $DIR/$tdir/dummy"
5396 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# First run: start with -r only and wait for status "completed" on
# $SINGLEMDS (32 is presumably the wait limit in seconds -- confirm).
5397 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
5398 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5399 mdd.${MDT_DEV}.lfsck_namespace |
5400 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5402 error "(3) unexpected status"
# Exactly one dirent_repaired is expected after the first run.
5405 local repaired=$($SHOW_NAMESPACE |
5406 awk '/^dirent_repaired/ { print $2 }')
5407 [ $repaired -eq 1 ] ||
5408 error "(4) Fail to repair the lost agent object: $repaired"
# Second run: nothing may be left to repair.
5410 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5411 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5412 mdd.${MDT_DEV}.lfsck_namespace |
5413 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5415 error "(6) unexpected status"
5418 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5419 [ $repaired -eq 0 ] ||
5420 error "(7) Unexpected repairing: $repaired"
5422 run_test 34 "LFSCK can rebuild the lost agent object"
# Test 35 (labelled by the run_test line that ends this block): a directory
# created under fail_loc 0x1631 on mds2 must lead to exactly one agent entry
# repair in the first namespace LFSCK run and none in the second.
# NOTE(review): the embedded listing numbers are not contiguous here, so some
# lines of the test (header, closers, short statements) are presumably missing
# from this view -- confirm against the complete script.
# Skip with fewer than 2 MDTs.
5426 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# The fail_loc is active on mds2 only while dummy is created with -i 1.
5430 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
5431 do_facet mds2 $LCTL set_param fail_loc=0x1631
5432 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5433 error "(1) Fail to create $DIR/$tdir/dummy"
5436 do_facet mds2 $LCTL set_param fail_loc=0
# First run: start with -A -r and wait for status "completed" on mds2.
# The failure message names MDS2 explicitly: this test has no loop variable
# k, so the former "MDS${k}" expanded to an empty or stale index.
5437 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
5438 wait_update_facet mds2 "$LCTL get_param -n \
5439 mdd.$(facet_svc mds2).lfsck_namespace |
5440 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5441 error "(3) MDS2 is not the expected 'completed'"
# Exactly one agent_entries_repaired is expected on mds2 after the first run.
5443 local repaired=$(do_facet mds2 $LCTL get_param -n \
5444 mdd.$(facet_svc mds2).lfsck_namespace |
5445 awk '/^agent_entries_repaired/ { print $2 }')
5446 [ $repaired -eq 1 ] ||
5447 error "(4) Fail to repair the lost agent entry: $repaired"
5449 echo "stopall to cleanup object cache"
5452 setupall > /dev/null
# Second run after the restart: nothing may be left to repair.
5454 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
5455 wait_update_facet mds2 "$LCTL get_param -n \
5456 mdd.$(facet_svc mds2).lfsck_namespace |
5457 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5458 error "(6) MDS2 is not the expected 'completed'"
5460 repaired=$(do_facet mds2 $LCTL get_param -n \
5461 mdd.$(facet_svc mds2).lfsck_namespace |
5462 awk '/^agent_entries_repaired/ { print $2 }')
5463 [ $repaired -eq 0 ] ||
5464 error "(7) Unexpected repairing: $repaired"
5466 run_test 35 "LFSCK can rebuild the lost agent entry"
# Test 36a (labelled by the run_test line that ends this block): three
# mirrored files each lose one mirror via 'lfs mirror split -d' under
# fail_loc 0x1616; the layout LFSCK (-r -o) must restore the third mirror of
# every file.
# NOTE(review): the embedded listing numbers are not contiguous here, so some
# lines of the test (header, closers, short statements) are presumably missing
# from this view -- confirm against the complete script.
# Skip with fewer than 3 OSTs.
5469 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5472 echo "The target MDT-object's LOV EA corrupted as to lose one of the "
5473 echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
5474 echo "with the PFID EA of related OST-object(s) belong to the mirror."
5477 check_mount_and_prep
# Print grant parameters now and register the same diagnostics (plus lfs df)
# through stack_trap.
5481 lctl get_param osc.*.*grant*
5482 stack_trap "lfs df $DIR; lfs df -i $DIR; lctl get_param osc.*.*grant*"
# f0, f1 and f2 are created with the same three-mirror (-N) layout, two
# components (-E) per mirror, on explicitly listed OSTs (-o).
5484 $LFS setstripe -N -E 2M -S1M -o 0,1 -E -1 -o 2 \
5485 -N -E 2M -S1M -o 1,2 -E -1 -o 0 \
5486 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5487 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5488 $LFS setstripe -N -E 2M -S1M -o 0,1 -E -1 -o 2 \
5489 -N -E 2M -S1M -o 1,2 -E -1 -o 0 \
5490 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
5491 error "(1) Fail to create mirror file $DIR/$tdir/f1"
5492 $LFS setstripe -N -E 2M -S1M -o 0,1 -E -1 -o 2 \
5493 -N -E 2M -S1M -o 1,2 -E -1 -o 0 \
5494 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
5495 error "(2) Fail to create mirror file $DIR/$tdir/f2"
# Write 4 MiB to every file, then resync the mirrors of every file.
5497 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5498 error "(3) Fail to write $DIR/$tdir/f0"
5499 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
5500 error "(4) Fail to write $DIR/$tdir/f1"
5501 dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
5502 error "(5) Fail to write $DIR/$tdir/f2"
5504 $LFS mirror resync $DIR/$tdir/f0 ||
5505 error "(6) Fail to resync $DIR/$tdir/f0"
5506 $LFS mirror resync $DIR/$tdir/f1 ||
5507 error "(7) Fail to resync $DIR/$tdir/f1"
5508 $LFS mirror resync $DIR/$tdir/f2 ||
5509 error "(8) Fail to resync $DIR/$tdir/f2"
# Drop cached mdc and osc locks, then print the layouts.
5511 cancel_lru_locks mdc
5512 cancel_lru_locks osc
5514 $LFS getstripe $DIR/$tdir/f0 ||
5515 error "(9) Fail to getstripe for $DIR/$tdir/f0"
5516 $LFS getstripe $DIR/$tdir/f1 ||
5517 error "(10) Fail to getstripe for $DIR/$tdir/f1"
5518 $LFS getstripe $DIR/$tdir/f2 ||
5519 error "(11) Fail to getstripe for $DIR/$tdir/f2"
# With fail_loc 0x1616 set on mds1, split mirror 1 from f0, mirror 2 from f1
# and mirror 3 from f2, each with -d.
5521 echo "Inject failure, to simulate the case of missing one mirror in LOV"
5522 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5523 do_facet mds1 $LCTL set_param fail_loc=0x1616
5525 $LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
5526 error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
5527 $LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
5528 error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
5529 $LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
5530 error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"
5534 do_facet mds1 $LCTL set_param fail_loc=0
# The split mirror id must no longer appear in each file's layout, and every
# file must now report two mirrors.
5536 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
5537 error "(15) The 1st of mirror is not destroyed"
5538 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
5539 error "(16) The 2nd of mirror is not destroyed"
5540 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
5541 error "(17) The 3rd of mirror is not destroyed"
5545 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5546 [ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
5547 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5548 [ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
5549 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5550 [ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"
# Start the layout LFSCK with -r -o, wait for every MDS to report
# "completed", then require the same status from every OST.
5552 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5553 $START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"
5555 for k in $(seq $MDSCOUNT); do
5556 # The LFSCK status query internal is 30 seconds. For the case
5557 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5558 # time to guarantee the status sync up.
5559 wait_update_facet mds${k} "$LCTL get_param -n \
5560 mdd.$(facet_svc mds${k}).lfsck_layout |
5561 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5562 error "(22) MDS${k} is not the expected 'completed'"
5565 for k in $(seq $OSTCOUNT); do
5566 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5567 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5568 awk '/^status/ { print $2 }')
5569 [ "$cur_status" == "completed" ] ||
5570 error "(23) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report exactly 9 repaired_orphan.
5573 local repaired=$(do_facet mds1 $LCTL get_param -n \
5574 mdd.$(facet_svc mds1).lfsck_layout |
5575 awk '/^repaired_orphan/ { print $2 }')
5576 [ $repaired -eq 9 ] ||
5577 error "(24) Expect 9 fixed on mds1, but got: $repaired"
# Every file must report three mirrors again, and the mirror id that was
# split off must be back in its layout (the layout is dumped on failure).
5579 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5580 [ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
5581 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5582 [ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
5583 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5584 [ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"
5586 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
5587 $LFS getstripe $DIR/$tdir/f0
5588 error "(28) The 1st of mirror is not recovered"
5591 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
5592 $LFS getstripe $DIR/$tdir/f1
5593 error "(29) The 2nd of mirror is not recovered"
5596 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
5597 $LFS getstripe $DIR/$tdir/f2
5598 error "(30) The 3rd of mirror is not recovered"
5601 run_test 36a "rebuild LOV EA for mirrored file (1)"
# Test 36b (labelled by the run_test line that ends this block): a mirrored
# file is removed under fail_loc 0x1616; the layout LFSCK (-r -o) must
# rebuild it under .lustre/lost+found/MDT0000/ with all three mirrors.
# NOTE(review): the embedded listing numbers are not contiguous here, so some
# lines of the test (header, closers, short statements) are presumably missing
# from this view -- confirm against the complete script.
# Skip when FILESET is set or with fewer than 3 OSTs.
5604 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5605 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5608 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5609 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5610 echo "with the PFID EA of related OST-object(s) belong to the file. "
5613 check_mount_and_prep
# f0 is created with three mirrors (-N), two components (-E) per mirror, on
# explicitly listed OSTs (-o).
5615 $LFS setstripe -N -E 2M -S1M -o 0,1 -E -1 -o 2 \
5616 -N -E 2M -S1M -o 1,2 -E -1 -o 0 \
5617 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5618 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# The FID is saved because it forms the expected lost+found name below.
5620 local fid=$($LFS path2fid $DIR/$tdir/f0)
# Write 4 MiB, resync the mirrors, drop cached locks and print the layout.
5622 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5623 error "(1) Fail to write $DIR/$tdir/f0"
5624 $LFS mirror resync $DIR/$tdir/f0 ||
5625 error "(2) Fail to resync $DIR/$tdir/f0"
5627 cancel_lru_locks mdc
5628 cancel_lru_locks osc
5630 $LFS getstripe $DIR/$tdir/f0 ||
5631 error "(3) Fail to getstripe for $DIR/$tdir/f0"
# f0 is removed while fail_loc 0x1616 is set on mds1.
5633 echo "Inject failure, to simulate the case of missing the MDT-object"
5634 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5635 do_facet mds1 $LCTL set_param fail_loc=0x1616
5636 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
5640 do_facet mds1 $LCTL set_param fail_loc=0
# Start the layout LFSCK with -r -o, wait for every MDS to report
# "completed", then require the same status from every OST.
5642 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5643 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
5645 for k in $(seq $MDSCOUNT); do
5646 # The LFSCK status query internal is 30 seconds. For the case
5647 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5648 # time to guarantee the status sync up.
5649 wait_update_facet mds${k} "$LCTL get_param -n \
5650 mdd.$(facet_svc mds${k}).lfsck_layout |
5651 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5652 error "(6) MDS${k} is not the expected 'completed'"
5655 for k in $(seq $OSTCOUNT); do
5656 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5657 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5658 awk '/^status/ { print $2 }')
5659 [ "$cur_status" == "completed" ] ||
5660 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report exactly 9 repaired_orphan.
5663 local count=$(do_facet mds1 $LCTL get_param -n \
5664 mdd.$(facet_svc mds1).lfsck_layout |
5665 awk '/^repaired_orphan/ { print $2 }')
5666 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# The rebuilt file is looked up as <fid>-R-0 under lost+found/MDT0000 and
# must report 3 mirrors and 6 components.
5668 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5669 count=$($LFS getstripe --mirror-count $name)
5670 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5672 count=$($LFS getstripe --component-count $name)
5673 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
# Mirror ids 1, 2 and 3 must all appear in the rebuilt layout (the layout is
# dumped on failure).
5675 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5676 $LFS getstripe $name
5677 error "(11) The 1st of mirror is not recovered"
5680 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5681 $LFS getstripe $name
5682 error "(12) The 2nd of mirror is not recovered"
5685 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5686 $LFS getstripe $name
5687 error "(13) The 3rd of mirror is not recovered"
5690 run_test 36b "rebuild LOV EA for mirrored file (2)"
# test_36c: LFSCK rebuilds the LOV EA of a mirrored file whose mirrors were
# not in sync when its MDT-object got lost.
#
# A two-mirror file is written and resynced, then written once more without
# a resync. With fail_loc 0x1616 set on mds1 the file is removed, layout
# LFSCK is started with "-r -o", and the object expected under
# .lustre/lost+found/MDT0000/<fid>-R-0 must report 2 mirrors, 4 components
# and the same lcme_flags (taken from the first and the last 10 lines of the
# getstripe output) that were recorded before the removal.
test_36c() {
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"
	if [ $OSTCOUNT -lt 3 ]; then
		skip "needs >= 3 OSTs" && return
	fi

	echo "The mirrored file has been modified, not resynced yet, then "
	echo "lost its MDT-object, but relatd OST-objects are still there. "
	echo "The layout LFSCK should rebuild the LOV EA and relatd status "
	echo "with the PFID EA of related OST-object(s) belong to the file. "

	check_mount_and_prep

	local victim=$DIR/$tdir/f0
	local fid

	if ! $LFS setstripe -N -E 2M -S1M -o 0,1 -E -1 -o 2 \
		-N -E 2M -S1M -o 1,2 -E -1 -o 0 $victim; then
		error "(0) Fail to create mirror file $DIR/$tdir/f0"
	fi

	fid=$($LFS path2fid $victim)

	# First write + resync: all related OST-objects have been written
	if ! dd if=/dev/zero of=$victim bs=1M count=4; then
		error "(1.1) Fail to write $DIR/$tdir/f0"
	fi
	if ! $LFS mirror resync $victim; then
		error "(1.2) Fail to resync $DIR/$tdir/f0"
	fi
	# Second write, no resync: one mirror becomes stale
	if ! dd if=/dev/zero of=$victim bs=1M count=4; then
		error "(1.3) Fail to write $DIR/$tdir/f0"
	fi

	cancel_lru_locks mdc
	cancel_lru_locks osc

	if ! $LFS getstripe $victim; then
		error "(2) Fail to getstripe for $DIR/$tdir/f0"
	fi

	# Component flags as seen at both ends of the layout dump
	local saved_flags1=$($LFS getstripe $victim | head -n 10 |
		awk '/lcme_flags/ { print $2 }')
	local saved_flags2=$($LFS getstripe $victim | tail -n 10 |
		awk '/lcme_flags/ { print $2 }')

	echo "Inject failure, to simulate the case of missing the MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616
	rm -f $victim || error "(3) Fail to remove $DIR/$tdir/f0"

	do_facet mds1 $LCTL set_param fail_loc=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds, so wait a
		# little longer than that to let the status sync up even if
		# some LFSCK_NOTIFY RPCs failed or were lost.
		local query="$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'"

		wait_update_facet mds${k} "$query" "completed" 32 ||
			error "(5) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
			obdfilter.$(facet_svc ost${k}).lfsck_layout |
			awk '/^status/ { print $2 }')

		if [ "$cur_status" != "completed" ]; then
			error "(6) OST${k} Expect 'completed', but got '$cur_status'"
		fi
	done

	local count=$(do_facet mds1 $LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_layout |
		awk '/^repaired_orphan/ { print $2 }')
	[ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"

	# Where the re-created file is looked for
	local recovered=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0

	count=$($LFS getstripe --mirror-count $recovered)
	[ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"

	count=$($LFS getstripe --component-count $recovered)
	[ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"

	local flags=$($LFS getstripe $recovered | head -n 10 |
		awk '/lcme_flags/ { print $2 }')
	if [ "$flags" != "$saved_flags1" ]; then
		$LFS getstripe $recovered
		error "(10) expect flags $saved_flags1, got $flags"
	fi

	flags=$($LFS getstripe $recovered | tail -n 10 |
		awk '/lcme_flags/ { print $2 }')
	if [ "$flags" != "$saved_flags2" ]; then
		$LFS getstripe $recovered
		error "(11) expect flags $saved_flags2, got $flags"
	fi
}
run_test 36c "rebuild LOV EA for mirrored file (3)"
# test_37: create a directory on MDT0, park a background multiop (D_c) on
# it, start namespace LFSCK with "-r -A" and call
# "wait_all_targets_blocked namespace completed 4".
#
# NOTE(review): $PID is consumed below but no assignment to it is visible in
# this part of the file - presumably it holds the pid of the paused multiop;
# confirm before relying on it.
test_37() {
	local t_dir="$DIR/$tdir/d0"
	check_mount_and_prep

	$LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
	multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }

	# Release the paused multiop BEFORE reporting the failure: if error()
	# terminates the test, a kill placed after it never runs and the
	# background process is left behind.
	$START_NAMESPACE -r -A || {
		kill -USR1 $PID
		error "(3) Fail to start LFSCK for namespace!"
	}

	wait_all_targets_blocked namespace completed 4
}
run_test 37 "LFSCK must skip a ORPHAN"
# test_38: LFSCK and a "foreign" file must not disturb each other.
#
# Creates a file with a foreign LOV EA (type none, flags 0xda05, value
# "<uuid1>@<uuid2>") and checks the lfm_* fields printed by "lfs getstripe".
# Namespace LFSCK and then layout LFSCK are started with -r/-A; after each
# one the matching *_repaired value from "lctl lfsck_query" must be 0. The EA
# checks are then repeated, restriping/reading/writing the file must fail,
# and the file is removed.
test_38() {
	if [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]]; then
		skip "Need MDS version newer than 2.12.51"
	fi

	test_mkdir $DIR/$tdir
	local uuid1=$(cat /proc/sys/kernel/random/uuid)
	local uuid2=$(cat /proc/sys/kernel/random/uuid)
	local foreign=$DIR/$tdir/$tfile
	local repaired

	# create foreign file
	if ! $LFS setstripe --foreign=none --flags 0xda05 \
		-x "${uuid1}@${uuid2}" $foreign; then
		error "$DIR/$tdir/$tfile: create failed"
	fi

	if ! $LFS getstripe -v $foreign | grep "lfm_magic:.*0x0BD70BD0"; then
		error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
	fi
	# lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
	if ! $LFS getstripe -v $foreign | grep "lfm_length:.*73"; then
		error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
	fi
	if ! $LFS getstripe -v $foreign | grep "lfm_type:.*none"; then
		error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
	fi
	if ! $LFS getstripe -v $foreign | grep "lfm_flags:.*0x0000DA05"; then
		error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
	fi
	if ! $LFS getstripe $foreign | grep "lfm_value:.*${uuid1}@${uuid2}"; then
		error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
	fi

	# modify striping should fail
	if $LFS setstripe -c 2 $foreign; then
		error "$DIR/$tdir/$tfile: setstripe should fail"
	fi

	$START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 1

	# the "global" namespace_repaired value has to be 0
	repaired=$(do_facet mds1 \
		"$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
		 awk '/^namespace_repaired/ { print \\\$2 }'")
	[ $repaired -eq 0 ] ||
		error "(2) Expect no namespace repair, but got: $repaired"

	$START_LAYOUT -A -r || error "Fail to start LFSCK for layout"

	wait_all_targets_blocked layout completed 2

	# the "global" layout_repaired value has to be 0
	repaired=$(do_facet mds1 \
		"$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
		 awk '/^layout_repaired/ { print \\\$2 }'")
	[ $repaired -eq 0 ] ||
		error "(2) Expect no layout repair, but got: $repaired"

	echo "post-lfsck checks of foreign file"

	if ! $LFS getstripe -v $foreign | grep "lfm_magic:.*0x0BD70BD0"; then
		error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
	fi
	# lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
	if ! $LFS getstripe -v $foreign | grep "lfm_length:.*73"; then
		error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
	fi
	if ! $LFS getstripe -v $foreign | grep "lfm_type:.*none"; then
		error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
	fi
	if ! $LFS getstripe -v $foreign | grep "lfm_flags:.*0x0000DA05"; then
		error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
	fi
	if ! $LFS getstripe $foreign | grep "lfm_value:.*${uuid1}@${uuid2}"; then
		error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
	fi

	# modify striping should fail
	if $LFS setstripe -c 2 $foreign; then
		error "$DIR/$tdir/$tfile: setstripe should fail"
	fi

	# neither reading nor writing is expected to work
	cat $foreign && error "$DIR/$tdir/$tfile: read should fail"
	cat /etc/passwd > $foreign &&
		error "$DIR/$tdir/$tfile: write should fail"

	#remove foreign file
	rm $foreign ||
		error "$DIR/$tdir/$tfile: remove of foreign file has failed"
}
run_test 38 "LFSCK does not break foreign file and reverse is also true"
# test_39: LFSCK and a "foreign" directory must not disturb each other.
#
# A directory with a foreign LMV EA (type none, flags 0xda05, value
# "<uuid1>@<uuid2>") is created and the lfm_* fields printed by
# "lfs getdirstripe" are checked. Creating a file inside it has to fail,
# while chmod and chown have to succeed. Namespace LFSCK and then layout
# LFSCK are started with -r/-A; after each one the matching *_repaired value
# from "lctl lfsck_query" must be 0. The same checks are then repeated and
# the directory is removed with rmdir.
#
# Fix: both "file create should fail" checks were missing the error call,
# so a successful create only tried to execute the message as a command
# and the test never failed on it.
test_39() {
	[[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
		skip "Need MDS version newer than 2.12.51"

	test_mkdir $DIR/$tdir
	local uuid1=$(cat /proc/sys/kernel/random/uuid)
	local uuid2=$(cat /proc/sys/kernel/random/uuid)
	local repaired

	# create foreign dir
	$LFS mkdir --foreign=none --xattr="${uuid1}@${uuid2}" --flags=0xda05 \
		$DIR/$tdir/${tdir}2 ||
		error "$DIR/$tdir/${tdir}2: create failed"

	$LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
		grep "lfm_magic:.*0x0CD50CD0" ||
		error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
	# lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
	# - sizeof(lfm_type) - sizeof(lfm_flags)
	$LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
		error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
	$LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
		error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
	$LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
		grep "lfm_flags:.*0x0000DA05" ||
		error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
	$LFS getdirstripe $DIR/$tdir/${tdir}2 |
		grep "lfm_value.*${uuid1}@${uuid2}" ||
		error "$DIR/$tdir/${tdir}2: invalid LMV EA value"

	# file create in dir should fail
	touch $DIR/$tdir/${tdir}2/$tfile &&
		error "$DIR/${tdir}2: file create should fail"

	# changing the mode of the foreign dir has to work
	chmod 777 $DIR/$tdir/${tdir}2 ||
		error "$DIR/${tdir}2: chmod failed"

	# changing the owner of the foreign dir has to work
	chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
		error "$DIR/${tdir}2: chown failed"

	$START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 1

	# check that "global" namespace_repaired == 0 !!!
	repaired=$(do_facet mds1 \
		"$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
		 awk '/^namespace_repaired/ { print \\\$2 }'")
	[ $repaired -eq 0 ] ||
		error "(2) Expect nothing to be repaired, but got: $repaired"

	$START_LAYOUT -A -r || error "Fail to start LFSCK for layout"

	wait_all_targets_blocked layout completed 2

	# check that "global" layout_repaired == 0 !!!
	repaired=$(do_facet mds1 \
		"$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
		 awk '/^layout_repaired/ { print \\\$2 }'")
	[ $repaired -eq 0 ] ||
		error "(2) Expect no layout repair, but got: $repaired"

	echo "post-lfsck checks of foreign dir"

	$LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
		grep "lfm_magic:.*0x0CD50CD0" ||
		error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
	# lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
	# - sizeof(lfm_type) - sizeof(lfm_flags)
	$LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
		error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
	$LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
		error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
	$LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
		grep "lfm_flags:.*0x0000DA05" ||
		error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
	$LFS getdirstripe $DIR/$tdir/${tdir}2 |
		grep "lfm_value.*${uuid1}@${uuid2}" ||
		error "$DIR/$tdir/${tdir}2: invalid LMV EA value"

	# file create in dir should fail
	touch $DIR/$tdir/${tdir}2/$tfile &&
		error "$DIR/${tdir}2: file create should fail"

	# changing the mode of the foreign dir has to work
	chmod 777 $DIR/$tdir/${tdir}2 ||
		error "$DIR/${tdir}2: chmod failed"

	# changing the owner of the foreign dir has to work
	chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
		error "$DIR/${tdir}2: chown failed"

	# finally the foreign dir has to be removable
	rmdir $DIR/$tdir/${tdir}2 ||
		error "$DIR/$tdir/${tdir}2: remove of foreign dir has failed"
}
run_test 39 "LFSCK does not break foreign dir and reverse is also true"
# test_40a: the composite layout of a file must be unchanged after its
# parent directory is migrated from MDT1 to MDT0 and layout LFSCK is run.
#
# dir1 is created on MDT index 1 with a three-component default layout, a
# file f1 is created in it and its layout parameters are recorded. dir1 is
# then migrated to MDT index 0, layout LFSCK is started with -r on
# ${MDT_DEV}, and once it reports "completed" the layout parameters of f1
# must equal the recorded ones.
#
# Every preparation step is checked: without that, a failed mkdir, setstripe,
# touch, migrate or lfsck_start still let the final comparison pass although
# nothing had been exercised.
test_40a() {
	[[ $MDSCOUNT -ge 2 ]] || skip "needs >= 2 MDTs"

	check_mount_and_prep
	$LFS mkdir -i 1 $DIR/$tdir/dir1 ||
		error "(0) Fail to mkdir $DIR/$tdir/dir1 on MDT1"
	$LFS setstripe -E 1M -c1 -S 1M -E 128M -c2 -S 4M -E eof \
		$DIR/$tdir/dir1 ||
		error "(0) Fail to setstripe $DIR/$tdir/dir1"

	touch $DIR/$tdir/dir1/f1 ||
		error "(0) Fail to create $DIR/$tdir/dir1/f1"
	local layout1=$(get_layout_param $DIR/$tdir/dir1/f1)

	echo "Migrate $DIR/$tdir/dir1 from MDT1 to MDT0"
	$LFS migrate -m 0 $DIR/$tdir/dir1 ||
		error "(1) Fail to migrate $DIR/$tdir/dir1 to MDT0"

	echo "trigger LFSCK for layout"
	do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -t layout -r ||
		error "(1) Fail to start LFSCK for layout!"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_layout |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# show the LFSCK layout state to help diagnose the failure
		$SHOW_LAYOUT
		error "(2) unexpected status"
	}

	local layout2=$(get_layout_param $DIR/$tdir/dir1/f1)

	[[ "$layout1" == "$layout2" ]] || error "layout lost after lfsck"
}
run_test 40a "LFSCK correctly fixes lmm_oi in composite layout"
# test_41: LFSCK must accept a file created with "-z" (extension size)
# components.
#
# The debug mask on $SINGLEMDS gets +lfsck, a file with two "-z" components
# is created, the debug log is drained, and LFSCK is started with
# "-A -t all -r -n on". After the layout LFSCK reports "completed", the
# debug log must not contain "lfsck_layout_verify_header".
#
# The saved debug mask is restored through stack_trap, so it is also put
# back when one of the checks below fails; previously the restore was the
# last statement and was skipped on any error.
test_41() {
	local old_debug=$(do_facet $SINGLEMDS $LCTL get_param -n debug)

	# The mask may hold several space-separated words, hence the inner
	# quoting: do_facet has to receive the whole command as one argument.
	stack_trap "do_facet $SINGLEMDS \
		\"$LCTL set_param debug='$old_debug'\"" EXIT

	do_facet $SINGLEMDS $LCTL set_param debug=+lfsck
	$LFS setstripe -E 1G -z 64M -E -1 -z 128M $DIR/$tfile
	# drain what is already in the debug log
	do_facet $SINGLEMDS $LCTL dk > /dev/null

	echo "trigger LFSCK for SEL layout"
	do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -A -t all -r -n on
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_layout |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# show the LFSCK layout state to help diagnose the failure
		$SHOW_LAYOUT
		error "(2) unexpected status"
	}

	local errors=$(do_facet $SINGLEMDS $LCTL dk |
		grep "lfsck_layout_verify_header")

	[[ "x$errors" == "x" ]] || {
		echo "$errors"
		error "lfsck failed"
	}
}
run_test 41 "SEL support in LFSCK"
6057 local mode='\x00\x00\x00\x00'
6058 local raw="$(printf ""\\\\x%02x"" {0..63})"
6062 [[ $(lscpu) =~ Byte\ Order.*Little ]] && size='\x40\x00\x00\x00' ||
6063 size='\x00\x00\x00\x40'
6064 key="${mode}${raw}${size}"
6065 echo -n -e "${key}" | keyctl padd logon fscrypt:4242424242424242 @s
6070 sync ; echo 3 > /proc/sys/vm/drop_caches
6077 $LCTL set_param -n ldlm.namespaces.*.lru_size=clear
6078 sync ; echo 3 > /proc/sys/vm/drop_caches
6079 dummy_key=$(keyctl show | awk '$7 ~ "^fscrypt:" {print $1}')
6080 if [ -n "$dummy_key" ]; then
6081 keyctl revoke $dummy_key
# Remount the client without the dummy encryption key.
#
# $MOUNT is unmounted when it is currently mounted and then mounted with
# ${MOUNT_OPTS}. $MOUNT2 is unmounted when it is currently mounted, and is
# mounted with ${MOUNT_OPTS} when $MOUNT_2 is non-empty. Every failing step
# is reported through error().
remount_client_normally() {
	if is_mounted $MOUNT && ! umount_client $MOUNT; then
		error "umount $MOUNT failed"
	fi
	if ! mount_client $MOUNT ${MOUNT_OPTS}; then
		error "remount failed"
	fi

	if is_mounted $MOUNT2 && ! umount_client $MOUNT2; then
		error "umount $MOUNT2 failed"
	fi
	if [ -n "$MOUNT_2" ] && ! mount_client $MOUNT2 ${MOUNT_OPTS}; then
		error "remount failed"
	fi
}
# Remount the client with the dummy encryption key: $MOUNT is unmounted when
# it is currently mounted, then mounted with
# "${MOUNT_OPTS},test_dummy_encryption". Failures go through error().
remount_client_dummykey() {
	if is_mounted $MOUNT && ! umount_client $MOUNT; then
		error "umount $MOUNT failed"
	fi
	if ! mount_client $MOUNT ${MOUNT_OPTS},test_dummy_encryption; then
		error "remount failed"
	fi
}
# Prepare the client for the encryption tests: remove $DIR/[df][0-9]*,
# remount $MOUNT with the test_dummy_encryption option, then create
# $DIR/$tdir as a single-stripe directory on index 0 and give it a
# single-stripe file layout on index 0.
setup_for_enc_tests() {
	if ! rm -rf $DIR/[df][0-9]*; then
		error "Fail to cleanup env"
	fi

	# the client has to come back with test_dummy_encryption set
	if is_mounted $MOUNT && ! umount_client $MOUNT; then
		error "umount $MOUNT failed"
	fi
	if ! mount_client $MOUNT ${MOUNT_OPTS},test_dummy_encryption; then
		error "mount with '-o test_dummy_encryption' failed"
	fi

	# because of dummy mode, this directory is going to be encrypted
	$LFS setdirstripe -c 1 -i 0 $DIR/$tdir
	$LFS setstripe -c 1 -i 0 $DIR/$tdir
}
# Undo setup_for_enc_tests: remove $DIR/$tdir plus any extra paths given as
# arguments, then remount the client without the dummy encryption key.
#
# Arguments: $@ - optional additional files/directories to remove
#
# The paths are quoted and passed on as "$@" so that a name containing
# whitespace or glob characters removes exactly that entry; unquoted $* split
# such a name into several words and could delete unrelated files.
cleanup_for_enc_tests() {
	rm -rf "$DIR/$tdir" "$@"

	remount_client_normally
}
# test_42: layout LFSCK adds a missing encryption flag to OST-objects.
#
# Skipped on a ZFS ost1, on an MDS that is not newer than 2.15.51, and when
# the client lacks encryption or test_dummy_encryption support. Two files
# are created normally, two more while fail_loc 0x1632 is set on all nodes;
# layout LFSCK is then started with -r and, once it reports "completed",
# repaired_others must be 2.
#
# Fix: the version check is strict (">"), so 2.15.51 itself is skipped; the
# skip message used to claim "at least 2.15.51" and now says "newer than",
# matching the wording of the other version checks in this file.
#
# NOTE(review): setup_for_enc_tests is defined in this file but no call to
# it is visible in this test although its cleanup counterpart is registered
# below - confirm whether it belongs right after the stack_trap.
test_42() {
	[[ $(facet_fstype ost1) == zfs ]] && skip "skip ZFS backend"

	(( $MDS1_VERSION > $(version_code 2.15.51) )) ||
		skip "Need MDS version newer than 2.15.51"

	echo "If the MDT-object has the encryption flag but the OST-object"
	echo "does not, add it to the OST-object."

	check_mount_and_prep

	$LCTL get_param mdc.*.import | grep -q client_encryption ||
		skip "client encryption not supported"

	mount.lustre --help |& grep -q "test_dummy_encryption:" ||
		skip "need dummy encryption support"

	stack_trap cleanup_for_enc_tests EXIT

	$LFS setstripe -c 1 -i 0 $DIR/$tdir
	touch $DIR/$tdir/${tfile}_1 || error "touch ${tfile}_1 failed"
	dd if=/dev/zero of=$DIR/$tdir/${tfile}_2 bs=1 count=1 conv=fsync ||
		error "dd ${tfile}_2 failed"

	# files _3 and _4 are created while the fail_loc is active
	#define OBD_FAIL_LFSCK_NO_ENCFLAG	0x1632
	do_nodes $(comma_list $(all_nodes)) "$LCTL set_param fail_loc=0x1632"
	touch $DIR/$tdir/${tfile}_3 || error "touch ${tfile}_3 failed"
	dd if=/dev/zero of=$DIR/$tdir/${tfile}_4 bs=1 count=1 conv=fsync ||
		error "dd ${tfile}_4 failed"
	do_nodes $(comma_list $(all_nodes)) "$LCTL set_param fail_loc=0x0"
	cancel_lru_locks osc

	echo "Trigger layout LFSCK to find out inconsistent OST-object enc flag"

	$START_LAYOUT -r || error "Fail to start LFSCK for layout!"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_layout |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# show the LFSCK layout state to help diagnose the failure
		$SHOW_LAYOUT
		error "unexpected lfsck status"
	}

	local repaired=$($SHOW_LAYOUT |
		awk '/^repaired_others/ { print $2 }')
	[ $repaired -eq 2 ] ||
		error "Fail to repair inconsistent enc flag: $repaired"
}
run_test 42 "LFSCK can repair inconsistent MDT-object/OST-object encryption flags"
# Put back the MDS/OST sizes and the OST count that were saved at the top of
# this script before smaller values were chosen for the run.
MDSSIZE=$SAVED_MDSSIZE
OSTSIZE=$SAVED_OSTSIZE
OSTCOUNT=$SAVED_OSTCOUNT

# Last step: call cleanup_and_setup_lustre with REFORMAT=yes, report
# completion with the elapsed $SECONDS, then run the final check/cleanup.
REFORMAT=yes cleanup_and_setup_lustre

complete_test $SECONDS
check_and_cleanup_lustre