3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
# Default LUSTRE to the parent of this script's directory unless the caller
# has already set it, then source test-framework.sh from that tree.
11 LUSTRE=${LUSTRE:-$(dirname $0)/..}
12 . $LUSTRE/tests/test-framework.sh
16 # bug number for skipped test:
17 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT "
18 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
20 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
21 # bug number for skipped test: LU-4165
24 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
25 # bug number for skipped test: LU-1267
27 # bug number for skipped test: LU-3950
29 # bug number for skipped test: LU-3593
31 # bug number for skipped test: LU-3590
33 # bug number for skipped test: LU-3591
35 # bug number for skipped test: LU-3594 LU-3594
36 ALWAYS_EXCEPT+=" 16 17"
37 # bug number for skipped test: LU-3336 LU-3336 LU-3336 LU-3336 LU-3336
38 ALWAYS_EXCEPT+=" 18a 18b 18c 18d 18e"
39 # bug number for skipped test: LU-3951 LU-3951
40 ALWAYS_EXCEPT+=" 19a 19b"
41 # bug number for skipped test: LU-4887 LU-4887
42 ALWAYS_EXCEPT+=" 20 21"
44 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
45 # bug number for skipped test: LU-4788
47 # bug number for skipped test: LU-5511 LU-5511 LU-5511
48 ALWAYS_EXCEPT+=" 2e 22a 22b"
49 # bug number for skipped test: LU-4788
51 # bug number for skipped test: LU-5512 LU-5512 LU-5512
52 ALWAYS_EXCEPT+=" 23a 23b 23c"
53 # bug number for skipped test: LU-5513
55 # bug number for skipped test: LU-5515
57 # bug number for skipped test: LU-5516 LU-5516
58 ALWAYS_EXCEPT+=" 26a 26b"
59 # bug number for skipped test: LU-5516 LU-5516
60 ALWAYS_EXCEPT+=" 27a 27b"
61 # bug number for skipped test: LU-5506
63 # bug number for skipped test: LU-5517 LU-5517 LU-5517
64 ALWAYS_EXCEPT+=" 29a 29b 29c"
65 # bug number for skipped test: LU-5518
# When SLOW is "no", clear EXCEPT_SLOW.
68 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
# Exit with status 0 (not a failure) when require_dsh_mds does not succeed.
71 require_dsh_mds || exit 0
# Skip when check_versions fails, i.e. under interoperation mode according
# to the skip message below.
75 if ! check_versions; then
76 skip "It is NOT necessary to test lfsck under interoperation mode"
# Skip and exit with status 0 when the MDS version code is below 2.3.60.
80 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
81 skip "Need MDS version at least 2.3.60" && exit 0
# Remember the incoming MDSSIZE/OSTSIZE/OSTCOUNT before they may be adjusted
# below.
85 SAVED_MDSSIZE=${MDSSIZE}
86 SAVED_OSTSIZE=${OSTSIZE}
87 SAVED_OSTCOUNT=${OSTCOUNT}
88 # use small MDS + OST sizes to speed up formatting
89 # do not make MDSSIZE/OSTSIZE too small, since they affect the default journal size
# zfs-backed targets get a size of 300000 for the MDS and the OST respectively.
91 [ $(facet_fstype $SINGLEMDS) == zfs ] && MDSSIZE=300000
93 [ $(facet_fstype ost1) == zfs ] && OSTSIZE=300000
95 # no need for too many OSTs; cap the count to reduce the format/start/stop overhead
97 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
99 # build up a clean test environment.
100 REFORMAT="yes" check_and_setup_lustre
# Target names for MDT0000 and OST0000 of the filesystem named by FSNAME.
102 MDT_DEV="${FSNAME}-MDT0000"
103 OST_DEV="${FSNAME}-OST0000"
# MDT device name: every "mds" substring is removed from $SINGLEMDS and the
# remainder is passed to mdsdevname.
104 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Command strings that start LFSCK (namespace or layout type) on the MDT via
# $SINGLEMDS, or layout LFSCK on the OST via ost1. The tests expand them
# unquoted and append extra options, e.g. `$START_NAMESPACE -r`.
105 START_NAMESPACE="do_facet $SINGLEMDS \
106 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
107 START_LAYOUT="do_facet $SINGLEMDS \
108 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
109 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# Command string that stops LFSCK on the MDT.
110 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Command strings that print the lfsck_namespace / lfsck_layout parameters;
# the tests pipe their output through awk to pick out single fields.
111 SHOW_NAMESPACE="do_facet $SINGLEMDS \
112 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
113 SHOW_LAYOUT="do_facet $SINGLEMDS \
114 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
115 SHOW_LAYOUT_ON_OST="do_facet ost1 \
116 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# MDS mount option sets used when (re)starting $SINGLEMDS: with user_xattr
# only, with noscrub added, and with skip_lfsck added.
# NOTE(review): unlike the first two, MOUNT_OPTS_SKIP_LFSCK does not start
# from $MDS_MOUNT_OPTS -- confirm this is intended.
117 MOUNT_OPTS_SCRUB="$MDS_MOUNT_OPTS -o user_xattr"
118 MOUNT_OPTS_NOSCRUB="$MDS_MOUNT_OPTS -o user_xattr,noscrub"
119 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
128 echo "preparing... $nfiles * $ndirs files will be created $(date)."
129 if [ ! -z $igif ]; then
130 #define OBD_FAIL_FID_IGIF 0x1504
131 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
134 cp $LUSTRE/tests/*.sh $DIR/$tdir/
135 if [ $ndirs -gt 0 ]; then
136 createmany -d $DIR/$tdir/d $ndirs
137 createmany -m $DIR/$tdir/f $ndirs
138 if [ $nfiles -gt 0 ]; then
139 for ((i = 0; i < $ndirs; i++)); do
140 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
141 /dev/null || error "createmany $nfiles"
144 createmany -d $DIR/$tdir/e $ndirs
147 if [ ! -z $igif ]; then
148 touch $DIR/$tdir/dummy
149 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
152 echo "prepared $(date)."
# Check MDT0 with e2fsck while it is stopped, then start it again.
#
# Does nothing unless the backing filesystem of $SINGLEMDS is ldiskfs.
# Otherwise stops $SINGLEMDS, runs run_e2fsck with "-n" against the device
# returned by `mdsdevname 1` on the facet's active host, and restarts the MDT
# with $MOUNT_OPTS_NOSCRUB. Calls error with (0) if the stop fails, (2) when
# an inconsistency is detected (presumably decided from the piped output of
# the first run -- TODO confirm), and (3) if the restart fails.
# NOTE(review): "-n" is presumably forwarded to e2fsck, where it means
# "open read-only and answer no to all questions" -- confirm in run_e2fsck.
155 run_e2fsck_on_mdt0() {
# e2fsck only applies to ldiskfs-backed targets.
156 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
# The MDT is stopped before the check runs; stop's output is discarded.
158 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
159 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
161 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
162 error "(2) Detected inconsistency on MDT0"
# Bring MDT0 back with OI scrub disabled (noscrub mount option).
164 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
165 error "(3) Fail to start MDT0"
# Verify that every MDT reports LFSCK component $com in state $status.
#
# Uses $com (LFSCK type, e.g. namespace), $status (expected state, e.g.
# completed) and $err (number shown in the error message). These are
# presumably the three positional arguments, as in the call
# `wait_all_targets_blocked namespace completed 4` -- TODO confirm.
# Calls error when the reported count differs from $MDSCOUNT.
168 wait_all_targets_blocked() {
# Run lfsck_query with -w on mds1 and take the second field of the
# "<com>_mdts_<status>" line as the number of MDTs in that state.
# NOTE(review): -w presumably makes the query wait for LFSCK to finish
# ("blocked" in the function name) -- confirm against lctl lfsck_query.
173 local count=$(do_facet mds1 \
174 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
175 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
# On mismatch, print the full query output for diagnosis before failing.
176 [[ $count -eq $MDSCOUNT ]] || {
177 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
178 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
187 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
188 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
189 "$MDSCOUNT" $LTIME || {
190 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
191 error "($err) some MDTs are not in ${status}"
198 #define OBD_FAIL_LFSCK_DELAY1 0x1600
199 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
200 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
202 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
204 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
205 [ "$STATUS" == "scanning-phase1" ] ||
206 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
208 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
210 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
211 [ "$STATUS" == "stopped" ] ||
212 error "(6) Expect 'stopped', but got '$STATUS'"
214 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
216 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
217 [ "$STATUS" == "scanning-phase1" ] ||
218 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
220 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
221 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
222 mdd.${MDT_DEV}.lfsck_namespace |
223 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
225 error "(9) unexpected status"
228 local repaired=$($SHOW_NAMESPACE |
229 awk '/^updated_phase1/ { print $2 }')
230 [ $repaired -eq 0 ] ||
231 error "(10) Expect nothing to be repaired, but got: $repaired"
233 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
234 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
235 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
236 mdd.${MDT_DEV}.lfsck_namespace |
237 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
239 error "(12) unexpected status"
242 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
243 [ $((scanned1 + 1)) -eq $scanned2 ] ||
244 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
246 echo "stopall, should NOT crash LU-3649"
247 stopall || error "(14) Fail to stopall"
249 run_test 0 "Control LFSCK manually"
254 #define OBD_FAIL_FID_INDIR 0x1501
255 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
256 touch $DIR/$tdir/dummy
258 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
260 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
261 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
262 mdd.${MDT_DEV}.lfsck_namespace |
263 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
265 error "(4) unexpected status"
268 local repaired=$($SHOW_NAMESPACE |
269 awk '/^dirent_repaired/ { print $2 }')
270 # for interop with old server
271 [ -z "$repaired" ] &&
272 repaired=$($SHOW_NAMESPACE |
273 awk '/^updated_phase1/ { print $2 }')
275 [ $repaired -eq 1 ] ||
276 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
280 mount_client $MOUNT || error "(6) Fail to start client!"
282 #define OBD_FAIL_FID_LOOKUP 0x1505
283 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
284 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
286 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
288 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
292 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
293 skip "OI Scrub not implemented for ZFS" && return
297 #define OBD_FAIL_FID_INLMA 0x1502
298 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
299 touch $DIR/$tdir/dummy
301 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
303 #define OBD_FAIL_FID_NOLMA 0x1506
304 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
305 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
306 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
307 mdd.${MDT_DEV}.lfsck_namespace |
308 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
310 error "(4) unexpected status"
313 local repaired=$($SHOW_NAMESPACE |
314 awk '/^dirent_repaired/ { print $2 }')
315 # for interop with old server
316 [ -z "$repaired" ] &&
317 repaired=$($SHOW_NAMESPACE |
318 awk '/^updated_phase1/ { print $2 }')
320 [ $repaired -eq 1 ] ||
321 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
323 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
326 mount_client $MOUNT || error "(6) Fail to start client!"
328 #define OBD_FAIL_FID_LOOKUP 0x1505
329 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
330 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
332 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
334 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
339 #define OBD_FAIL_FID_IGIF 0x1504
340 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
341 touch $DIR/$tdir/dummy
343 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
345 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
346 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
347 mdd.${MDT_DEV}.lfsck_namespace |
348 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
350 error "(4) unexpected status"
353 local repaired=$($SHOW_NAMESPACE |
354 awk '/^dirent_repaired/ { print $2 }')
355 # for interop with old server
356 [ -z "$repaired" ] &&
357 repaired=$($SHOW_NAMESPACE |
358 awk '/^updated_phase1/ { print $2 }')
360 [ $repaired -eq 1 ] ||
361 error "(5) Fail to repair lost FID-in-dirent: $repaired"
365 mount_client $MOUNT || error "(6) Fail to start client!"
367 #define OBD_FAIL_FID_LOOKUP 0x1505
368 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
369 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
371 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
373 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
378 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
379 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
380 touch $DIR/$tdir/dummy
382 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
384 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
385 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
386 mdd.${MDT_DEV}.lfsck_namespace |
387 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
389 error "(4) unexpected status"
392 local repaired=$($SHOW_NAMESPACE |
393 awk '/^linkea_repaired/ { print $2 }')
394 # for interop with old server
395 [ -z "$repaired" ] &&
396 repaired=$($SHOW_NAMESPACE |
397 awk '/^updated_phase2/ { print $2 }')
399 [ $repaired -eq 1 ] ||
400 error "(5) Fail to repair crashed linkEA: $repaired"
404 mount_client $MOUNT || error "(6) Fail to start client!"
406 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
407 error "(7) Fail to stat $DIR/$tdir/dummy"
409 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
410 local dummyname=$($LFS fid2path $DIR $dummyfid)
411 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
412 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
414 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
420 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
421 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
422 touch $DIR/$tdir/dummy
424 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
426 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
427 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
428 mdd.${MDT_DEV}.lfsck_namespace |
429 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
431 error "(4) unexpected status"
434 local repaired=$($SHOW_NAMESPACE |
435 awk '/^updated_phase2/ { print $2 }')
436 [ $repaired -eq 1 ] ||
437 error "(5) Fail to repair crashed linkEA: $repaired"
441 mount_client $MOUNT || error "(6) Fail to start client!"
443 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
444 error "(7) Fail to stat $DIR/$tdir/dummy"
446 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
447 local dummyname=$($LFS fid2path $DIR $dummyfid)
448 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
449 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
451 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
457 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
458 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
459 touch $DIR/$tdir/dummy
461 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
463 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
464 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
465 mdd.${MDT_DEV}.lfsck_namespace |
466 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
468 error "(4) unexpected status"
471 local repaired=$($SHOW_NAMESPACE |
472 awk '/^updated_phase2/ { print $2 }')
473 [ $repaired -eq 1 ] ||
474 error "(5) Fail to repair crashed linkEA: $repaired"
478 mount_client $MOUNT || error "(6) Fail to start client!"
480 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
481 error "(7) Fail to stat $DIR/$tdir/dummy"
483 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
484 local dummyname=$($LFS fid2path $DIR $dummyfid)
485 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
486 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
488 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
494 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
495 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
496 touch $DIR/$tdir/dummy
498 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
500 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
501 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
502 mdd.${MDT_DEV}.lfsck_namespace |
503 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
505 error "(4) unexpected status"
508 local repaired=$($SHOW_NAMESPACE |
509 awk '/^linkea_repaired/ { print $2 }')
510 [ $repaired -eq 1 ] ||
511 error "(5) Fail to repair crashed linkEA: $repaired"
515 mount_client $MOUNT || error "(6) Fail to start client!"
517 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
518 error "(7) Fail to stat $DIR/$tdir/dummy"
520 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
521 local dummyname=$($LFS fid2path $DIR $dummyfid)
522 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
523 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
525 run_test 2d "LFSCK can recover the missing linkEA entry"
529 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
533 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
535 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
536 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
537 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
538 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
540 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
542 wait_all_targets_blocked namespace completed 4
544 local repaired=$($SHOW_NAMESPACE |
545 awk '/^linkea_repaired/ { print $2 }')
546 [ $repaired -eq 1 ] ||
547 error "(5) Fail to repair crashed linkEA: $repaired"
549 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
550 local name=$($LFS fid2path $DIR $fid)
551 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
552 error "(6) Fail to repair linkEA: $fid $name"
554 run_test 2e "namespace LFSCK can verify remote object linkEA"
560 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
561 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
562 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
564 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
565 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
566 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
568 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
569 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
570 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
572 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
573 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
574 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
576 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
578 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
579 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
580 mdd.${MDT_DEV}.lfsck_namespace |
581 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
583 error "(10) unexpected status"
586 local checked=$($SHOW_NAMESPACE |
587 awk '/^checked_phase2/ { print $2 }')
588 [ $checked -ge 4 ] ||
589 error "(11) Fail to check multiple-linked object: $checked"
591 local repaired=$($SHOW_NAMESPACE |
592 awk '/^multiple_linked_repaired/ { print $2 }')
593 [ $repaired -ge 2 ] ||
594 error "(12) Fail to repair multiple-linked object: $repaired"
596 run_test 3 "LFSCK can verify multiple-linked objects"
600 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
601 skip "OI Scrub not implemented for ZFS" && return
604 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
605 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
607 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
608 echo "start $SINGLEMDS with disabling OI scrub"
609 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
610 error "(2) Fail to start MDS!"
612 #define OBD_FAIL_LFSCK_DELAY2 0x1601
613 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
614 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
615 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
616 mdd.${MDT_DEV}.lfsck_namespace |
617 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
619 error "(5) unexpected status"
622 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
623 [ "$STATUS" == "scanning-phase1" ] ||
624 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
626 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
627 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
628 mdd.${MDT_DEV}.lfsck_namespace |
629 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
631 error "(7) unexpected status"
634 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
635 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
637 local repaired=$($SHOW_NAMESPACE |
638 awk '/^dirent_repaired/ { print $2 }')
639 # for interop with old server
640 [ -z "$repaired" ] &&
641 repaired=$($SHOW_NAMESPACE |
642 awk '/^updated_phase1/ { print $2 }')
644 [ $repaired -ge 9 ] ||
645 error "(9) Fail to re-generate FID-in-dirent: $repaired"
649 mount_client $MOUNT || error "(10) Fail to start client!"
651 #define OBD_FAIL_FID_LOOKUP 0x1505
652 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
653 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
654 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
656 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
660 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
661 skip "OI Scrub not implemented for ZFS" && return
664 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
665 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
667 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
668 echo "start $SINGLEMDS with disabling OI scrub"
669 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
670 error "(2) Fail to start MDS!"
672 #define OBD_FAIL_LFSCK_DELAY2 0x1601
673 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
674 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
675 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
676 mdd.${MDT_DEV}.lfsck_namespace |
677 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
679 error "(5) unexpected status"
682 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
683 [ "$STATUS" == "scanning-phase1" ] ||
684 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
686 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
687 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
688 mdd.${MDT_DEV}.lfsck_namespace |
689 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
691 error "(7) unexpected status"
694 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
695 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
697 local repaired=$($SHOW_NAMESPACE |
698 awk '/^dirent_repaired/ { print $2 }')
699 # for interop with old server
700 [ -z "$repaired" ] &&
701 repaired=$($SHOW_NAMESPACE |
702 awk '/^updated_phase1/ { print $2 }')
704 [ $repaired -ge 2 ] ||
705 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
709 mount_client $MOUNT || error "(10) Fail to start client!"
711 #define OBD_FAIL_FID_LOOKUP 0x1505
712 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
713 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
715 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
717 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
718 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
719 local dummyname=$($LFS fid2path $DIR $dummyfid)
720 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
721 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
723 run_test 5 "LFSCK can handle IGIF object upgrading"
728 #define OBD_FAIL_LFSCK_DELAY1 0x1600
729 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
730 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
732 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
733 [ "$STATUS" == "scanning-phase1" ] ||
734 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
736 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
738 # Fail the LFSCK to guarantee there is at least one checkpoint
739 #define OBD_FAIL_LFSCK_FATAL1 0x1608
740 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
741 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
742 mdd.${MDT_DEV}.lfsck_namespace |
743 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
745 error "(4) unexpected status"
748 local POS0=$($SHOW_NAMESPACE |
749 awk '/^last_checkpoint_position/ { print $2 }' |
752 #define OBD_FAIL_LFSCK_DELAY1 0x1600
753 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
754 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
756 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
757 [ "$STATUS" == "scanning-phase1" ] ||
758 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
760 local POS1=$($SHOW_NAMESPACE |
761 awk '/^latest_start_position/ { print $2 }' |
763 [[ $POS0 -lt $POS1 ]] ||
764 error "(7) Expect larger than: $POS0, but got $POS1"
766 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
767 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
768 mdd.${MDT_DEV}.lfsck_namespace |
769 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
771 error "(8) unexpected status"
774 run_test 6a "LFSCK resumes from last checkpoint (1)"
779 #define OBD_FAIL_LFSCK_DELAY2 0x1601
780 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
781 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
783 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
784 [ "$STATUS" == "scanning-phase1" ] ||
785 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
787 # Sleep 5 sec to guarantee that we are in the directory scanning stage
789 # Fail the LFSCK to guarantee there is at least one checkpoint
790 #define OBD_FAIL_LFSCK_FATAL2 0x1609
791 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
792 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
793 mdd.${MDT_DEV}.lfsck_namespace |
794 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
796 error "(4) unexpected status"
799 local O_POS0=$($SHOW_NAMESPACE |
800 awk '/^last_checkpoint_position/ { print $2 }' |
803 local D_POS0=$($SHOW_NAMESPACE |
804 awk '/^last_checkpoint_position/ { print $4 }')
806 #define OBD_FAIL_LFSCK_DELAY2 0x1601
807 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
808 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
810 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
811 [ "$STATUS" == "scanning-phase1" ] ||
812 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
814 local O_POS1=$($SHOW_NAMESPACE |
815 awk '/^latest_start_position/ { print $2 }' |
817 local D_POS1=$($SHOW_NAMESPACE |
818 awk '/^latest_start_position/ { print $4 }')
820 echo "Additional debug for 6b"
822 if [ "$D_POS0" == "N/A" -o "$D_POS0" == "0x0" \
823 -o "$D_POS1" == "0x0" -o "$D_POS1" == "N/A" ]; then
824 [[ $O_POS0 -lt $O_POS1 ]] ||
825 error "(7.1) $O_POS1 is not larger than $O_POS0"
827 [[ $D_POS0 -lt $D_POS1 ]] ||
828 error "(7.2) $D_POS1 is not larger than $D_POS0"
831 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
832 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
833 mdd.${MDT_DEV}.lfsck_namespace |
834 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
836 error "(8) unexpected status"
839 run_test 6b "LFSCK resumes from last checkpoint (2)"
846 #define OBD_FAIL_LFSCK_DELAY2 0x1601
847 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
848 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
850 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
851 [ "$STATUS" == "scanning-phase1" ] ||
852 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
854 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
856 echo "stop $SINGLEMDS"
857 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
859 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
860 echo "start $SINGLEMDS"
861 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
862 error "(5) Fail to start MDS!"
864 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
865 mdd.${MDT_DEV}.lfsck_namespace |
866 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
868 error "(6) unexpected status"
871 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
877 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
878 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
879 for ((i = 0; i < 20; i++)); do
880 touch $DIR/$tdir/dummy${i}
883 #define OBD_FAIL_LFSCK_DELAY3 0x1602
884 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
885 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
886 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
887 mdd.${MDT_DEV}.lfsck_namespace |
888 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
890 error "(4) unexpected status"
894 echo "stop $SINGLEMDS"
895 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
897 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
898 echo "start $SINGLEMDS"
899 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
900 error "(6) Fail to start MDS!"
902 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
903 mdd.${MDT_DEV}.lfsck_namespace |
904 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
906 error "(7) unexpected status"
909 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
914 formatall > /dev/null
920 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
921 [ "$STATUS" == "init" ] ||
922 error "(2) Expect 'init', but got '$STATUS'"
924 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
925 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
926 mkdir $DIR/$tdir/crashed
928 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
929 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
930 for ((i = 0; i < 5; i++)); do
931 touch $DIR/$tdir/dummy${i}
934 umount_client $MOUNT || error "(3) Fail to stop client!"
936 #define OBD_FAIL_LFSCK_DELAY2 0x1601
937 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
938 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
940 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
941 [ "$STATUS" == "scanning-phase1" ] ||
942 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
944 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
946 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
947 [ "$STATUS" == "stopped" ] ||
948 error "(7) Expect 'stopped', but got '$STATUS'"
950 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
952 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
953 [ "$STATUS" == "scanning-phase1" ] ||
954 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
956 #define OBD_FAIL_LFSCK_FATAL2 0x1609
957 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
958 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
959 mdd.${MDT_DEV}.lfsck_namespace |
960 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
962 error "(10) unexpected status"
965 #define OBD_FAIL_LFSCK_DELAY1 0x1600
966 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
967 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
969 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
970 [ "$STATUS" == "scanning-phase1" ] ||
971 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
973 #define OBD_FAIL_LFSCK_CRASH 0x160a
974 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
977 echo "stop $SINGLEMDS"
978 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
980 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
981 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
983 echo "start $SINGLEMDS"
984 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
985 error "(14) Fail to start MDS!"
987 local timeout=$(max_recovery_time)
990 while [ $timer -lt $timeout ]; do
991 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
992 mdt.${MDT_DEV}.recovery_status |
993 awk '/^status/ { print \\\$2 }'")
994 [ "$STATUS" != "RECOVERING" ] && break;
999 [ $timer != $timeout ] ||
1000 error "(14.1) recovery timeout"
1002 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1003 [ "$STATUS" == "crashed" ] ||
1004 error "(15) Expect 'crashed', but got '$STATUS'"
1006 #define OBD_FAIL_LFSCK_DELAY2 0x1601
1007 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
1008 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
1010 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1011 [ "$STATUS" == "scanning-phase1" ] ||
1012 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
1014 echo "stop $SINGLEMDS"
1015 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
1017 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1018 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1020 echo "start $SINGLEMDS"
1021 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
1022 error "(19) Fail to start MDS!"
1025 while [ $timer -lt $timeout ]; do
1026 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1027 mdt.${MDT_DEV}.recovery_status |
1028 awk '/^status/ { print \\\$2 }'")
1029 [ "$STATUS" != "RECOVERING" ] && break;
1031 timer=$((timer + 1))
1034 [ $timer != $timeout ] ||
1035 error "(19.1) recovery timeout"
1037 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1038 [ "$STATUS" == "paused" ] ||
1039 error "(20) Expect 'paused', but got '$STATUS'"
1041 echo "stop $SINGLEMDS"
1042 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1044 echo "start $SINGLEMDS without resume LFSCK"
1045 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
1046 error "(20.2) Fail to start MDS!"
1049 while [ $timer -lt $timeout ]; do
1050 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1051 mdt.${MDT_DEV}.recovery_status |
1052 awk '/^status/ { print \\\$2 }'")
1053 [ "$STATUS" != "RECOVERING" ] && break;
1055 timer=$((timer + 1))
1058 [ $timer != $timeout ] ||
1059 error "(20.3) recovery timeout"
1061 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1062 [ "$STATUS" == "paused" ] ||
1063 error "(20.4) Expect 'paused', but got '$STATUS'"
1065 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1066 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1068 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
1069 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1070 mdd.${MDT_DEV}.lfsck_namespace |
1071 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1073 error "(22) unexpected status"
1076 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1077 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1078 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1080 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1081 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1082 mdd.${MDT_DEV}.lfsck_namespace |
1083 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1085 error "(24) unexpected status"
1088 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1089 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1091 run_test 8 "LFSCK state machine"
# Test 9a "LFSCK speed control (1)": run the layout LFSCK under a speed limit,
# change the limit while it is still scanning, and check that the reported
# average_speed_phase1 stays inside the computed min/max window.
# NOTE(review): the function header and the definitions of RUN_TIME1,
# RUN_TIME2 and TIME_DIFF are not visible in this listing - confirm against
# the full file.
# No "processor ... : 1" entry in /proc/cpuinfo is treated as a UP system.
1094 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1095 skip "Testing on UP system, the speed may be inaccurate."
# Populate $DIR/$tdir/lfsck with 5000 files for the layout scan to walk.
1099 check_mount_and_prep
1100 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1101 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1102 createmany -o $DIR/$tdir/lfsck/f 5000
# First stage: start the scan with the speed limit BASE_SPEED1 (-s).
1104 local BASE_SPEED1=100
1106 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# The scan must still be in phase 1 when sampled, otherwise the average
# speed read below would not reflect the limited run.
1109 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1110 [ "$STATUS" == "scanning-phase1" ] ||
1111 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1113 local SPEED=$($SHOW_LAYOUT |
1114 awk '/^average_speed_phase1/ { print $2 }')
1116 # There may be time error, normally it should be less than 2 seconds.
1117 # We allow another 20% schedule error.
1119 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound = BASE_SPEED1 scaled by (RUN_TIME1 + TIME_DIFF) / RUN_TIME1,
# then by 13/10, all in integer arithmetic.
1120 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1121 RUN_TIME1 * 13 / 10))
1122 [ $SPEED -lt $MAX_SPEED ] || {
1124 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1125 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1128 # adjust speed limit
1129 local BASE_SPEED2=300
# Second stage: raise the limit on the running scan via lfsck_speed_limit.
1131 do_facet $SINGLEMDS \
1132 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-read the average; it now covers both stages, so the bounds below are
# time-weighted combinations of BASE_SPEED1 and BASE_SPEED2.
1135 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1136 # MIN_MARGIN = 0.7 = 7 / 10
1137 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1138 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1139 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# A too-slow scan on a non-ldiskfs MDT is reported through error_ignore
# (LU-5624); the "(5.2)" message belongs to the other branch, whose leading
# lines are not visible here.
1140 [ $SPEED -gt $MIN_SPEED ] || {
1141 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1142 error_ignore LU-5624 \
1143 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1146 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1150 # MAX_MARGIN = 1.3 = 13 / 10
1151 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1152 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1153 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1154 [ $SPEED -lt $MAX_SPEED ] || {
1156 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1157 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1158 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT and OST node (presumably "no
# limit" - confirm) and wait for the layout LFSCK to report 'completed'.
1161 do_nodes $(comma_list $(mdts_nodes)) \
1162 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1163 do_nodes $(comma_list $(osts_nodes)) \
1164 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1166 wait_update_facet $SINGLEMDS \
1167 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1168 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1169 error "(7) Failed to get expected 'completed'"
1171 run_test 9a "LFSCK speed control (1)"
# Test 9b "LFSCK speed control (2)": same speed-window check as the layout
# variant, but for the namespace LFSCK and its average_speed_phase2 value.
# NOTE(review): the function header and the definitions of RUN_TIME1,
# RUN_TIME2 and TIME_DIFF are not visible in this listing - confirm against
# the full file.
# No "processor ... : 1" entry in /proc/cpuinfo is treated as a UP system.
1174 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1175 skip "Testing on UP system, the speed may be inaccurate."
# Create 50 directories with 50 files each (plus 50 top-level files) while
# fail_loc 0x1604 (OBD_FAIL_LFSCK_LINKEA_MORE) is active on the MDS.
1181 echo "Preparing another 50 * 50 files (with error) at $(date)."
1182 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1183 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1184 createmany -d $DIR/$tdir/d 50
1185 createmany -m $DIR/$tdir/f 50
1186 for ((i = 0; i < 50; i++)); do
1187 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# Run a first pass with OBD_FAIL_LFSCK_NO_DOUBLESCAN set and wait until the
# namespace LFSCK reports 'stopped'.
1190 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1191 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1192 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1193 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1194 mdd.${MDT_DEV}.lfsck_namespace |
1195 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1197 error "(5) unexpected status"
1200 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1201 echo "Prepared at $(date)."
# First stage: restart (no -r) with speed limit BASE_SPEED1; the scan is
# expected to be in phase 2 when sampled.
1203 local BASE_SPEED1=50
1205 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1208 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1209 [ "$STATUS" == "scanning-phase2" ] ||
1210 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1212 local SPEED=$($SHOW_NAMESPACE |
1213 awk '/^average_speed_phase2/ { print $2 }')
1214 # There may be time error, normally it should be less than 2 seconds.
1215 # We allow another 20% schedule error.
1217 # MAX_MARGIN = 1.3 = 13 / 10
1218 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1219 RUN_TIME1 * 13 / 10))
1220 [ $SPEED -lt $MAX_SPEED ] || {
1222 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1223 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1226 # adjust speed limit
1227 local BASE_SPEED2=150
# Second stage: change the limit on the running scan, then check the
# time-weighted average against lower and upper bounds.
1229 do_facet $SINGLEMDS \
1230 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1233 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1234 # MIN_MARGIN = 0.7 = 7 / 10
1235 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1236 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1237 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# A too-slow scan on a non-ldiskfs MDT is reported through error_ignore
# (LU-5624); the "(9.2)" message belongs to the other branch, whose leading
# lines are not visible here.
1238 [ $SPEED -gt $MIN_SPEED ] || {
1239 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1240 error_ignore LU-5624 \
1241 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1244 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1248 # MAX_MARGIN = 1.3 = 13 / 10
1249 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1250 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1251 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1252 [ $SPEED -lt $MAX_SPEED ] || {
1254 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1255 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1256 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT and OST node (presumably "no
# limit" - confirm) and wait for the namespace LFSCK to report 'completed'.
1259 do_nodes $(comma_list $(mdts_nodes)) \
1260 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1261 do_nodes $(comma_list $(osts_nodes)) \
1262 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1263 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1264 mdd.${MDT_DEV}.lfsck_namespace |
1265 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1267 error "(11) unexpected status"
1270 run_test 9b "LFSCK speed control (2)"
# Test 10 "System is available during LFSCK scanning": while a rate-limited
# namespace LFSCK is in phase 1, ordinary client operations (ls, touch,
# mkdir, unlink, rm, mv, ln, ln -s) must all succeed.
# NOTE(review): the function header and loop terminators are not visible in
# this listing.
# Only run on an ldiskfs MDT; other backends are skipped.
1274 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1275 skip "lookup(..)/linkea on ZFS issue" && return
1279 echo "Preparing more files with error at $(date)."
# Even-numbered d<i>/f<i> entries are created under fail_loc 0x1603.
1280 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1281 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1283 for ((i = 0; i < 1000; i = $((i+2)))); do
1284 mkdir -p $DIR/$tdir/d${i}
1285 touch $DIR/$tdir/f${i}
1286 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd-numbered d<i>/f<i> entries are created under fail_loc 0x1604.
1289 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1290 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1292 for ((i = 1; i < 1000; i = $((i+2)))); do
1293 mkdir -p $DIR/$tdir/d${i}
1294 touch $DIR/$tdir/f${i}
1295 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1298 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1299 echo "Prepared at $(date)."
# Add an extra hard link to f200 before remounting the client.
1301 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
1303 umount_client $MOUNT
1304 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the scan with speed limit 100 so it is still in phase 1 while the
# client operations below run.
1306 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1309 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1310 [ "$STATUS" == "scanning-phase1" ] ||
1311 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Each of these client operations must succeed during the scan.
1313 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1315 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1317 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1319 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1321 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1323 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1325 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1327 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1328 error "(14) Fail to softlink!"
# The scan must still be in phase 1, i.e. the operations ran concurrently
# with it rather than after it.
1330 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1331 [ "$STATUS" == "scanning-phase1" ] ||
1332 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 everywhere and wait for 'completed'.
1334 do_nodes $(comma_list $(mdts_nodes)) \
1335 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1336 do_nodes $(comma_list $(osts_nodes)) \
1337 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1338 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1339 mdd.${MDT_DEV}.lfsck_namespace |
1340 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1342 error "(16) unexpected status"
1345 run_test 10 "System is available during LFSCK scanning"
# Remove the LAST_ID file (and d0/0) under O/<idx> on the backing filesystem
# of OST number <ost>.
# Returns 1 if the local mount fails and 2 if the unmount fails.
# NOTE(review): the lines that set 'ost' and 'idx' are not visible in this
# listing; callers pass two arguments (e.g. "ost_remove_lastid 1 0"), so
# presumably ost=$1 and idx=$2 - confirm against the full file.
1348 ost_remove_lastid() {
# All commands for this OST are run through do_facet on its facet.
1351 local rcmd="do_facet ost${ost}"
1353 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1355 # step 1: local mount
1356 mount_fstype ost${ost} || return 1
1357 # step 2: remove the specified LAST_ID
1358 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
# step 3: unmount again; the exit status of 'rm' above is not checked.
1360 unmount_fstype ost${ost} || return 2
# Test 11a "LFSCK can rebuild lost last_id": delete LAST_ID on ost1, then
# check that the layout LFSCK on the OST first flags 'crashed_lastid' and
# finally completes with empty flags.
# NOTE(review): the function header and the steps between file creation and
# ost_remove_lastid are not visible in this listing.
1364 check_mount_and_prep
1365 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1366 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Remove LAST_ID for index 0 on OST 1, then bring ost1 back up.
1371 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1373 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1374 error "(2) Fail to start ost1"
# Set OBD_FAIL_LFSCK_DELAY4 with fail_val=3 so the 'crashed_lastid' flag can
# be observed before the scan finishes (presumably a delay - confirm).
1376 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1377 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1379 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1380 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1382 wait_update_facet ost1 "$LCTL get_param -n \
1383 obdfilter.${OST_DEV}.lfsck_layout |
1384 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1386 error "(5) unexpected status"
# Clear the fail_loc and let the scan run to completion.
1389 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1391 wait_update_facet ost1 "$LCTL get_param -n \
1392 obdfilter.${OST_DEV}.lfsck_layout |
1393 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1395 error "(6) unexpected status"
# Empty flags after completion mean 'crashed_lastid' has been cleared.
1398 echo "the LAST_ID(s) should have been rebuilt"
1399 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1400 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1402 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b "LFSCK can rebuild crashed last_id": make the on-disk LAST_ID on
# ost1 lag behind the IDs actually used, then check that the layout LFSCK on
# the OST brings last_id back to at least the used value.
# NOTE(review): the function header and the loop body tail (e.g. any sleep)
# are not visible in this listing; 'i' and 'id_ost1' are not declared local
# in the visible lines.
1405 check_mount_and_prep
1406 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1408 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1409 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1410 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than the current precreated-object count reported by
# precreated_ost_obj_count.
1412 local count=$(precreated_ost_obj_count 0 0)
1414 createmany -o $DIR/$tdir/f $((count + 32))
# Record the sequence and last ID that mds1 reports for OST0000.
1416 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1417 local seq=$(do_facet mds1 $LCTL get_param -n \
1418 osp.${proc_path}.prealloc_last_seq)
1419 local id_used=$(do_facet mds1 $LCTL get_param -n \
1420 osp.${proc_path}.prealloc_last_id)
1422 umount_client $MOUNT
1423 stop ost1 || error "(1) Fail to stop ost1"
# Restart ost1 with OBD_FAIL_OST_ENOSPC set.
1425 #define OBD_FAIL_OST_ENOSPC 0x215
1426 do_facet ost1 $LCTL set_param fail_loc=0x215
1428 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1429 error "(2) Fail to start ost1"
# Poll up to 60 times for the OST's last_id entry of sequence $seq. The awk
# program is double-quoted, so $seq is expanded by the shell into the pattern.
1431 for ((i = 0; i < 60; i++)); do
1432 id_ost1=$(do_facet ost1 \
1433 "$LCTL get_param -n obdfilter.$ost1_svc.last_id" |
1434 awk -F: "/$seq/ { print \$2 }")
1435 [ -n "$id_ost1" ] && break
1439 echo "the on-disk LAST_ID should be smaller than the expected one"
1440 [ $id_used -gt $id_ost1 ] ||
1441 error "(4) expect id_used '$id_used' > id_ost1 '$id_ost1'"
1443 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1444 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1446 wait_update_facet ost1 \
1447 "$LCTL get_param -n obdfilter.$ost1_svc.lfsck_layout |
1448 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1450 error "(6) unexpected status"
# Restart ost1 so that the value checked below is read after a fresh start.
1453 stop ost1 || error "(7) Fail to stop ost1"
1455 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1456 error "(8) Fail to start ost1"
1458 echo "the on-disk LAST_ID should have been rebuilt"
1459 # last_id may be larger than $id_used if objects were created/skipped
1460 wait_update_facet_cond ost1 \
1461 "$LCTL get_param -n obdfilter.$ost1_svc.last_id |
1462 awk -F: '/$seq/ { print \\\$2 }'" "-ge" "$id_used" 60 || {
1463 do_facet ost1 $LCTL get_param obdfilter.$ost1_svc.last_id
1464 error "(9) expect last_id >= id_used $seq:$id_used"
# Clear the fail_loc and stop all services.
1467 do_facet ost1 $LCTL set_param fail_loc=0
1468 stopall || error "(10) Fail to stopall"
1470 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12a "single command to trigger LFSCK on all devices": with -A, one
# lfsck_start / lfsck_stop issued on mds1 must drive every target through
# scanning-phase1 -> stopped -> completed, first for namespace, then layout.
# NOTE(review): the function header and loop terminators are not visible in
# this listing.
1473 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Create one directory per MDT (index k-1) holding 100 files each.
1475 check_mount_and_prep
1476 for k in $(seq $MDSCOUNT); do
1477 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1478 createmany -o $DIR/$tdir/${k}/f 100 ||
1479 error "(0) Fail to create 100 files."
# --- namespace LFSCK ---
1482 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1483 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1484 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1486 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1487 wait_all_targets namespace scanning-phase1 3
1489 echo "Stop namespace LFSCK on all targets by single lctl command."
1490 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1491 error "(4) Fail to stop LFSCK on all devices!"
1493 echo "All the LFSCK targets should be in 'stopped' status."
1494 wait_all_targets_blocked namespace stopped 5
1496 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1497 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1498 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1500 echo "All the LFSCK targets should be in 'completed' status."
1501 wait_all_targets_blocked namespace completed 7
# --- layout LFSCK, run with full debug logging enabled ---
1503 start_full_debug_logging
1505 echo "Start layout LFSCK on all targets by single command (-s 1)."
1506 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1507 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1509 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1510 wait_all_targets layout scanning-phase1 9
1512 echo "Stop layout LFSCK on all targets by single lctl command."
1513 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1514 error "(10) Fail to stop LFSCK on all devices!"
1516 echo "All the LFSCK targets should be in 'stopped' status."
1517 wait_all_targets_blocked layout stopped 11
# The stop must also have reached every OST: check each one's lfsck_layout
# status individually.
1519 for k in $(seq $OSTCOUNT); do
1520 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1521 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1522 awk '/^status/ { print $2 }')
1523 [ "$STATUS" == "stopped" ] ||
1524 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1527 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1528 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1529 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1531 echo "All the LFSCK targets should be in 'completed' status."
1532 wait_all_targets_blocked layout completed 14
1534 stop_full_debug_logging
1536 run_test 12a "single command to trigger LFSCK on all devices"
# Test 12b "auto detect Lustre device": lfsck_start without '-M' must work
# when combined with -A, and must fail when neither '-M' nor '-A' is given on
# a node that hosts more than one MDT device.
# NOTE(review): the function header and the closing 'fi' are not visible in
# this listing.
1539 check_mount_and_prep
1541 echo "Start LFSCK without '-M' specified."
1542 do_facet mds1 $LCTL lfsck_start -A -r ||
1543 error "(0) Fail to start LFSCK without '-M'"
# Both LFSCK components are expected to reach 'completed'.
1545 wait_all_targets_blocked namespace completed 1
1546 wait_all_targets_blocked layout completed 2
# Count the 'lctl dl' entries on mds1 whose third column contains "mdt".
1548 local count=$(do_facet mds1 $LCTL dl |
1549 awk '{ print $3 }' | grep mdt | wc -l)
1550 if [ $count -gt 1 ]; then
1552 echo "Start layout LFSCK on the node with multipe targets,"
1553 echo "but not specify '-M'/'-A' option. Should get failure."
# A successful start is the failure case here; '|| true' keeps the expected
# non-zero status of lfsck_start from propagating.
1555 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1556 error "(3) Start layout LFSCK should fail" || true
1559 run_test 12b "auto detect Lustre device"
# Test 13 "LFSCK can repair crashed lmm_oi": create files while a fail_loc
# simulates a bad lmm_oi, then check that the layout LFSCK reports exactly
# two 'repaired_others' entries.
# NOTE(review): the function header is not visible in this listing.
1563 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1564 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1565 echo "MDT-object FID."
1568 check_mount_and_prep
1570 echo "Inject failure stub to simulate bad lmm_oi"
1571 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1572 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
# Two files are created under the fail_loc: one via createmany and one
# composite (PFL) file f1 via setstripe -E.
1573 createmany -o $DIR/$tdir/f 1
1574 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1575 error "(0) Fail to create PFL $DIR/$tdir/f1"
1576 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1578 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1579 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1581 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1582 mdd.${MDT_DEV}.lfsck_layout |
1583 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1585 error "(2) unexpected status"
# The repair count must match the two files created above.
1588 local repaired=$($SHOW_LAYOUT |
1589 awk '/^repaired_others/ { print $2 }')
1590 [ $repaired -eq 2 ] ||
1591 error "(3) Fail to repair crashed lmm_oi: $repaired"
1593 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)":
# create files whose OST-objects are missing, verify a plain layout LFSCK
# only counts them, then verify that a run with '-c' makes them stat-able.
# NOTE(review): the function header, loop terminators and the lines between
# "stopall to cleanup object cache" and setupall are not visible in this
# listing.
1597 echo "The OST-object referenced by the MDT-object should be there;"
1598 echo "otherwise, the LFSCK should re-create the missing OST-object."
1599 echo "without '--delay-create-ostobj' option."
1602 check_mount_and_prep
1603 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1605 echo "Inject failure stub to simulate dangling referenced MDT-object"
1606 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1607 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 16 more files than the current precreated-object count, followed by
# a guard file, all while the fail_loc is active.
1608 local count=$(precreated_ost_obj_count 0 0)
1610 createmany -o $DIR/$tdir/f $((count + 16)) ||
1611 error "(0.1) Fail to create $DIR/$tdir/fx"
1612 touch $DIR/$tdir/guard0
# Same again with 16 composite (PFL) files and a second guard file.
1614 for ((i = 0; i < 16; i++)); do
1615 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1616 $DIR/$tdir/f_comp${i} ||
1617 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1619 touch $DIR/$tdir/guard1
1621 do_facet ost1 $LCTL set_param fail_loc=0
1623 start_full_debug_logging
1625 # exhaust other pre-created dangling cases
1626 count=$(precreated_ost_obj_count 0 0)
1627 createmany -o $DIR/$tdir/a $count ||
1628 error "(0.5) Fail to create $count files."
1630 echo "'ls' should fail because of dangling referenced MDT-object"
1631 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: no '-c', so dangling references are counted but the guard
# files must still fail to stat afterwards.
1633 echo "Trigger layout LFSCK to find out dangling reference"
1634 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1636 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1637 mdd.${MDT_DEV}.lfsck_layout |
1638 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1640 error "(3) unexpected status"
# 32 = the 16 extra plain files plus the 16 composite files created above.
1643 local repaired=$($SHOW_LAYOUT |
1644 awk '/^repaired_dangling/ { print $2 }')
1645 [ $repaired -ge 32 ] ||
1646 error "(4) Fail to repair dangling reference: $repaired"
1648 echo "'stat' should fail because of not repair dangling by default"
1649 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1650 error "(5.1) stat should fail"
1651 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1652 error "(5.2) stat should fail"
# Second pass: '-c' asks LFSCK to actually repair the dangling references.
1654 echo "Trigger layout LFSCK to repair dangling reference"
1655 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1657 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1658 mdd.${MDT_DEV}.lfsck_layout |
1659 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1661 error "(7) unexpected status"
1664 # There may be some async LFSCK updates in processing, wait for
1665 # a while until the target reparation has been done. LU-4970.
# Poll until stat reports Size 0 for each guard file; on timeout, print the
# raw stat output before failing.
1667 echo "'stat' should success after layout LFSCK repairing"
1668 wait_update_facet client "stat $DIR/$tdir/guard0 |
1669 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1670 stat $DIR/$tdir/guard0
1672 error "(8.1) unexpected size"
1675 wait_update_facet client "stat $DIR/$tdir/guard1 |
1676 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1677 stat $DIR/$tdir/guard1
1679 error "(8.2) unexpected size"
1682 repaired=$($SHOW_LAYOUT |
1683 awk '/^repaired_dangling/ { print $2 }')
1684 [ $repaired -ge 32 ] ||
1685 error "(9) Fail to repair dangling reference: $repaired"
1687 stop_full_debug_logging
# Bring the whole filesystem back up after the cleanup step.
1689 echo "stopall to cleanup object cache"
1692 setupall > /dev/null
1694 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)":
# variant of the dangling-reference test that starts the layout LFSCK with
# the additional '-o' and '-d' options.
# NOTE(review): the function header and the lines between "stopall to cleanup
# object cache" and setupall are not visible in this listing.
1698 echo "The OST-object referenced by the MDT-object should be there;"
1699 echo "otherwise, the LFSCK should re-create the missing OST-object."
1700 echo "with '--delay-create-ostobj' option."
1703 check_mount_and_prep
1704 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1706 echo "Inject failure stub to simulate dangling referenced MDT-object"
1707 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1708 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 31 more files than the current precreated-object count plus one
# guard file while the fail_loc is active (the createmany status is not
# checked here).
1709 local count=$(precreated_ost_obj_count 0 0)
1711 createmany -o $DIR/$tdir/f $((count + 31))
1712 touch $DIR/$tdir/guard
1713 do_facet ost1 $LCTL set_param fail_loc=0
1715 start_full_debug_logging
1717 # exhaust other pre-created dangling cases
1718 count=$(precreated_ost_obj_count 0 0)
1719 createmany -o $DIR/$tdir/a $count ||
1720 error "(0) Fail to create $count files."
1722 echo "'ls' should fail because of dangling referenced MDT-object"
1723 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: no '-c', so dangling references are counted but not repaired.
1725 echo "Trigger layout LFSCK to find out dangling reference"
1726 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1728 wait_all_targets_blocked layout completed 3
# 32 = the 31 extra files plus the guard file created above.
1730 local repaired=$($SHOW_LAYOUT |
1731 awk '/^repaired_dangling/ { print $2 }')
1732 [ $repaired -ge 32 ] ||
1733 error "(4) Fail to repair dangling reference: $repaired"
1735 echo "'stat' should fail because of not repair dangling by default"
1736 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second pass: '-c' asks LFSCK to actually repair the dangling references.
1738 echo "Trigger layout LFSCK to repair dangling reference"
1739 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1741 wait_all_targets_blocked layout completed 7
1743 # There may be some async LFSCK updates in processing, wait for
1744 # a while until the target reparation has been done. LU-4970.
# Poll until stat reports Size 0 for the guard file; on timeout, print the
# raw stat output before failing.
1746 echo "'stat' should success after layout LFSCK repairing"
1747 wait_update_facet client "stat $DIR/$tdir/guard |
1748 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1749 stat $DIR/$tdir/guard
1751 error "(8) unexpected size"
1754 repaired=$($SHOW_LAYOUT |
1755 awk '/^repaired_dangling/ { print $2 }')
1756 [ $repaired -ge 32 ] ||
1757 error "(9) Fail to repair dangling reference: $repaired"
1759 stop_full_debug_logging
# Bring the whole filesystem back up after the cleanup step.
1761 echo "stopall to cleanup object cache"
1764 setupall > /dev/null
1766 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)":
# write files while the OSTs are told to store a back pointer to a
# non-existent MDT-object, then expect at least 3 repaired_unmatched_pair.
# NOTE(review): the function header and the target-path line of the PFL
# setstripe command are not visible in this listing.
1770 echo "If the OST-object referenced by the MDT-object back points"
1771 echo "to some non-exist MDT-object, then the LFSCK should repair"
1772 echo "the OST-object to back point to the right MDT-object."
1775 check_mount_and_prep
1776 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1778 echo "Inject failure stub to make the OST-object to back point to"
1779 echo "non-exist MDT-object."
1780 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
# The fail_loc is set on every OST node, not just ost1.
1782 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1783 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1784 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1786 error "(0) Fail to create PFL $DIR/$tdir/f1"
1787 # 'dd' will trigger punch RPC firstly on every OST-objects.
1788 # So even though some OST-object will not be write by 'dd',
1789 # as long as it is allocated (may be NOT allocated in pfl_3b)
1790 # its layout information will be set also.
1791 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
# Drop the client's OSC locks before clearing the fail_loc.
1792 cancel_lru_locks osc
1793 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1795 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1796 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1798 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1799 mdd.${MDT_DEV}.lfsck_layout |
1800 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1802 error "(2) unexpected status"
# A lower bound (>= 3) is used rather than an exact count.
1805 local repaired=$($SHOW_LAYOUT |
1806 awk '/^repaired_unmatched_pair/ { print $2 }')
1807 [ $repaired -ge 3 ] ||
1808 error "(3) Fail to repair unmatched pair: $repaired"
1810 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)":
# write files while the OSTs are told to store a back pointer to another
# MDT-object, then expect exactly 4 repaired_unmatched_pair.
# NOTE(review): the function header, the initial value of 'stripes' and the
# target-path line of the PFL setstripe command are not visible in this
# listing.
1814 echo "If the OST-object referenced by the MDT-object back points"
1815 echo "to other MDT-object that doesn't recognize the OST-object,"
1816 echo "then the LFSCK should repair it to back point to the right"
1817 echo "MDT-object (the first one)."
# Create a guard file before the fail_loc is set.
1820 check_mount_and_prep
1821 mkdir -p $DIR/$tdir/0
1822 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1823 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1824 cancel_lru_locks osc
1826 echo "Inject failure stub to make the OST-object to back point to"
1827 echo "other MDT-object"
# Use two stripes for the first PFL component when at least two OSTs exist.
1830 [ $OSTCOUNT -ge 2 ] && stripes=2
1832 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1833 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1834 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1835 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1837 error "(0) Fail to create PFL $DIR/$tdir/f1"
1838 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1839 cancel_lru_locks osc
1840 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1842 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1843 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1845 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1846 mdd.${MDT_DEV}.lfsck_layout |
1847 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1849 error "(2) unexpected status"
# Unlike test 15a, an exact count is required here.
1852 local repaired=$($SHOW_LAYOUT |
1853 awk '/^repaired_unmatched_pair/ { print $2 }')
1854 [ $repaired -eq 4 ] ||
1855 error "(3) Fail to repair unmatched pair: $repaired"
1857 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)":
# race a layout LFSCK against a delayed directory migration and check that
# no multiple-reference repair is made.
# NOTE(review): the function header and anything between starting the
# background migrate and triggering LFSCK are not visible in this listing.
# Needs at least two MDTs, and is skipped on MDS versions >= 2.7.55 (LU-6437).
1860 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1862 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1863 skip "Skip the test after 2.7.55 see LU-6437" && return
1866 echo "According to current metadata migration implementation,"
1867 echo "before the old MDT-object is removed, both the new MDT-object"
1868 echo "and old MDT-object will reference the same LOV layout. Then if"
1869 echo "the layout LFSCK finds the new MDT-object by race, it will"
1870 echo "regard related OST-object(s) as multiple referenced case, and"
1871 echo "will try to create new OST-object(s) for the new MDT-object."
1872 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1873 echo "MDT-object before confirm the multiple referenced case."
# Create directory a1 on MDT index 1 with a 1 MiB file in it.
1876 check_mount_and_prep
1877 $LFS mkdir -i 1 $DIR/$tdir/a1
1878 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1879 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1880 cancel_lru_locks osc
1882 echo "Inject failure stub on MDT1 to delay the migration"
1884 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1885 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# The migration runs in the background so LFSCK can be started while it is
# still in progress.
1886 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1887 $LFS migrate -m 0 $DIR/$tdir/a1 &
1890 echo "Trigger layout LFSCK to race with the migration"
1891 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1893 wait_all_targets_blocked layout completed 2
1895 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Expect exactly one unmatched-pair repair and no multiple-reference repair.
1896 local repaired=$($SHOW_LAYOUT |
1897 awk '/^repaired_unmatched_pair/ { print $2 }')
1898 [ $repaired -eq 1 ] ||
1899 error "(3) Fail to repair unmatched pair: $repaired"
1901 repaired=$($SHOW_LAYOUT |
1902 awk '/^repaired_multiple_referenced/ { print $2 }')
1903 [ $repaired -eq 0 ] ||
1904 error "(4) Unexpectedly repaird multiple references: $repaired"
1906 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner": chown
# a file while a fail_loc skips the OST-side owner change, then expect the
# layout LFSCK to report exactly one repaired_inconsistent_owner.
# NOTE(review): the function header and the creation of $DIR/$tdir/d1 are not
# visible in this listing.
1910 echo "If the OST-object's owner information does not match the owner"
1911 echo "information stored in the MDT-object, then the LFSCK trust the"
1912 echo "MDT-object and update the OST-object's owner information."
1915 check_mount_and_prep
1916 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1917 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1918 cancel_lru_locks osc
1920 # created but no setattr or write to the file.
# 100 files are created as $RUNAS in d1 and never written or chown'ed; the
# final check expects only f0 to be counted as repaired.
1922 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
1923 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
1925 echo "Inject failure stub to skip OST-object owner changing"
1926 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1927 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
# Change f0's owner to uid 1 / gid 1 while the fail_loc is active.
1928 chown 1.1 $DIR/$tdir/f0
1929 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1931 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1934 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1936 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1937 mdd.${MDT_DEV}.lfsck_layout |
1938 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1940 error "(2) unexpected status"
1943 local repaired=$($SHOW_LAYOUT |
1944 awk '/^repaired_inconsistent_owner/ { print $2 }')
1945 [ $repaired -eq 1 ] ||
1946 error "(3) Fail to repair inconsistent owner: $repaired"
1948 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17 "LFSCK can repair multiple references": make f0 and f1 share the
# guard file's OST-objects, run the layout LFSCK, then confirm that writing
# to f0/f1 no longer changes the size of guard.
# NOTE(review): the function header, the end of the first echo paragraph and
# the target-path line of the PFL setstripe command are not visible in this
# listing.
1952 echo "If more than one MDT-objects reference the same OST-object,"
1953 echo "and the OST-object only recognizes one MDT-object, then the"
1954 echo "LFSCK should create new OST-objects for such non-recognized"
1958 check_mount_and_prep
1959 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1961 echo "Inject failure stub to make two MDT-objects to refernce"
1962 echo "the OST-object"
1964 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1965 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# guard is written first (1 MiB); f0 and the PFL file f1 are created after
# it under the same fail_loc, with client locks dropped between steps.
1966 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1967 cancel_lru_locks mdc
1968 cancel_lru_locks osc
1970 createmany -o $DIR/$tdir/f 1
1971 cancel_lru_locks mdc
1972 cancel_lru_locks osc
1974 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
1976 error "(0) Fail to create PFL $DIR/$tdir/f1"
1977 cancel_lru_locks mdc
1978 cancel_lru_locks osc
1979 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Before repair, f0 and f1 report guard's 1 MiB size although nothing was
# written to them, which shows they share its OST-object.
1981 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1982 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
1983 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1984 [ $size -eq 1048576 ] ||
1985 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
1987 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
1988 [ $size -eq 1048576 ] ||
1989 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
1991 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1994 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1996 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1997 mdd.${MDT_DEV}.lfsck_layout |
1998 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2000 error "(3) unexpected status"
# One repair is expected for each of f0 and f1.
2003 local repaired=$($SHOW_LAYOUT |
2004 awk '/^repaired_multiple_referenced/ { print $2 }')
2005 [ $repaired -eq 2 ] ||
2006 error "(4) Fail to repair multiple references: $repaired"
# After repair, writing 2 MiB to f0 or f1 must leave guard at 1 MiB.
2008 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
2009 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
2010 error "(5) Fail to write f0."
2011 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2012 [ $size -eq 1048576 ] ||
2013 error "(6) guard size should be 1048576, but got $size"
2015 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
2016 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
2017 error "(7) Fail to write f1."
2018 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2019 [ $size -eq 1048576 ] ||
2020 error "(8) guard size should be 1048576, but got $size"
2022 run_test 17 "LFSCK can repair multiple references"
# Top-level statement between tests: sets the local 'debug' parameter to
# "+cache" and discards the command's output (presumably adds the 'cache'
# flag to the existing debug mask - confirm against lctl documentation).
2024 $LCTL set_param debug=+cache > /dev/null
# Test 18a: the MDT-object still exists but its layout EA is lost (fully or
# partly). Layout LFSCK is expected to regenerate the missing layout EA
# entries, after which the file sizes must match the values saved up front.
2028 echo "The target MDT-object is there, but related stripe information"
2029 echo "is lost or partly lost. The LFSCK should regenerate the missing"
2030 echo "layout EA entries."
# Create a1/f1 (directory made with "lfs mkdir -i 0", one stripe starting at
# index 0) and write 2 x 1M of zeroes into it.
2033 check_mount_and_prep
2034 $LFS mkdir -i 0 $DIR/$tdir/a1
2035 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2036 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With "ls -il" the inode number is field 1, so the size is field 6.
2038 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
# Print the FID and layout for the test log.
2040 $LFS path2fid $DIR/$tdir/a1/f1
2041 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDTs, also create a2/f2 (directory made with
# "lfs mkdir -i 1", two stripes of 1M).
2043 if [ $MDSCOUNT -ge 2 ]; then
2044 $LFS mkdir -i 1 $DIR/$tdir/a2
2045 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2046 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2047 $LFS path2fid $DIR/$tdir/a2/f2
2048 $LFS getstripe $DIR/$tdir/a2/f2
# f3 is a PFL (composite layout) file with two components: [0, 1M) and
# [1M, EOF), both using a 1M stripe size.
2051 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2052 error "(0) Fail to create PFL $DIR/$tdir/f3"
2054 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2056 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2058 $LFS path2fid $DIR/$tdir/f3
2059 $LFS getstripe $DIR/$tdir/f3
2061 cancel_lru_locks osc
# Arm fail_loc 0x1615 on the MDS(es), then chown each file; per the echo
# below, this is what makes the MDT-object lose its layout EA.
2063 echo "Inject failure, to make the MDT-object lost its layout EA"
2064 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2065 do_facet mds1 $LCTL set_param fail_loc=0x1615
2066 chown 1.1 $DIR/$tdir/a1/f1
2068 if [ $MDSCOUNT -ge 2 ]; then
2069 do_facet mds2 $LCTL set_param fail_loc=0x1615
2070 chown 1.1 $DIR/$tdir/a2/f2
2073 chown 1.1 $DIR/$tdir/f3
# Disarm the fault injection and drop cached client locks before re-reading
# the sizes.
2078 do_facet mds1 $LCTL set_param fail_loc=0
2079 if [ $MDSCOUNT -ge 2 ]; then
2080 do_facet mds2 $LCTL set_param fail_loc=0
2083 cancel_lru_locks mdc
2084 cancel_lru_locks osc
# Sanity check of the injection itself: every damaged file must now report a
# size different from the one saved before the damage.
2086 echo "The file size should be incorrect since layout EA is lost"
2087 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2088 [ "$cur_size" != "$saved_size1" ] ||
2089 error "(1) Expect incorrect file1 size"
# NOTE(review): f2's own size is never saved; it is compared with f1's
# saved size (both files were written with bs=1M count=2).
2091 if [ $MDSCOUNT -ge 2 ]; then
2092 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2093 [ "$cur_size" != "$saved_size1" ] ||
2094 error "(2) Expect incorrect file2 size"
2097 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2098 [ "$cur_size" != "$saved_size2" ] ||
2099 error "(1.2) Expect incorrect file3 size"
# Start layout LFSCK with "-r -o" and wait for every MDT to report
# "completed".
2101 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2102 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2104 for k in $(seq $MDSCOUNT); do
2105 # The LFSCK status query interval is 30 seconds. In case some
2106 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
2107 # guarantee that the status has been synced up.
2108 wait_update_facet mds${k} "$LCTL get_param -n \
2109 mdd.$(facet_svc mds${k}).lfsck_layout |
2110 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2111 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are sampled once (no waiting) and must already be "completed".
2114 for k in $(seq $OSTCOUNT); do
2115 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2116 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2117 awk '/^status/ { print $2 }')
2118 [ "$cur_status" == "completed" ] ||
2119 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Check the repaired_orphan counters. Presumably 3 on mds1 = one object of
# a1/f1 plus two objects of the two-component f3, and 2 on mds2 = the two
# stripes of a2/f2 -- TODO confirm.
2122 local repaired=$(do_facet mds1 $LCTL get_param -n \
2123 mdd.$(facet_svc mds1).lfsck_layout |
2124 awk '/^repaired_orphan/ { print $2 }')
2125 [ $repaired -eq 3 ] ||
2126 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2128 if [ $MDSCOUNT -ge 2 ]; then
2129 repaired=$(do_facet mds2 $LCTL get_param -n \
2130 mdd.$(facet_svc mds2).lfsck_layout |
2131 awk '/^repaired_orphan/ { print $2 }')
2132 [ $repaired -eq 2 ] ||
2133 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print the FIDs and layouts again after the repair for the test log.
2136 $LFS path2fid $DIR/$tdir/a1/f1
2137 $LFS getstripe $DIR/$tdir/a1/f1
2139 if [ $MDSCOUNT -ge 2 ]; then
2140 $LFS path2fid $DIR/$tdir/a2/f2
2141 $LFS getstripe $DIR/$tdir/a2/f2
2144 $LFS path2fid $DIR/$tdir/f3
2145 $LFS getstripe $DIR/$tdir/f3
# Final verification: sizes must be back to the saved values.
2147 echo "The file size should be correct after layout LFSCK scanning"
2148 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2149 [ "$cur_size" == "$saved_size1" ] ||
2150 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2152 if [ $MDSCOUNT -ge 2 ]; then
2153 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2154 [ "$cur_size" == "$saved_size1" ] ||
2155 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# NOTE(review): the "(9)" message says "file1" but this check is on f3.
2158 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2159 [ "$cur_size" == "$saved_size2" ] ||
2160 error "(9) Expect file1 size $saved_size2, but got $cur_size"
2162 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test 18b: the MDT-object itself is lost. Layout LFSCK is expected to
# re-create it under .lustre/lost+found/MDTxxxx as "<fid>-R-0"; the test
# then moves the files back into the namespace and verifies their sizes.
# Skipped when FILESET is set.
2165 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2168 echo "The target MDT-object is lost. The LFSCK should re-create the"
2169 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2170 echo "can move it back to normal namespace manually."
# Create a1/f1 (single stripe) and remember its size and FID; the FID is
# needed later to locate the recovered file in lost+found.
2173 check_mount_and_prep
2174 $LFS mkdir -i 0 $DIR/$tdir/a1
2175 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2176 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With "ls -il" the inode number is field 1, so the size is field 6.
2177 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2178 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2180 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDTs, also create a2/f2 (two stripes of 1M).
# NOTE(review): fid2 is assigned here without a visible "local"
# declaration, unlike fid1 and fid3.
2182 if [ $MDSCOUNT -ge 2 ]; then
2183 $LFS mkdir -i 1 $DIR/$tdir/a2
2184 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2185 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2186 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2188 $LFS getstripe $DIR/$tdir/a2/f2
# f3 is a two-component PFL file: [0, 1M) and [1M, EOF).
2191 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2192 error "(0) Fail to create PFL $DIR/$tdir/f3"
2194 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2196 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2197 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2199 $LFS getstripe $DIR/$tdir/f3
2201 cancel_lru_locks osc
# Arm fail_loc 0x1616 on the MDS(es) and unlink the files; per the echo
# below, this simulates losing the MDT-object.
2203 echo "Inject failure, to simulate the case of missing the MDT-object"
2204 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2205 do_facet mds1 $LCTL set_param fail_loc=0x1616
2206 rm -f $DIR/$tdir/a1/f1
2208 if [ $MDSCOUNT -ge 2 ]; then
2209 do_facet mds2 $LCTL set_param fail_loc=0x1616
2210 rm -f $DIR/$tdir/a2/f2
# Disarm the fault injection and drop cached client locks.
2218 do_facet mds1 $LCTL set_param fail_loc=0
2219 if [ $MDSCOUNT -ge 2 ]; then
2220 do_facet mds2 $LCTL set_param fail_loc=0
2223 cancel_lru_locks mdc
2224 cancel_lru_locks osc
2226 # dryrun mode only checks for orphans; it does not repair them
2227 echo "Trigger layout LFSCK --dryrun to find out orphan OST-object"
2228 $START_LAYOUT --dryrun -o -r ||
2229 error "Fail to start layout LFSCK in dryrun mode"
# Presumably waits until the layout LFSCK reports "completed" on all
# targets -- TODO confirm the meaning of the trailing "2" argument.
2230 wait_all_targets_blocked layout completed 2
# The "param" line of the layout status must reflect the options used.
2232 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
2233 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
2234 error "Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# In dry-run mode the orphans are counted under inconsistent_orphan.
2236 local orphans=$(do_facet mds1 $LCTL get_param -n \
2237 mdd.$(facet_svc mds1).lfsck_layout |
2238 awk '/^inconsistent_orphan/ { print $2 }')
2239 [ $orphans -eq 3 ] ||
2240 error "Expect 3 found on mds1, but got: $orphans"
2242 # orphan parents should not be created
# Every entry under lost+found must still be empty after the dry run.
2244 for subdir in $MOUNT/.lustre/lost+found/*; do
2245 [ ! "$(ls -A $subdir)" ] || error "$subdir not empty"
# Now run the real (repairing) layout LFSCK and wait for all MDTs.
2248 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2249 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2251 for k in $(seq $MDSCOUNT); do
2252 # The LFSCK status query interval is 30 seconds. In case some
2253 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
2254 # guarantee that the status has been synced up.
2255 wait_update_facet mds${k} "$LCTL get_param -n \
2256 mdd.$(facet_svc mds${k}).lfsck_layout |
2257 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2258 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are sampled once (no waiting) and must already be "completed".
2261 for k in $(seq $OSTCOUNT); do
2262 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2263 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2264 awk '/^status/ { print $2 }')
2265 [ "$cur_status" == "completed" ] ||
2266 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Check the repaired_orphan counters on each MDT.
2269 local repaired=$(do_facet mds1 $LCTL get_param -n \
2270 mdd.$(facet_svc mds1).lfsck_layout |
2271 awk '/^repaired_orphan/ { print $2 }')
2272 [ $repaired -eq 3 ] ||
2273 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2275 if [ $MDSCOUNT -ge 2 ]; then
2276 repaired=$(do_facet mds2 $LCTL get_param -n \
2277 mdd.$(facet_svc mds2).lfsck_layout |
2278 awk '/^repaired_orphan/ { print $2 }')
2279 [ $repaired -eq 2 ] ||
2280 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# Recovered files are expected under lost+found/MDTxxxx as "<fid>-R-0";
# move each one back to its original path.
2283 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2284 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2285 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2287 if [ $MDSCOUNT -ge 2 ]; then
2288 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2289 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# NOTE(review): this failure message reuses the "(5)" label already used
# for fid1 above, so the two failures cannot be told apart by label.
2292 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2293 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
# Print the FIDs and layouts after the repair for the test log.
2295 $LFS path2fid $DIR/$tdir/a1/f1
2296 $LFS getstripe $DIR/$tdir/a1/f1
2298 if [ $MDSCOUNT -ge 2 ]; then
2299 $LFS path2fid $DIR/$tdir/a2/f2
2300 $LFS getstripe $DIR/$tdir/a2/f2
2303 $LFS path2fid $DIR/$tdir/f3
2304 $LFS getstripe $DIR/$tdir/f3
# Final verification: sizes must match the values saved before the damage.
2306 echo "The file size should be correct after layout LFSCK scanning"
2307 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2308 [ "$cur_size" == "$saved_size1" ] ||
2309 error "(7) Expect file1 size $saved_size1, but got $cur_size"
# NOTE(review): f2 is compared with f1's saved size (both files were
# written with bs=1M count=2).
2311 if [ $MDSCOUNT -ge 2 ]; then
2312 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2313 [ "$cur_size" == "$saved_size1" ] ||
2314 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# NOTE(review): the "(9)" message says "file1" but this check is on f3.
2317 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2318 [ "$cur_size" == "$saved_size2" ] ||
2319 error "(9) Expect file1 size $saved_size2, but got $cur_size"
2321 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test 18c: the MDT-object is lost AND the OST-object carries no parent
# FID. Layout LFSCK is expected to re-create the MDT-object with a new FID
# under .lustre/lost+found/MDTxxxx (entries matching "*-N-*").
# Skipped when FILESET is set.
2324 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2327 echo "The target MDT-object is lost, and the OST-object FID is missing."
2328 echo "The LFSCK should re-create the MDT-object with new FID under the "
2329 echo "directory .lustre/lost+found/MDTxxxx."
2332 check_mount_and_prep
2333 $LFS mkdir -i 0 $DIR/$tdir/a1
2334 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
# Arm fail_loc 0x1617 on all OST nodes BEFORE the files are written, so
# that (per the echo below) the objects are created without a parent FID.
2336 echo "Inject failure, to simulate the case of missing parent FID"
2337 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2338 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2340 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2341 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDTs, a2 is made with "lfs mkdir -i 1" but its file is
# striped with "-c 1 -i 0", i.e. the same setstripe options as a1.
2343 if [ $MDSCOUNT -ge 2 ]; then
2344 $LFS mkdir -i 1 $DIR/$tdir/a2
2345 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2346 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2347 $LFS getstripe $DIR/$tdir/a2/f2
# f3 is a two-component PFL file: [0, 1M) and [1M, EOF).
2350 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2351 error "(0) Fail to create PFL $DIR/$tdir/f3"
2353 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2354 $LFS getstripe $DIR/$tdir/f3
# Drop cached OSC locks, then disarm the OST-side injection.
2356 cancel_lru_locks osc
2357 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Second injection: arm fail_loc 0x1616 on the MDS(es) and unlink the
# files to simulate losing the MDT-objects.
2359 echo "Inject failure, to simulate the case of missing the MDT-object"
2360 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2361 do_facet mds1 $LCTL set_param fail_loc=0x1616
2362 rm -f $DIR/$tdir/a1/f1
2364 if [ $MDSCOUNT -ge 2 ]; then
2365 do_facet mds2 $LCTL set_param fail_loc=0x1616
2366 rm -f $DIR/$tdir/a2/f2
# Disarm the MDS-side injection and drop cached client locks.
2374 do_facet mds1 $LCTL set_param fail_loc=0
2375 if [ $MDSCOUNT -ge 2 ]; then
2376 do_facet mds2 $LCTL set_param fail_loc=0
2379 cancel_lru_locks mdc
2380 cancel_lru_locks osc
# Run layout LFSCK with "-r -o" and wait for every MDT to complete.
2382 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2383 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2385 for k in $(seq $MDSCOUNT); do
2386 # The LFSCK status query interval is 30 seconds. In case some
2387 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
2388 # guarantee that the status has been synced up.
2389 wait_update_facet mds${k} "$LCTL get_param -n \
2390 mdd.$(facet_svc mds${k}).lfsck_layout |
2391 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2392 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are sampled once (no waiting) and must already be "completed".
2395 for k in $(seq $OSTCOUNT); do
2396 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2397 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2398 awk '/^status/ { print $2 }')
2399 [ "$cur_status" == "completed" ] ||
2400 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# The repaired count expected on mds1 is held in $expected and depends on
# MDSCOUNT. mds2, when present, is expected to have repaired nothing.
2403 if [ $MDSCOUNT -ge 2 ]; then
2409 local repaired=$(do_facet mds1 $LCTL get_param -n \
2410 mdd.$(facet_svc mds1).lfsck_layout |
2411 awk '/^repaired_orphan/ { print $2 }')
2412 [ $repaired -eq $expected ] ||
2413 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2415 if [ $MDSCOUNT -ge 2 ]; then
2416 repaired=$(do_facet mds2 $LCTL get_param -n \
2417 mdd.$(facet_svc mds2).lfsck_layout |
2418 awk '/^repaired_orphan/ { print $2 }')
2419 [ $repaired -eq 0 ] ||
2420 error "(5) Expect 0 fixed on mds2, but got: $repaired"
# List lost+found for the test log.
2423 ls -ail $MOUNT/.lustre/lost+found/
# NOTE(review): the find pattern *-N-* is unquoted in both find calls
# below, so the shell may expand it against the current directory before
# find sees it.
2425 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2426 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2427 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name *-N-*)
2429 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2432 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2433 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2434 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2436 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-N-*)
2437 [ ! -z "$cname" ] ||
2438 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2440 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d: a file's layout EA is corrupted so that it references another
# file's OST-object, which is then destroyed (a dangling reference), while
# the file's own original OST-object survives as an orphan. Layout LFSCK is
# expected to re-attach the orphan instead of creating a new OST-object.
2444 echo "The target MDT-object layout EA is corrupted, but the right"
2445 echo "OST-object is still alive as orphan. The layout LFSCK will"
2446 echo "not create new OST-object to occupy such slot."
2449 check_mount_and_prep
# f1/f2 form a plain-layout pair and f3/f4 a second pair, where f4 is a
# two-component PFL file. f1 and f3 are the "guard" files deleted later;
# f2 and f4 hold the data ("foo") that must survive.
2451 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2452 echo "guard" > $DIR/$tdir/a1/f1
2453 echo "foo" > $DIR/$tdir/a1/f2
2455 echo "guard" > $DIR/$tdir/a1/f3
2456 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2457 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2458 echo "foo" > $DIR/$tdir/a1/f4
# With "ls -il" the inode number is field 1, so the size is field 6.
2460 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2461 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Print FIDs and layouts of all four files for the test log.
2462 $LFS path2fid $DIR/$tdir/a1/f1
2463 $LFS getstripe $DIR/$tdir/a1/f1
2464 $LFS path2fid $DIR/$tdir/a1/f2
2465 $LFS getstripe $DIR/$tdir/a1/f2
2466 $LFS path2fid $DIR/$tdir/a1/f3
2467 $LFS getstripe $DIR/$tdir/a1/f3
2468 $LFS path2fid $DIR/$tdir/a1/f4
2469 $LFS getstripe $DIR/$tdir/a1/f4
2470 cancel_lru_locks osc
2472 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2473 echo "to reference the same OST-object (which is f1's OST-obejct)."
2474 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2475 echo "dangling reference case, but f2's old OST-object is there."
2477 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2478 echo "to reference the same OST-object (which is f3's OST-obejct)."
2479 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2480 echo "dangling reference case, but f4's old OST-object is there."
# With fail_loc 0x1618 armed, the chown calls on f2/f4 are the operations
# that (per the echoes above) rewrite their layout; removing f1/f3 then
# leaves f2/f4 dangling.
2483 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2484 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2485 chown 1.1 $DIR/$tdir/a1/f2
2486 chown 1.1 $DIR/$tdir/a1/f4
2487 rm -f $DIR/$tdir/a1/f1
2488 rm -f $DIR/$tdir/a1/f3
2491 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Bring the whole filesystem back up (setupall) so no stale object state
# stays cached, as announced by the echo.
2493 echo "stopall to cleanup object cache"
2496 setupall > /dev/null
# This run passes "-c -d" in addition to the "-r -o" used by the earlier
# tests.
2498 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2499 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
2501 for k in $(seq $MDSCOUNT); do
2502 # The LFSCK status query interval is 30 seconds. In case some
2503 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
2504 # guarantee that the status has been synced up.
2505 wait_update_facet mds${k} "$LCTL get_param -n \
2506 mdd.$(facet_svc mds${k}).lfsck_layout |
2507 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2508 error "(3) MDS${k} is not the expected 'completed'"
# The OSTs are sampled once (no waiting) and must already be "completed".
2511 for k in $(seq $OSTCOUNT); do
2512 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2513 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2514 awk '/^status/ { print $2 }')
2515 [ "$cur_status" == "completed" ] ||
2516 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Two orphans (the original objects of f2 and f4) must have been
# re-attached ...
2519 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2520 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2521 awk '/^repaired_orphan/ { print $2 }')
2522 [ $repaired -eq 2 ] ||
2523 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
# ... and no dangling reference may have been "repaired" by creating a
# new OST-object.
2525 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2526 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2527 awk '/^repaired_dangling/ { print $2 }')
2528 [ $repaired -eq 0 ] ||
2529 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
# f2 and f4 must report their original sizes again.
2531 echo "The file size should be correct after layout LFSCK scanning"
2532 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2533 [ "$cur_size" == "$saved_size1" ] ||
2534 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2536 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2537 [ "$cur_size" == "$saved_size2" ] ||
2538 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# Dump content, FID and layout for the log; the content is printed but
# not compared against the original data.
2540 echo "The LFSCK should find back the original data."
2541 cat $DIR/$tdir/a1/f2
2542 $LFS path2fid $DIR/$tdir/a1/f2
2543 $LFS getstripe $DIR/$tdir/a1/f2
2544 cat $DIR/$tdir/a1/f4
2545 $LFS path2fid $DIR/$tdir/a1/f4
2546 $LFS getstripe $DIR/$tdir/a1/f4
2548 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e: same corruption as test 18d, but while LFSCK is in
# scanning-phase2 the test writes new data into f2/f4, so the layout slot
# ends up occupied by a modified, newly created OST-object. LFSCK is then
# expected to keep the new data in f2/f4 and expose the old orphan
# OST-objects through new stub files ("*-C-*") under lost+found.
# Skipped when FILESET is set.
2551 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2554 echo "The target MDT-object layout EA slot is occpuied by some new"
2555 echo "created OST-object when repair dangling reference case. Such"
2556 echo "conflict OST-object has been modified by others. To keep the"
2557 echo "new data, the LFSCK will create a new file to refernece this"
2558 echo "old orphan OST-object."
2561 check_mount_and_prep
# f1/f2 form a plain-layout pair and f3/f4 a second pair, where f4 is a
# two-component PFL file. f1 and f3 are the "guard" files deleted later.
2563 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2564 echo "guard" > $DIR/$tdir/a1/f1
2565 echo "foo" > $DIR/$tdir/a1/f2
2567 echo "guard" > $DIR/$tdir/a1/f3
2568 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2569 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2570 echo "foo" > $DIR/$tdir/a1/f4
# With "ls -il" the inode number is field 1, so the size is field 6.
2572 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2573 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Print FIDs and layouts of all four files for the test log.
2575 $LFS path2fid $DIR/$tdir/a1/f1
2576 $LFS getstripe $DIR/$tdir/a1/f1
2577 $LFS path2fid $DIR/$tdir/a1/f2
2578 $LFS getstripe $DIR/$tdir/a1/f2
2579 $LFS path2fid $DIR/$tdir/a1/f3
2580 $LFS getstripe $DIR/$tdir/a1/f3
2581 $LFS path2fid $DIR/$tdir/a1/f4
2582 $LFS getstripe $DIR/$tdir/a1/f4
2583 cancel_lru_locks osc
2585 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2586 echo "to reference the same OST-object (which is f1's OST-obejct)."
2587 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2588 echo "dangling reference case, but f2's old OST-object is there."
2590 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2591 echo "to reference the same OST-object (which is f3's OST-obejct)."
2592 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2593 echo "dangling reference case, but f4's old OST-object is there."
# With fail_loc 0x1618 armed, chown f2/f4 (per the echoes above this
# rewrites their layout), then remove f1/f3 to leave f2/f4 dangling.
2596 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2597 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2598 chown 1.1 $DIR/$tdir/a1/f2
2599 chown 1.1 $DIR/$tdir/a1/f4
2600 rm -f $DIR/$tdir/a1/f1
2601 rm -f $DIR/$tdir/a1/f3
2604 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2606 echo "stopall to cleanup object cache"
2609 setupall > /dev/null
# Arm fail_loc 0x1602 with fail_val=10. Presumably this delays LFSCK so
# the test can act while it is in scanning-phase2 -- TODO confirm.
2611 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2612 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2614 start_full_debug_logging
2616 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2617 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait until mds1 reports "scanning-phase2" before modifying the files.
2619 wait_update_facet mds1 "$LCTL get_param -n \
2620 mdd.$(facet_svc mds1).lfsck_layout |
2621 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2622 error "(3) MDS1 is not the expected 'scanning-phase2'"
2624 # to guarantee all updates are synced.
# Append to f2/f4 so that the OST-objects now referenced by their layouts
# get modified.
2628 echo "Write new data to f2/f4 to modify the new created OST-object."
2629 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2630 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
# Disarm the fault injection and wait for LFSCK to complete on every MDT.
2632 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2634 for k in $(seq $MDSCOUNT); do
2635 # The LFSCK status query interval is 30 seconds. In case some
2636 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
2637 # guarantee that the status has been synced up.
2638 wait_update_facet mds${k} "$LCTL get_param -n \
2639 mdd.$(facet_svc mds${k}).lfsck_layout |
2640 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2641 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are sampled once (no waiting) and must already be "completed".
2644 for k in $(seq $OSTCOUNT); do
2645 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2646 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2647 awk '/^status/ { print $2 }')
2648 [ "$cur_status" == "completed" ] ||
2649 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2652 stop_full_debug_logging
# Two orphans (the original objects of f2 and f4) must have been handled.
2654 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2655 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2656 awk '/^repaired_orphan/ { print $2 }')
2657 [ $repaired -eq 2 ] ||
2658 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2660 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2661 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2662 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Exactly two "*-C-*" stubs are expected. The count is the number of lines
# printed by "ls -l" for the glob.
2664 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2665 if [ $count -ne 2 ]; then
2666 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2667 error "(8) Expect 2 stubs under lost+found, but got $count"
# Check the first and then the last stub returned by find: each must have
# the original size of either f2 or f4. The error fires only when the size
# matches neither saved value.
# NOTE(review): the find pattern *-C-* is unquoted, so the shell may
# expand it against the current directory before find sees it.
2670 echo "The stub file should keep the original f2 or f4 data"
2671 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-C-* | head -n 1)
2672 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2673 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2674 error "(9) Got unexpected $cur_size"
2677 $LFS path2fid $cname
2678 $LFS getstripe $cname
2680 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-C-* | tail -n 1)
2681 cur_size=$(ls -il $cname | awk '{ print $6 }')
2682 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2683 error "(10) Got unexpected $cur_size"
2686 $LFS path2fid $cname
2687 $LFS getstripe $cname
# Dump content, FID and layout of f2/f4 for the log; the content is
# printed but not compared.
2689 echo "The f2/f4 should contains new data."
2690 cat $DIR/$tdir/a1/f2
2691 $LFS path2fid $DIR/$tdir/a1/f2
2692 $LFS getstripe $DIR/$tdir/a1/f2
2693 cat $DIR/$tdir/a1/f4
2694 $LFS path2fid $DIR/$tdir/a1/f4
2695 $LFS getstripe $DIR/$tdir/a1/f4
2697 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f: MDT-objects are lost and, during the first LFSCK run, a
# failure is injected (per the echoes below) so that one OST fails the
# first-stage scan. LFSCK must skip orphan handling for that OST only
# (status "partial") and still repair the others; a second, clean run must
# then finish the repair. Needs at least two OSTs.
2700 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2703 echo "The target MDT-object is lost. The LFSCK should re-create the"
2704 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2705 echo "to verify some OST-object(s) during the first stage-scanning,"
2706 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2707 echo "should not be affected."
# a1 holds single-stripe files (guard, f1); a2 holds a two-stripe file
# (f2). Both directories are made with "lfs mkdir -i 0".
2710 check_mount_and_prep
2711 $LFS mkdir -i 0 $DIR/$tdir/a1
2712 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2713 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2714 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2715 $LFS mkdir -i 0 $DIR/$tdir/a2
2716 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2717 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2718 $LFS getstripe $DIR/$tdir/a1/f1
2719 $LFS getstripe $DIR/$tdir/a2/f2
# With two or more MDTs, build the same structure (a3, a4) with
# "lfs mkdir -i 1".
2721 if [ $MDSCOUNT -ge 2 ]; then
2722 $LFS mkdir -i 1 $DIR/$tdir/a3
2723 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2724 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2725 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2726 $LFS mkdir -i 1 $DIR/$tdir/a4
2727 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2728 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2729 $LFS getstripe $DIR/$tdir/a3/f3
2730 $LFS getstripe $DIR/$tdir/a4/f4
2733 cancel_lru_locks osc
# Arm fail_loc 0x1616 and unlink the data files (the guard files stay) to
# simulate lost MDT-objects.
2735 echo "Inject failure, to simulate the case of missing the MDT-object"
2736 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2737 do_facet mds1 $LCTL set_param fail_loc=0x1616
2738 rm -f $DIR/$tdir/a1/f1
2739 rm -f $DIR/$tdir/a2/f2
2741 if [ $MDSCOUNT -ge 2 ]; then
2742 do_facet mds2 $LCTL set_param fail_loc=0x1616
2743 rm -f $DIR/$tdir/a3/f3
2744 rm -f $DIR/$tdir/a4/f4
# Disarm the fault injection and drop cached client locks.
2750 do_facet mds1 $LCTL set_param fail_loc=0
2751 if [ $MDSCOUNT -ge 2 ]; then
2752 do_facet mds2 $LCTL set_param fail_loc=0
2755 cancel_lru_locks mdc
2756 cancel_lru_locks osc
# Arm fail_loc 0x161c on mds1; fail_val=0 presumably selects OST index 0
# as the target that fails -- TODO confirm.
2758 echo "Inject failure, to simulate the OST0 fail to handle"
2759 echo "MDT0 LFSCK request during the first-stage scanning."
2760 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2761 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2763 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2764 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# First run: every MDT is expected to end in "partial", not "completed".
2766 for k in $(seq $MDSCOUNT); do
2767 # The LFSCK status query interval is 30 seconds. In case some
2768 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
2769 # guarantee that the status has been synced up.
2770 wait_update_facet mds${k} "$LCTL get_param -n \
2771 mdd.$(facet_svc mds${k}).lfsck_layout |
2772 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2773 error "(2) MDS${k} is not the expected 'partial'"
# ost1 (the failed one) must be "partial"; ost2 must be unaffected.
2776 wait_update_facet ost1 "$LCTL get_param -n \
2777 obdfilter.$(facet_svc ost1).lfsck_layout |
2778 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2779 error "(3) OST1 is not the expected 'partial'"
2782 wait_update_facet ost2 "$LCTL get_param -n \
2783 obdfilter.$(facet_svc ost2).lfsck_layout |
2784 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2785 error "(4) OST2 is not the expected 'completed'"
2788 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the partial run exactly one orphan per MDT is expected to have
# been repaired.
# NOTE(review): the messages below print a literal "mds{1}" / "mds{2}";
# the braces are not a variable expansion.
2790 local repaired=$(do_facet mds1 $LCTL get_param -n \
2791 mdd.$(facet_svc mds1).lfsck_layout |
2792 awk '/^repaired_orphan/ { print $2 }')
2793 [ $repaired -eq 1 ] ||
2794 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2796 if [ $MDSCOUNT -ge 2 ]; then
2797 repaired=$(do_facet mds2 $LCTL get_param -n \
2798 mdd.$(facet_svc mds2).lfsck_layout |
2799 awk '/^repaired_orphan/ { print $2 }')
2800 [ $repaired -eq 1 ] ||
2801 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second run with no fault injection: everything must complete.
2804 echo "Trigger layout LFSCK on all devices again to cleanup"
2805 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2807 for k in $(seq $MDSCOUNT); do
2808 # The LFSCK status query interval is 30 seconds. In case some
2809 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
2810 # guarantee that the status has been synced up.
2811 wait_update_facet mds${k} "$LCTL get_param -n \
2812 mdd.$(facet_svc mds${k}).lfsck_layout |
2813 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2814 error "(8) MDS${k} is not the expected 'completed'"
# NOTE(review): cur_status is assigned here without a visible "local"
# declaration.
2817 for k in $(seq $OSTCOUNT); do
2818 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2819 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2820 awk '/^status/ { print $2 }')
2821 [ "$cur_status" == "completed" ] ||
2822 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# The second run is expected to report two repaired orphans per MDT.
# NOTE(review): "repaired" was already declared local earlier in this
# test; the second "local" is redundant.
2826 local repaired=$(do_facet mds1 $LCTL get_param -n \
2827 mdd.$(facet_svc mds1).lfsck_layout |
2828 awk '/^repaired_orphan/ { print $2 }')
2829 [ $repaired -eq 2 ] ||
2830 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2832 if [ $MDSCOUNT -ge 2 ]; then
2833 repaired=$(do_facet mds2 $LCTL get_param -n \
2834 mdd.$(facet_svc mds2).lfsck_layout |
2835 awk '/^repaired_orphan/ { print $2 }')
2836 [ $repaired -eq 2 ] ||
2837 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2840 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Test 18g: the MDT-object is lost but its OI mapping is left behind.
# Layout LFSCK must re-create the MDT-object without being misled by the
# stale OI mapping; the file is then moved back from lost+found.
# Skipped when FILESET is set.
2843 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2846 echo "The target MDT-object is lost, but related OI mapping is there"
2847 echo "The LFSCK should recreate the lost MDT-object without affected"
2848 echo "by the stale OI mapping."
# a1/f1 is striped with "-c -1" and $OSTCOUNT x 1M is written, so the
# expected repaired_orphan count checked later is $OSTCOUNT.
2851 check_mount_and_prep
2852 $LFS mkdir -i 0 $DIR/$tdir/a1
2853 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2854 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
# Remember the FID; it is needed to locate the recovered "<fid>-R-0"
# entry in lost+found.
2855 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2857 $LFS getstripe $DIR/$tdir/a1/f1
2858 cancel_lru_locks osc
# Arm fail_loc 0x162e on mds1 and unlink the file; per the echo below the
# MDT-object is lost while the OI mapping stays.
2860 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2861 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2862 do_facet mds1 $LCTL set_param fail_loc=0x162e
2863 rm -f $DIR/$tdir/a1/f1
# Disarm the fault injection and drop cached client locks.
2865 do_facet mds1 $LCTL set_param fail_loc=0
2866 cancel_lru_locks mdc
2867 cancel_lru_locks osc
2869 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2870 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2872 for k in $(seq $MDSCOUNT); do
2873 # The LFSCK status query interval is 30 seconds. In case some
2874 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
2875 # guarantee that the status has been synced up.
2876 wait_update_facet mds${k} "$LCTL get_param -n \
2877 mdd.$(facet_svc mds${k}).lfsck_layout |
2878 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2879 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are sampled once (no waiting) and must already be "completed".
2882 for k in $(seq $OSTCOUNT); do
2883 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2884 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2885 awk '/^status/ { print $2 }')
2886 [ "$cur_status" == "completed" ] ||
2887 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# One repaired orphan is expected per OST.
2890 local repaired=$(do_facet mds1 $LCTL get_param -n \
2891 mdd.$(facet_svc mds1).lfsck_layout |
2892 awk '/^repaired_orphan/ { print $2 }')
2893 [ $repaired -eq $OSTCOUNT ] ||
2894 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
# Move the recovered file back to its original path and print its FID and
# layout for the log.
2896 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2897 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2898 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2900 $LFS path2fid $DIR/$tdir/a1/f1
2901 $LFS getstripe $DIR/$tdir/a1/f1
2903 run_test 18g "Find out orphan OST-object and repair it (7)"
# Test 18h: a PFL file's extent range is corrupted. Writes to the broken
# file must fail; after layout LFSCK rebuilds the extents from the orphan
# stripe information, the data must match a pristine copy and writes must
# succeed again.
2907 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
2908 echo "the layout LFSCK will keep the bad PFL file(s) there without"
2909 echo "scanning its OST-object(s). Then in the second stage scanning,"
2910 echo "the OST will return related OST-object(s) to the MDT as orphan."
2911 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
2912 echo "the 'orphan(s)' stripe information."
2915 check_mount_and_prep
# f0 is a two-component PFL file: [0, 2M) with one 1M stripe, then
# [2M, EOF).
2917 $LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
2918 error "(0) Fail to create PFL $DIR/$tdir/f0"
# Put data into both components: test-framework.sh at offset 0 and again
# at offset 2M (bs=1M seek=2).
2920 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
2921 error "(1.1) Fail to write $DIR/$tdir/f0"
2923 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
2924 error "(1.2) Fail to write $DIR/$tdir/f0"
# Keep a pristine copy for the data comparison at the end.
2926 cp $DIR/$tdir/f0 $DIR/$tdir/guard
# With fail_loc 0x162f armed, the chown is the operation that (per the
# echo below) leaves f0 with a bad PFL extent range.
2928 echo "Inject failure stub to simulate bad PFL extent range"
2929 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
2930 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
2932 chown 1.1 $DIR/$tdir/f0
2934 cancel_lru_locks mdc
2935 cancel_lru_locks osc
2936 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Negative check: dd is expected to FAIL here, so success is the error.
2938 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
2939 error "(2) Write to bad PFL file should fail"
2941 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
2942 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2944 for k in $(seq $MDSCOUNT); do
2945 # The LFSCK status query interval is 30 seconds. In case some
2946 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
2947 # guarantee that the status has been synced up.
2948 wait_update_facet mds${k} "$LCTL get_param -n \
2949 mdd.$(facet_svc mds${k}).lfsck_layout |
2950 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2951 error "(4.1) MDS${k} is not the expected 'completed'"
# NOTE(review): cur_status is assigned here without a visible "local"
# declaration.
2954 for k in $(seq $OSTCOUNT); do
2955 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2956 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2957 awk '/^status/ { print $2 }')
2958 [ "$cur_status" == "completed" ] ||
2959 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
# Two repaired orphans are expected -- presumably one OST-object per PFL
# component of f0 (TODO confirm).
2963 local repaired=$($SHOW_LAYOUT |
2964 awk '/^repaired_orphan/ { print $2 }')
2965 [ $repaired -eq 2 ] ||
2966 error "(5) Fail to repair crashed PFL range: $repaired"
# The repaired file must be byte-identical to the pristine copy.
2968 echo "Data in $DIR/$tdir/f0 should not be broken"
2969 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
2970 error "(6) Data in $DIR/$tdir/f0 is broken"
# Positive check: the same write that failed before must now succeed.
2972 echo "Write should succeed after LFSCK repairing the bad PFL range"
2973 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
2974 error "(7) Write should succeed after LFSCK"
2976 run_test 18h "LFSCK can repair crashed PFL extent range"
# Top-level statement between tests: passes "debug=-cache" to lctl set_param
# on the local node and discards its stdout.
# NOTE(review): presumably this removes the "cache" flag from the debug
# mask -- confirm against the lctl documentation.
2978 $LCTL set_param debug=-cache > /dev/null
# Body of the test registered below as "19a" (function header and closing
# brace are not visible in this extract).
# Flow: with lfsck_verify_pfid set to 0 on OST0000, create a plain file, a
# PFL file and a guard file; then set lfsck_verify_pfid to 1, arm fail_loc
# 0x1619 on the local node and expect both reads to be refused.
2981 check_mount_and_prep
2982 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2984 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2985 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2987 echo "foo1" > $DIR/$tdir/a0
2988 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
2989 error "(0) Fail to create PFL $DIR/$tdir/a1"
2990 echo "foo2" > $DIR/$tdir/a1
2991 echo "guard" > $DIR/$tdir/a2
2992 cancel_lru_locks osc
2994 echo "Inject failure, then client will offer wrong parent FID when read"
2995 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2996 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
# Unlike the calls above, this fail_loc is set with a bare $LCTL, i.e. on the
# node running the script rather than through do_nodes/do_facet.
2998 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2999 $LCTL set_param fail_loc=0x1619
# Negative checks: a successful cat is the failure case.
3001 echo "Read RPC with wrong parent FID should be denied"
3002 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
3003 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
3004 $LCTL set_param fail_loc=0
3006 run_test 19a "OST-object inconsistency self detect"
# Body of the test registered below as "19b" (function header and closing
# brace are not visible in this extract).
# Flow: with lfsck_verify_pfid set to 0 and fail_loc 0x1611 armed on the OST
# nodes, create a plain file and a PFL file; verify the "repaired" counter
# is 0 while verification is off; then set lfsck_verify_pfid to 1, read both
# files, and expect the counter to become 2 (one per file).
3009 check_mount_and_prep
3010 $LFS setstripe -c 1 -i 0 $DIR/$tdir
3012 echo "Inject failure stub to make the OST-object to back point to"
3013 echo "non-exist MDT-object"
3015 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3016 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
3018 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
3019 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
3020 echo "foo1" > $DIR/$tdir/f0
3021 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
3022 error "(0) Fail to create PFL $DIR/$tdir/f1"
3023 echo "foo2" > $DIR/$tdir/f1
3024 cancel_lru_locks osc
3025 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
3027 do_facet ost1 $LCTL set_param -n \
3028 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
3029 echo "Nothing should be fixed since self detect and repair is disabled"
# The "repaired" value is parsed from the lfsck_verify_pfid parameter output.
3030 local repaired=$(do_facet ost1 $LCTL get_param -n \
3031 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
3032 awk '/^repaired/ { print $2 }')
3033 [ $repaired -eq 0 ] ||
3034 error "(1) Expected 0 repaired, but got $repaired"
3036 echo "Read RPC with right parent FID should be accepted,"
3037 echo "and cause parent FID on OST to be fixed"
3039 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3040 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
3042 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
3043 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
# Two files were read, so two repairs are expected; the error text now
# states the same number the condition checks.
3045 repaired=$(do_facet ost1 $LCTL get_param -n \
3046 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
3047 awk '/^repaired/ { print $2 }')
3048 [ $repaired -eq 2 ] ||
3049 error "(3) Expected 2 repaired, but got $repaired"
3051 run_test 19b "OST-object inconsistency self repair"
# Expected "lfs getstripe -L" results used by the tests that follow: the
# first is compared for files whose layout has a LOV EA hole, the second for
# files with a complete layout.
3053 PATTERN_WITH_HOLE="40000001"
3054 PATTERN_WITHOUT_HOLE="raid0"
# Opening of the test registered further below as "20a" (the function header
# is not visible in this extract): skip guards, then a description of the
# scenario printed to the test log.
3057 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
# Return explicitly, like the guard above, so the test cannot continue with
# FILESET set if skip() returns to the caller instead of exiting.
3058 [ -n "$FILESET" ] && skip "Not functional for FILESET set" && return
3061 echo "The target MDT-object and some of its OST-object are lost."
3062 echo "The LFSCK should find out the left OST-objects and re-create"
3063 echo "the MDT-object under the directory .lustre/lost+found/MDTxxxx/"
3064 echo "with the partial OST-objects (LOV EA hole)."
3066 echo "New client can access the file with LOV EA hole via normal"
3067 echo "system tools or commands without crash the system."
3069 echo "For old client, even though it cannot access the file with"
3070 echo "LOV EA hole, it should not cause the system crash."
# test_20a setup. Short lines (else/fi, the bcount assignments, some echo
# lines) are missing from this extract, so $bcount is used below without a
# visible definition.
# Creates directory a1 on MDT index 0, gives it a 3-stripe or 2-stripe
# layout depending on OSTCOUNT, writes the test files, records their FIDs
# for the later lost+found lookups and prints their layouts.
3073 check_mount_and_prep
3074 $LFS mkdir -i 0 $DIR/$tdir/a1
3075 if [ $OSTCOUNT -gt 2 ]; then
3076 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
3079 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
3083 # 256 blocks on the stripe0.
3084 # 1 block on the stripe1 for 2 OSTs case.
3085 # 256 blocks on the stripe1 for other cases.
3086 # 1 block on the stripe2 if OSTs > 2
3087 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
3088 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
3089 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
3091 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
3092 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
3093 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
3096 $LFS getstripe $DIR/$tdir/a1/f0
3098 $LFS getstripe $DIR/$tdir/a1/f1
3100 $LFS getstripe $DIR/$tdir/a1/f2
# A fourth file is only created when more than two OSTs are available.
3102 if [ $OSTCOUNT -gt 2 ]; then
3103 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
3104 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
3106 $LFS getstripe $DIR/$tdir/a1/f3
3109 cancel_lru_locks osc
# test_20a failure injection. Each rm runs while a fail_loc is armed on mds1;
# per the echo texts this simulates losing the MDT-object (0x1616), or the
# MDT-object plus one OST-object (0x161a, with fail_val naming the object
# index for f2 and f3). The client is then unmounted and the fail_loc and
# fail_val are reset.
3111 echo "Inject failure..."
3112 echo "To simulate f0 lost MDT-object"
3113 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3114 do_facet mds1 $LCTL set_param fail_loc=0x1616
3115 rm -f $DIR/$tdir/a1/f0
# No fail_val is set for f1 in the visible lines.
# NOTE(review): presumably it is still 0 here, selecting OST-object0 -- confirm.
3117 echo "To simulate f1 lost MDT-object and OST-object0"
3118 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3119 do_facet mds1 $LCTL set_param fail_loc=0x161a
3120 rm -f $DIR/$tdir/a1/f1
3122 echo "To simulate f2 lost MDT-object and OST-object1"
3123 do_facet mds1 $LCTL set_param fail_val=1
3124 rm -f $DIR/$tdir/a1/f2
3126 if [ $OSTCOUNT -gt 2 ]; then
3127 echo "To simulate f3 lost MDT-object and OST-object2"
3128 do_facet mds1 $LCTL set_param fail_val=2
3129 rm -f $DIR/$tdir/a1/f3
3132 umount_client $MOUNT
3135 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# test_20a LFSCK run: start layout LFSCK, wait for every MDS to report
# "completed" (timeout argument 32), require every OST to already report
# "completed", then check the repaired_orphan counter on mds1 (9 when
# OSTCOUNT > 2, otherwise 4) and remount the client.
3137 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3138 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3140 for k in $(seq $MDSCOUNT); do
3141 # The LFSCK status query internal is 30 seconds. For the case
3142 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3143 # time to guarantee the status sync up.
3144 wait_update_facet mds${k} "$LCTL get_param -n \
3145 mdd.$(facet_svc mds${k}).lfsck_layout |
3146 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3147 error "(2) MDS${k} is not the expected 'completed'"
3150 for k in $(seq $OSTCOUNT); do
3151 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3152 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3153 awk '/^status/ { print $2 }')
3154 [ "$cur_status" == "completed" ] ||
3155 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
3158 local repaired=$(do_facet mds1 $LCTL get_param -n \
3159 mdd.$(facet_svc mds1).lfsck_layout |
3160 awk '/^repaired_orphan/ { print $2 }')
# The else/fi lines of this conditional are missing from the extract: check
# (4.1) belongs to the OSTCOUNT > 2 branch, check (4.2) to the other branch.
3161 if [ $OSTCOUNT -gt 2 ]; then
3162 [ $repaired -eq 9 ] ||
3163 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3165 [ $repaired -eq 4 ] ||
3166 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
3169 mount_client $MOUNT || error "(5.0) Fail to start client!"
# test_20a checks for the re-created f0 (all of its OST-objects survived):
# the file under lost+found must have a hole-free pattern, the full stripe
# count, the original size, and must accept read, append, chown, touch and
# unlink.
# LOV_PATTERN_F_HOLE is assigned here but is not referenced by any other
# visible line of this chunk.
3171 LOV_PATTERN_F_HOLE=0x40000000
3174 # ${fid0}-R-0 is the old f0
3176 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3177 echo "Check $name, which is the old f0"
3179 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
3181 local pattern=$($LFS getstripe -L $name)
3182 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3183 error "(5.2) NOT expect pattern flag hole, but got $pattern"
3185 local stripes=$($LFS getstripe -c $name)
# else/fi are missing from the extract: (5.3.1) is the OSTCOUNT > 2 branch,
# (5.3.2) the other branch.
3186 if [ $OSTCOUNT -gt 2 ]; then
3187 [ $stripes -eq 3 ] ||
3188 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3190 [ $stripes -eq 2 ] ||
3191 error "(5.3.2) expect the stripe count is 2, but got $stripes"
3194 local size=$(stat $name | awk '/Size:/ { print $2 }')
3195 [ $size -eq $((4096 * $bcount)) ] ||
3196 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
3198 cat $name > /dev/null || error "(5.5) cannot read $name"
3200 echo "dummy" >> $name || error "(5.6) cannot write $name"
3202 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3204 touch $name || error "(5.8) cannot touch $name"
3206 rm -f $name || error "(5.9) cannot unlink $name"
# test_20a checks for the re-created f1 (its stripe0 object was lost): the
# pattern must carry the hole flag, stripe count and size must be unchanged,
# a plain read must fail, a dd with conv=sync,noerror must report exactly
# 256 "Input/output error" lines, a write at offset 0 must fail while a
# write at block 300 must succeed, and append must fail.
3209 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3211 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3212 if [ $OSTCOUNT -gt 2 ]; then
3213 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3215 echo "Check $name, it contains the old f1's stripe1"
3218 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3220 pattern=$($LFS getstripe -L $name)
3221 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3222 error "(6.2) expect pattern flag hole, but got $pattern"
3224 stripes=$($LFS getstripe -c $name)
3225 if [ $OSTCOUNT -gt 2 ]; then
3226 [ $stripes -eq 3 ] ||
3227 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3229 [ $stripes -eq 2 ] ||
3230 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3233 size=$(stat $name | awk '/Size:/ { print $2 }')
3234 [ $size -eq $((4096 * $bcount)) ] ||
3235 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
3237 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# Count the per-block read errors reported on dd's stderr.
3239 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3240 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3243 [ $failures -eq 256 ] ||
3244 error "(6.6) expect 256 IO failures, but get $failures"
3246 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3247 [ $size -eq $((4096 * $bcount)) ] ||
3248 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
3250 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3251 error "(6.8) write to the LOV EA hole should fail"
3253 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3254 error "(6.9) write to normal stripe should NOT fail"
3256 echo "foo" >> $name && error "(6.10) append write $name should fail"
3258 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3260 touch $name || error "(6.12) cannot touch $name"
3262 rm -f $name || error "(6.13) cannot unlink $name"
# test_20a checks for the re-created f2 (its stripe1 object was lost). The
# else/fi lines of the big OSTCOUNT conditional are missing from this
# extract: checks tagged x.1 / (7.6) / (7.7) / (7.8.0) / (7.8.1) / (7.8.3)
# form the OSTCOUNT > 2 branch, checks tagged x.2 the two-OST branch.
# In the two-OST branch the expected size is only 4096 * 256 and the hole is
# probed at block 256.
3265 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3267 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3268 if [ $OSTCOUNT -gt 2 ]; then
3269 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3271 echo "Check $name, it contains the old f2's stripe0"
3274 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3276 pattern=$($LFS getstripe -L $name)
3277 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3278 error "(7.2) expect pattern flag hole, but got $pattern"
3280 stripes=$($LFS getstripe -c $name)
3281 size=$(stat $name | awk '/Size:/ { print $2 }')
3282 if [ $OSTCOUNT -gt 2 ]; then
3283 [ $stripes -eq 3 ] ||
3284 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3286 [ $size -eq $((4096 * $bcount)) ] ||
3287 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3289 cat $name > /dev/null &&
3290 error "(7.5.1) normal read $name should fail"
3292 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3293 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3295 [ $failures -eq 256 ] ||
3296 error "(7.6) expect 256 IO failures, but get $failures"
3298 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3299 [ $size -eq $((4096 * $bcount)) ] ||
3300 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
3302 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3303 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3305 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3306 error "(7.8.1) write to normal stripe should NOT fail"
3308 echo "foo" >> $name &&
3309 error "(7.8.3) append write $name should fail"
3311 chown $RUNAS_ID:$RUNAS_GID $name ||
3312 error "(7.9.1) cannot chown on $name"
3314 touch $name || error "(7.10.1) cannot touch $name"
3316 [ $stripes -eq 2 ] ||
3317 error "(7.3.2) expect the stripe count is 2, but got $stripes"
3320 [ $size -eq $((4096 * (256 + 0))) ] ||
3321 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3323 cat $name > /dev/null &&
3324 error "(7.5.2) normal read $name should fail"
3326 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3327 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3328 [ $failures -eq 256 ] ||
3329 error "(7.6.2) expect 256 IO failures, but get $failures"
3332 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3333 [ $size -eq $((4096 * $bcount)) ] ||
3334 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
3336 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3337 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3339 chown $RUNAS_ID:$RUNAS_GID $name ||
3340 error "(7.9.2) cannot chown on $name"
3342 touch $name || error "(7.10.2) cannot touch $name"
3345 rm -f $name || error "(7.11) cannot unlink $name"
# test_20a checks for the re-created f3, only reached with more than two
# OSTs (the guard below returns otherwise). The file lost its stripe2
# object: the pattern must carry the hole flag, the stripe count must be 3,
# the size is expected to be 4096 * 512, a plain read must fail, dd must
# report 256 I/O errors, and a write at block 512 must fail.
3347 [ $OSTCOUNT -le 2 ] && return
3350 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
3352 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3353 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3355 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3357 pattern=$($LFS getstripe -L $name)
3358 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3359 error "(8.2) expect pattern flag hole, but got $pattern"
3361 stripes=$($LFS getstripe -c $name)
3362 [ $stripes -eq 3 ] ||
3363 error "(8.3) expect the stripe count is 3, but got $stripes"
3365 size=$(stat $name | awk '/Size:/ { print $2 }')
3367 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3368 error "(8.4) expect the size $((4096 * 512)), but got $size"
3370 cat $name > /dev/null &&
3371 error "(8.5) normal read $name should fail"
3373 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3374 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3376 [ $failures -eq 256 ] ||
3377 error "(8.6) expect 256 IO failures, but get $failures"
3380 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3381 [ $size -eq $((4096 * $bcount)) ] ||
3382 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
3384 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3385 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3387 chown $RUNAS_ID:$RUNAS_GID $name ||
3388 error "(8.9) cannot chown on $name"
3390 touch $name || error "(8.10) cannot touch $name"
3392 rm -f $name || error "(8.11) cannot unlink $name"
3394 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# Opening of the test registered further below as "20b" (the function header
# is not visible in this extract): skip guards, then a description of the
# PFL variant of the LOV EA hole scenario printed to the test log.
3397 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
# Return explicitly, like the guard above, so the test cannot continue with
# FILESET set if skip() returns to the caller instead of exiting.
3398 [ -n "$FILESET" ] && skip "Not functional for FILESET set" && return
3401 echo "The target MDT-object and some of its OST-object are lost."
3402 echo "The LFSCK should find out the left OST-objects and re-create"
3403 echo "the MDT-object under the directory .lustre/lost+found/MDTxxxx/"
3404 echo "with the partial OST-objects (LOV EA hole)."
3406 echo "New client can access the file with LOV EA hole via normal"
3407 echo "system tools or commands without crash the system - PFL case."
# test_20b setup: create three PFL files with two components each
# ([0, 2M) and [2M, EOF), both with 2 stripes of 1M), write 256 * 3 + 1
# blocks of 4096 bytes into each, record their FIDs, print their layouts
# and drop the client's mdc and osc locks.
3410 check_mount_and_prep
3412 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3413 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3414 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3415 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3416 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3417 error "(2) Fail to create PFL file $DIR/$tdir/f2"
3419 local bcount=$((256 * 3 + 1))
3421 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3422 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3423 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
3425 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3426 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3427 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3430 $LFS getstripe $DIR/$tdir/f0
3432 $LFS getstripe $DIR/$tdir/f1
3434 $LFS getstripe $DIR/$tdir/f2
3436 cancel_lru_locks mdc
3437 cancel_lru_locks osc
# test_20b failure injection: arms fail_loc 0x1616, then 0x161a, then sets
# fail_val=1 on the MDS, and finally resets both.
# NOTE(review): the commands that act on f0/f1/f2 between these steps are
# not visible in this extract (short lines were dropped); presumably each
# file is removed while the corresponding fail_loc is armed, as in test 20a.
3439 echo "Inject failure..."
3440 echo "To simulate f0 lost MDT-object"
3441 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3442 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3445 echo "To simulate the case of f1 lost MDT-object and "
3446 echo "the first OST-object in each PFL component"
3447 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3448 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3451 echo "To simulate the case of f2 lost MDT-object and "
3452 echo "the second OST-object in each PFL component"
3453 do_facet $SINGLEMDS $LCTL set_param fail_val=1
3458 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# test_20b LFSCK run: start layout LFSCK, wait for every MDS to report
# "completed" (timeout argument 32), require every OST to already report
# "completed", then expect the repaired_orphan counter on mds1 to be 8.
3460 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3461 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3463 for k in $(seq $MDSCOUNT); do
3464 # The LFSCK status query internal is 30 seconds. For the case
3465 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3466 # time to guarantee the status sync up.
3467 wait_update_facet mds${k} "$LCTL get_param -n \
3468 mdd.$(facet_svc mds${k}).lfsck_layout |
3469 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3470 error "(4) MDS${k} is not the expected 'completed'"
3473 for k in $(seq $OSTCOUNT); do
3474 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3475 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3476 awk '/^status/ { print $2 }')
3477 [ "$cur_status" == "completed" ] ||
3478 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
3481 local repaired=$(do_facet mds1 $LCTL get_param -n \
3482 mdd.$(facet_svc mds1).lfsck_layout |
3483 awk '/^repaired_orphan/ { print $2 }')
3484 [ $repaired -eq 8 ] ||
3485 error "(6) Expect 8 fixed on mds1, but got: $repaired"
# test_20b checks for the re-created f0 (no OST-object lost): both
# components (selected with -I1 / -I2) must have a hole-free pattern and
# 2 stripes, the extents must be [0, 2097152) and [2097152, EOF), the size
# must be unchanged, and read, append, chown, touch and unlink must work.
3488 # ${fid0}-R-0 is the old f0
3490 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3491 echo "Check $name, which is the old f0"
3493 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3495 local pattern=$($LFS getstripe -L -I1 $name)
3496 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3497 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3499 pattern=$($LFS getstripe -L -I2 $name)
3500 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3501 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3503 local stripes=$($LFS getstripe -c -I1 $name)
3504 [ $stripes -eq 2 ] ||
3505 error "(7.3.1) expect 2 stripes, but got $stripes"
3507 stripes=$($LFS getstripe -c -I2 $name)
3508 [ $stripes -eq 2 ] ||
3509 error "(7.3.2) expect 2 stripes, but got $stripes"
# Extent boundaries are parsed from the lcme_extent.* lines of getstripe.
3511 local e_start=$($LFS getstripe -I1 $name |
3512 awk '/lcme_extent.e_start:/ { print $2 }')
3513 [ $e_start -eq 0 ] ||
3514 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3516 local e_end=$($LFS getstripe -I1 $name |
3517 awk '/lcme_extent.e_end:/ { print $2 }')
3518 [ $e_end -eq 2097152 ] ||
3519 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3521 e_start=$($LFS getstripe -I2 $name |
3522 awk '/lcme_extent.e_start:/ { print $2 }')
3523 [ $e_start -eq 2097152 ] ||
3524 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
# The last component's end is compared as the string "EOF", not a number.
3526 e_end=$($LFS getstripe -I2 $name |
3527 awk '/lcme_extent.e_end:/ { print $2 }')
3528 [ "$e_end" = "EOF" ] ||
3529 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3531 local size=$(stat $name | awk '/Size:/ { print $2 }')
3532 [ $size -eq $((4096 * $bcount)) ] ||
3533 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
3535 cat $name > /dev/null || error "(7.7) cannot read $name"
3537 echo "dummy" >> $name || error "(7.8) cannot write $name"
3539 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3541 touch $name || error "(7.10) cannot touch $name"
3543 rm -f $name || error "(7.11) cannot unlink $name"
# test_20b checks for the re-created f1 (the first OST-object of each
# component was lost): both components must carry the hole pattern and keep
# 2 stripes and their original extents, the size must be unchanged, a plain
# read must fail, dd with conv=sync,noerror must report 512 I/O errors, a
# write at offset 0 must fail while a write at block 300 must succeed, and
# append must fail.
# The first stripe-count check is tagged (8.3.1) so a failure can be told
# apart from the second component's (8.3.2) check.
3546 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3548 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3549 echo "Check $name, it contains f1's second OST-object in each COMP"
3551 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3553 pattern=$($LFS getstripe -L -I1 $name)
3554 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3555 error "(8.2.1) expect pattern flag hole, but got $pattern"
3557 pattern=$($LFS getstripe -L -I2 $name)
3558 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3559 error "(8.2.2) expect pattern flag hole, but got $pattern"
3561 stripes=$($LFS getstripe -c -I1 $name)
3562 [ $stripes -eq 2 ] ||
3563 error "(8.3.1) expect 2 stripes, but got $stripes"
3565 stripes=$($LFS getstripe -c -I2 $name)
3566 [ $stripes -eq 2 ] ||
3567 error "(8.3.2) expect 2 stripes, but got $stripes"
3569 e_start=$($LFS getstripe -I1 $name |
3570 awk '/lcme_extent.e_start:/ { print $2 }')
3571 [ $e_start -eq 0 ] ||
3572 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3574 e_end=$($LFS getstripe -I1 $name |
3575 awk '/lcme_extent.e_end:/ { print $2 }')
3576 [ $e_end -eq 2097152 ] ||
3577 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3579 e_start=$($LFS getstripe -I2 $name |
3580 awk '/lcme_extent.e_start:/ { print $2 }')
3581 [ $e_start -eq 2097152 ] ||
3582 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3584 e_end=$($LFS getstripe -I2 $name |
3585 awk '/lcme_extent.e_end:/ { print $2 }')
3586 [ "$e_end" = "EOF" ] ||
3587 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3589 size=$(stat $name | awk '/Size:/ { print $2 }')
3590 [ $size -eq $((4096 * $bcount)) ] ||
3591 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
3593 cat $name > /dev/null && error "(8.7) normal read $name should fail"
# Count the per-block read errors reported on dd's stderr.
3595 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3596 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3598 # The first stripe in each COMP was lost
3599 [ $failures -eq 512 ] ||
3600 error "(8.8) expect 512 IO failures, but get $failures"
3602 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3603 [ $size -eq $((4096 * $bcount)) ] ||
3604 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
3606 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3607 error "(8.10) write to the LOV EA hole should fail"
3609 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3610 error "(8.11) write to normal stripe should NOT fail"
3612 echo "foo" >> $name && error "(8.12) append write $name should fail"
3614 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3616 touch $name || error "(8.14) cannot touch $name"
3618 rm -f $name || error "(8.15) cannot unlink $name"
# test_20b checks for the re-created f2 (the second OST-object of each
# component was lost): both components must carry the hole pattern and keep
# 2 stripes and their original extents, the size must be unchanged, a plain
# read must fail, dd with conv=sync,noerror must report 512 I/O errors, a
# write at block 300 must fail while a write at offset 0 must succeed, and
# append must fail.
# Error tags and texts are aligned with what is actually checked: (9.3.1)
# for the first component's stripe count, "512" in (9.8), and (9.15) for the
# final unlink.
3621 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3623 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3624 echo "Check $name, it contains f2's first stripe in each COMP"
3626 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3628 pattern=$($LFS getstripe -L -I1 $name)
3629 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3630 error "(9.2.1) expect pattern flag hole, but got $pattern"
3632 pattern=$($LFS getstripe -L -I2 $name)
3633 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3634 error "(9.2.2) expect pattern flag hole, but got $pattern"
3636 stripes=$($LFS getstripe -c -I1 $name)
3637 [ $stripes -eq 2 ] ||
3638 error "(9.3.1) expect 2 stripes, but got $stripes"
3640 stripes=$($LFS getstripe -c -I2 $name)
3641 [ $stripes -eq 2 ] ||
3642 error "(9.3.2) expect 2 stripes, but got $stripes"
3644 e_start=$($LFS getstripe -I1 $name |
3645 awk '/lcme_extent.e_start:/ { print $2 }')
3646 [ $e_start -eq 0 ] ||
3647 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3649 e_end=$($LFS getstripe -I1 $name |
3650 awk '/lcme_extent.e_end:/ { print $2 }')
3651 [ $e_end -eq 2097152 ] ||
3652 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3654 e_start=$($LFS getstripe -I2 $name |
3655 awk '/lcme_extent.e_start:/ { print $2 }')
3656 [ $e_start -eq 2097152 ] ||
3657 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3659 e_end=$($LFS getstripe -I2 $name |
3660 awk '/lcme_extent.e_end:/ { print $2 }')
3661 [ "$e_end" = "EOF" ] ||
3662 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3664 size=$(stat $name | awk '/Size:/ { print $2 }')
3665 # The second stripe in COMP was lost, so we do not know there
3666 # have ever been some data before. 'stat' will regard it as
3667 # no data on the lost stripe.
3669 [ $size -eq $((4096 * $bcount)) ] ||
3670 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3672 cat $name > /dev/null &&
3673 error "(9.7) normal read $name should fail"
3675 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3676 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3677 [ $failures -eq 512 ] ||
3678 error "(9.8) expect 512 IO failures, but get $failures"
3680 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3681 # The second stripe in COMP was lost, so we do not know there
3682 # have ever been some data before. Since 'dd' skip failure,
3683 # it will regard the lost stripe contains data.
3685 [ $size -eq $((4096 * $bcount)) ] ||
3686 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
3688 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3689 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3691 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3692 error "(9.11) write to normal stripe should NOT fail"
3694 echo "foo" >> $name &&
3695 error "(9.12) append write $name should fail"
3697 chown $RUNAS_ID:$RUNAS_GID $name ||
3698 error "(9.13) cannot chown on $name"
3700 touch $name || error "(9.14) cannot touch $name"
3702 rm -f $name || error "(9.15) cannot unlink $name"
3704 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# Body of the test registered below as "21" (function header and closing
# brace are not visible in this extract).
# Skips on an MDS older than 2.5.59; otherwise creates 100 files, starts
# LFSCK on MDT0000 without naming a component (with "-s 1 -r"), expects both
# the namespace and the layout component to report "scanning-phase1", then
# stops LFSCK.
3707 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
3708 skip "ignore the test if MDS is older than 2.5.59" && return
3710 check_mount_and_prep
3711 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
3713 echo "Start all LFSCK components by default (-s 1)"
3714 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3715 error "Fail to start LFSCK"
3717 echo "namespace LFSCK should be in 'scanning-phase1' status"
3718 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3719 [ "$STATUS" == "scanning-phase1" ] ||
3720 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3722 echo "layout LFSCK should be in 'scanning-phase1' status"
3723 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3724 [ "$STATUS" == "scanning-phase1" ] ||
3725 error "Expect layout 'scanning-phase1', but got '$STATUS'"
3727 echo "Stop all LFSCK components by default"
3728 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3729 error "Fail to stop LFSCK"
3731 run_test 21 "run all LFSCK components by default"
# Body of the test registered below as "22a" (function header and closing
# brace are not visible in this extract). Needs at least 2 MDTs.
# Flow: create "guard" and "foo" on MDT1, create foo/dummy on MDT0 while
# fail_loc 0x161e is armed (per the echo text, dummy's dotdot entry then
# references guard), remove guard, run namespace LFSCK on all targets and
# expect unmatched_pairs_repaired to be 1 and "ls" on dummy to work.
3734 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# The inner double quotes around .. are escaped so that they are printed;
# unescaped they closed and reopened the string and were dropped.
3737 echo "The parent_A references the child directory via some name entry,"
3738 echo "but the child directory back references another parent_B via its"
3739 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
3740 echo "LFSCK will repair the child directory's \"..\" name entry."
3743 check_mount_and_prep
3745 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3746 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3748 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3749 echo "The dummy's dotdot name entry references the guard."
3750 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3751 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3752 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3753 error "(3) Fail to mkdir on MDT0"
3754 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing guard leaves dummy's dotdot entry pointing at a missing parent.
3756 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3758 echo "Trigger namespace LFSCK to repair unmatched pairs"
3759 $START_NAMESPACE -A -r ||
3760 error "(5) Fail to start LFSCK for namespace"
3762 wait_all_targets_blocked namespace completed 6
3764 local repaired=$($SHOW_NAMESPACE |
3765 awk '/^unmatched_pairs_repaired/ { print $2 }')
3766 [ $repaired -eq 1 ] ||
3767 error "(7) Fail to repair unmatched pairs: $repaired"
3769 echo "'ls' should success after namespace LFSCK repairing"
3770 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3771 error "(8) ls should success."
3773 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Body of the test registered below as "22b" (function header and closing
# brace are not visible in this extract). Needs at least 2 MDTs.
# Flow: create "guard" and "foo" on MDT1, create foo/dummy on MDT0 while
# fail_loc 0x161e is armed, verify that fid2path does not resolve dummy's
# FID to its real path, run namespace LFSCK on all targets, then expect
# unmatched_pairs_repaired to be 1 and fid2path to return the real path.
3776 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# The inner double quotes around .. are escaped so that they are printed;
# unescaped they closed and reopened the string and were dropped.
3779 echo "The parent_A references the child directory via the name entry_B,"
3780 echo "but the child directory back references another parent_C via its"
3781 echo "\"..\" name entry. The parent_C exists, but there is no the name"
3782 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
3783 echo "the child directory's \"..\" name entry and its linkEA."
3786 check_mount_and_prep
3788 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3789 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3791 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3792 echo "and bad linkEA. The dummy's dotdot name entry references the"
3793 echo "guard. The dummy's linkEA references a non-exist name entry."
3794 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3795 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3796 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3797 error "(3) Fail to mkdir on MDT0"
3798 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Before the repair, fid2path must not resolve to the real path.
3800 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3801 echo "fid2path should NOT work on the dummy's FID $dummyfid"
3802 local dummyname=$($LFS fid2path $DIR $dummyfid)
3803 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3804 error "(4) fid2path works unexpectedly."
3806 echo "Trigger namespace LFSCK to repair unmatched pairs"
3807 $START_NAMESPACE -A -r ||
3808 error "(5) Fail to start LFSCK for namespace"
3810 wait_all_targets_blocked namespace completed 6
3812 local repaired=$($SHOW_NAMESPACE |
3813 awk '/^unmatched_pairs_repaired/ { print $2 }')
3814 [ $repaired -eq 1 ] ||
3815 error "(7) Fail to repair unmatched pairs: $repaired"
3817 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
3818 local dummyname=$($LFS fid2path $DIR $dummyfid)
3819 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3820 error "(8) fid2path does not work"
3822 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Body of the test registered below as "23a" (function header and closing
# brace are not visible in this extract). Needs at least 2 MDTs.
# Flow: create d0 on MDT0 and d0/d1 on MDT1, rmdir d1 while fail_loc 0x1620
# is armed on mds2 so that a dangling name entry is left behind, then run
# namespace LFSCK twice: first without -C (entry counted as repaired but
# "ls" must still fail), then with -C (after which "ls" must succeed).
3825 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3828 echo "The name entry is there, but the MDT-object for such name "
3829 echo "entry does not exist. The namespace LFSCK should find out "
3830 echo "and repair the inconsistency as required."
3833 check_mount_and_prep
3835 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3836 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3838 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3839 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
3840 do_facet mds2 $LCTL set_param fail_loc=0x1620
3841 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3842 do_facet mds2 $LCTL set_param fail_loc=0
# Negative check: a successful ls is the failure case.
3844 echo "'ls' should fail because of dangling name entry"
3845 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3847 echo "Trigger namespace LFSCK to find out dangling name entry"
3848 $START_NAMESPACE -A -r ||
3849 error "(5) Fail to start LFSCK for namespace"
3851 wait_all_targets_blocked namespace completed 6
3853 local repaired=$($SHOW_NAMESPACE |
3854 awk '/^dangling_repaired/ { print $2 }')
3855 [ $repaired -eq 1 ] ||
3856 error "(7) Fail to repair dangling name entry: $repaired"
3858 echo "'ls' should fail because not re-create MDT-object by default"
3859 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
# Second pass adds -C; per the echo texts this is the run that re-creates
# the missing MDT-object.
3861 echo "Trigger namespace LFSCK again to repair dangling name entry"
3862 $START_NAMESPACE -A -r -C ||
3863 error "(9) Fail to start LFSCK for namespace"
3865 wait_all_targets_blocked namespace completed 10
3867 repaired=$($SHOW_NAMESPACE |
3868 awk '/^dangling_repaired/ { print $2 }')
3869 [ $repaired -eq 1 ] ||
3870 error "(11) Fail to repair dangling name entry: $repaired"
3872 echo "'ls' should success after namespace LFSCK repairing"
3873 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3875 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b: a hard link "foo" to f0 is created while fail_loc 0x1621 is set
# with fail_val equal to f1's OID; f1 is then removed so that 'ls' of foo
# fails. Namespace LFSCK ("-r -C") must report one dangling_repaired and one
# multiple_linked_repaired, and foo must read back f0's content "dummy".
# NOTE(review): the function header, closing brace and some short lines
# (e.g. the 'fi' of the 'if' below) are not visible in this listing.
3879 echo "The objectA has multiple hard links, one of them corresponding"
3880 echo "to the name entry_B. But there is something wrong for the name"
3881 echo "entry_B and cause entry_B to references non-exist object_C."
3882 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3883 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3884 echo "comes to the second-stage scanning, it will find that the"
3885 echo "former re-creating object_C is not proper, and will try to"
3886 echo "replace the object_C with the real object_A."
3889 check_mount_and_prep
3891 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3892 $LFS path2fid $DIR/$tdir/d0
3894 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3896 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3897 $LFS path2fid $DIR/$tdir/d0/f0
3899 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3900 $LFS path2fid $DIR/$tdir/d0/f1
# SEQ0/SEQ1 are the first ':'-separated field of each file's FID.
3902 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3903 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
3905 if [ "$SEQ0" != "$SEQ1" ]; then
3906 # To guarantee that the f0 and f1 are in the same FID seq
3907 rm -f $DIR/$tdir/d0/f0 ||
3908 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3909 echo "dummy" > $DIR/$tdir/d0/f0 ||
3910 error "(3.2) Fail to touch on MDT0"
3911 $LFS path2fid $DIR/$tdir/d0/f0
# OID is the second ':'-separated field of f1's FID; printf %d rewrites it
# as a decimal number before it is passed as fail_val.
3914 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3915 OID=$(printf %d $OID)
3917 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3918 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3919 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3920 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3921 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3923 # If there is creation after the dangling injection, it may re-use
3924 # the just released local object (inode) that is referenced by the
3925 # dangling name entry. It will fail the dangling injection.
3926 # So before deleting the target object for the dangling name entry,
3927 # remove some other objects to avoid the target object being reused
3928 # by some potential creations. LU-7429
3929 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3931 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3933 echo "'ls' should fail because of dangling name entry"
3934 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3935 error "(6) ls should fail."
3937 echo "Trigger namespace LFSCK to find out dangling name entry"
3938 $START_NAMESPACE -r -C ||
3939 error "(7) Fail to start LFSCK for namespace"
# Poll the lfsck_namespace "status" field on $SINGLEMDS until it reads
# "completed" (the trailing 32 is presumably a timeout in seconds -- confirm).
3941 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3942 mdd.${MDT_DEV}.lfsck_namespace |
3943 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3945 error "(8) unexpected status"
3948 local repaired=$($SHOW_NAMESPACE |
3949 awk '/^dangling_repaired/ { print $2 }')
3950 [ $repaired -eq 1 ] ||
3951 error "(9) Fail to repair dangling name entry: $repaired"
3953 repaired=$($SHOW_NAMESPACE |
3954 awk '/^multiple_linked_repaired/ { print $2 }')
3955 [ $repaired -eq 1 ] ||
3956 error "(10) Fail to drop the former created object: $repaired"
# foo must now resolve to f0 again, whose content was written as "dummy".
3958 local data=$(cat $DIR/$tdir/d0/foo)
3959 [ "$data" == "dummy" ] ||
3960 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3962 run_test 23b "LFSCK can repair dangling name entry (2)"
# The statements up to stop_full_debug_logging reset fail_val/fail_loc on
# $SINGLEMDS, wait for the namespace LFSCK status to read "completed" and
# stop full debug logging.
# NOTE(review): presumably the body of a cleanup helper for test 23c whose
# header line is not visible in this listing -- confirm.
3965 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3966 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3967 mdd.${MDT_DEV}.lfsck_namespace |
3968 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3970 error "(10) unexpected status"
3973 stop_full_debug_logging
# Test 23c: same dangling-hard-link injection as test 23b (fail_loc 0x1621
# with fail_val = f1's OID), but LFSCK is slowed with fail_loc 0x1602 and the
# test appends data to foo once it appears with size 0. Afterwards
# dangling_repaired must be 1 and foo must NOT read back as "dummy".
# NOTE(review): the function header, closing brace and several short lines
# are not visible in this listing.
3978 echo "The objectA has multiple hard links, one of them corresponding"
3979 echo "to the name entry_B. But there is something wrong for the name"
3980 echo "entry_B and cause entry_B to references non-exist object_C."
3981 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3982 echo "as dangling, and re-create the lost object_C. And then others"
3983 echo "modified the re-created object_C. When the LFSCK comes to the"
3984 echo "second-stage scanning, it will find that the former re-creating"
3985 echo "object_C maybe wrong and try to replace the object_C with the"
3986 echo "real object_A. But because object_C has been modified, so the"
3987 echo "LFSCK cannot replace it."
3990 start_full_debug_logging
3992 check_mount_and_prep
3994 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3995 parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
3996 echo "parent_fid=$parent_fid"
3998 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
4000 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
4001 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4002 echo "f0_fid=$f0_fid"
4004 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
4005 f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
4006 echo "f1_fid=$f1_fid"
# Compare only the FID sequence, i.e. the text before the first ':'.
# The variables assigned above are f0_fid/f1_fid; the former spelling
# fid_f0/fid_f1 was never set, so both sides were empty and f0 was never
# re-created. "%%:*" strips everything from the first ':' onwards.
4008 if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
4009 # To guarantee that the f0 and f1 are in the same FID seq
4010 rm -f $DIR/$tdir/d0/f0 ||
4011 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
4012 echo "dummy" > $DIR/$tdir/d0/f0 ||
4013 error "(3.2) Fail to touch on MDT0"
4014 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4015 echo "f0_fid=$f0_fid (replaced)"
# oid is the second ':'-separated field of f1's FID, passed as fail_val.
4018 local oid=$(awk -F':' '{ printf $2 }' <<< $f1_fid)
4020 echo "Inject failure stub on MDT0 to simulate dangling name entry"
4021 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
4022 do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
4023 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4024 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4026 # If there is creation after the dangling injection, it may re-use
4027 # the just released local object (inode) that is referenced by the
4028 # dangling name entry. It will fail the dangling injection.
4029 # So before deleting the target object for the dangling name entry,
4030 # remove some other objects to avoid the target object being reused
4031 # by some potential creations. LU-7429
4032 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4034 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
4036 echo "'ls' should fail because of dangling name entry"
4037 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4038 error "(6) ls should fail."
4040 #define OBD_FAIL_LFSCK_DELAY3 0x1602
4041 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
4043 echo "Trigger namespace LFSCK to find out dangling name entry"
4044 $START_NAMESPACE -r -C ||
4045 error "(7) Fail to start LFSCK for namespace"
# Wait until foo is visible with size 0. In the fallback branch, size 6 with
# content "dummy" matches what 'echo "dummy"' wrote to f0 above.
4047 wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
4048 # While unexpected by the test, it is valid for LFSCK to repair
4049 # the link to the original object before any data is written.
4050 local size=$(stat -c %s $DIR/$tdir/d0/foo)
4052 if [ "$size" = "6" -a "$(<$DIR/$tdir/d0/foo)" = "dummy" ]; then
4053 log "LFSCK repaired file prematurely"
4058 stat $DIR/$tdir/d0/foo
4060 error "(8) unexpected size"
# Modify foo so that its content no longer equals "dummy".
4063 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
4064 cancel_lru_locks osc
4068 local repaired=$($SHOW_NAMESPACE |
4069 awk '/^dangling_repaired/ { print $2 }')
4070 [ $repaired -eq 1 ] ||
4071 error "(11) Fail to repair dangling name entry: $repaired"
4073 local data=$(cat $DIR/$tdir/d0/foo)
4074 [ "$data" != "dummy" ] ||
4075 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
4077 run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24: multiple-referenced name entry (needs >= 2 MDTs, not run when
# FILESET is set). d0 lives on MDT1 with sub-directories guard and dummy;
# while fail_loc 0x1622 is set, dummy/foo is created on MDT0 and removed
# again. After namespace LFSCK, multiple_referenced_repaired must be 1 and
# an entry named "<cfid>-<pfid>-D-0" must exist under
# .lustre/lost+found/MDT0000/.
# NOTE(review): the function header, closing brace and short lines (such as
# the else/fi of the 'if' below) are not visible in this listing.
4080 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4081 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4084 echo "Two MDT-objects back reference the same name entry via their"
4085 echo "each own linkEA entry, but the name entry only references one"
4086 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
4087 echo "for the MDT-object that is not recognized. If such MDT-object"
4088 echo "has no other linkEA entry after the removing, then the LFSCK"
4089 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
4092 check_mount_and_prep
4094 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4096 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
4097 $LFS path2fid $DIR/$tdir/d0/guard
4099 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
4100 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid is the parent FID used in the expected orphan name: guard's FID when
# the MDS backend is not ldiskfs, otherwise dummy's FID.
4103 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
4104 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
4106 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
4109 touch $DIR/$tdir/d0/guard/foo ||
4110 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
4112 echo "Inject failure stub on MDT0 to simulate the case that"
4113 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
4114 echo "that references $DIR/$tdir/d0/guard/foo."
4115 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
4116 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
4117 echo "there with the same linkEA entry as another MDT-object"
4118 echo "$DIR/$tdir/d0/guard/foo has"
4120 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
4121 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
4122 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
4123 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
4124 $LFS path2fid $DIR/$tdir/d0/dummy/foo
# cfid is recorded before the rmdir; it forms the first part of the expected
# orphan name.
4125 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
4126 rmdir $DIR/$tdir/d0/dummy/foo ||
4127 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
4128 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4130 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4131 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4132 error "(6) stat successfully unexpectedly"
4134 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4135 $START_NAMESPACE -A -r ||
4136 error "(7) Fail to start LFSCK for namespace"
4138 wait_all_targets_blocked namespace completed 8
4140 local repaired=$($SHOW_NAMESPACE |
4141 awk '/^multiple_referenced_repaired/ { print $2 }')
4142 [ $repaired -eq 1 ] ||
4143 error "(9) Fail to repair multiple referenced name entry: $repaired"
4145 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4146 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4147 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4149 local cname="$cfid-$pfid-D-0"
4150 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4151 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4153 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25 (ldiskfs only): d0/foo is created while fail_loc 0x1623 is set on
# $SINGLEMDS. Namespace LFSCK ("-r") must finish with status "completed" and
# report bad_file_type_repaired == 1, after which d0 must be listable.
# NOTE(review): the function header, closing brace and some short lines are
# not visible in this listing.
4156 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4157 skip "ldiskfs only test" && return
4160 echo "The file type in the name entry does not match the file type"
4161 echo "claimed by the referenced object. Then the LFSCK will update"
4162 echo "the file type in the name entry."
4165 check_mount_and_prep
4167 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4169 echo "Inject failure stub on MDT0 to simulate the case that"
4170 echo "the file type stored in the name entry is wrong."
# The fail_loc is armed only around the touch and cleared right after it.
4172 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
4173 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4174 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4175 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4177 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4178 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Poll the lfsck_namespace "status" field until it reads "completed".
4180 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4181 mdd.${MDT_DEV}.lfsck_namespace |
4182 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4184 error "(4) unexpected status"
4187 local repaired=$($SHOW_NAMESPACE |
4188 awk '/^bad_file_type_repaired/ { print $2 }')
4189 [ $repaired -eq 1 ] ||
4190 error "(5) Fail to repair bad file type in name entry: $repaired"
4192 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4194 run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a: d0/foo gets a second hard link "dummy", then foo is unlinked
# while fail_loc 0x1624 is set so that 'ls' of foo fails. After namespace
# LFSCK, lost_dirent_repaired must be 1, foo must be listable again and its
# FID must equal the FID recorded before the injection.
# NOTE(review): the function header and closing brace are not visible in
# this listing.
4198 echo "The local name entry back referenced by the MDT-object is lost."
4199 echo "The namespace LFSCK will add the missing local name entry back"
4200 echo "to the normal namespace."
4203 check_mount_and_prep
4205 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4206 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# Remember foo's FID so it can be compared after the repair.
4207 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4209 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4210 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4212 echo "Inject failure stub on MDT0 to simulate the case that"
4213 echo "foo's name entry will be removed, but the foo's object"
4214 echo "and its linkEA are kept in the system."
4216 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4217 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4218 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4219 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4221 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4222 error "(5) 'ls' should fail"
# NOTE(review): the message below says "remote" although the introduction of
# this case talks about the local name entry -- wording looks copied.
4224 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4225 $START_NAMESPACE -r -A ||
4226 error "(6) Fail to start LFSCK for namespace"
4228 wait_all_targets_blocked namespace completed 7
4230 local repaired=$($SHOW_NAMESPACE |
4231 awk '/^lost_dirent_repaired/ { print $2 }')
4232 [ $repaired -eq 1 ] ||
4233 error "(8) Fail to repair lost dirent: $repaired"
4235 ls -ail $DIR/$tdir/d0/foo ||
4236 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must refer to the same object as before.
4238 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4239 [ "$foofid" == "$foofid2" ] ||
4240 error "(10) foo's FID changed: $foofid, $foofid2"
4242 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b (needs >= 2 MDTs): d0 is created on MDT1 and d0/foo on MDT0; foo
# is removed with rmdir while fail_loc 0x1624 is set so that 'ls' of foo
# fails. After namespace LFSCK, lost_dirent_repaired must be 1, foo must be
# listable again and its FID must be unchanged.
# NOTE(review): the function header and closing brace are not visible in
# this listing.
4245 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4248 echo "The remote name entry back referenced by the MDT-object is lost."
4249 echo "The namespace LFSCK will add the missing remote name entry back"
4250 echo "to the normal namespace."
4253 check_mount_and_prep
4255 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4256 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Remember foo's FID so it can be compared after the repair.
4257 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4259 echo "Inject failure stub on MDT0 to simulate the case that"
4260 echo "foo's name entry will be removed, but the foo's object"
4261 echo "and its linkEA are kept in the system."
4263 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4264 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4265 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4266 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4268 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4269 error "(4) 'ls' should fail"
4271 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4272 $START_NAMESPACE -r -A ||
4273 error "(5) Fail to start LFSCK for namespace"
4275 wait_all_targets_blocked namespace completed 6
4277 local repaired=$($SHOW_NAMESPACE |
4278 awk '/^lost_dirent_repaired/ { print $2 }')
4279 [ $repaired -eq 1 ] ||
4280 error "(7) Fail to repair lost dirent: $repaired"
4282 ls -ail $DIR/$tdir/d0/foo ||
4283 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must refer to the same object as before.
4285 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4286 [ "$foofid" == "$foofid2" ] ||
4287 error "(9) foo's FID changed: $foofid, $foofid2"
4289 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a (not run when FILESET is set): d0/foo and its hard link "dummy"
# are unlinked while fail_loc 0x1624 is set, then d0 itself is removed.
# After namespace LFSCK, lost_dirent_repaired must be 1 and an entry whose
# name matches "*-P-*" must exist under .lustre/lost+found/MDT0000/.
# NOTE(review): the function header and closing brace are not visible in
# this listing.
4292 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4295 echo "The local parent referenced by the MDT-object linkEA is lost."
4296 echo "The namespace LFSCK will re-create the lost parent as orphan."
4299 check_mount_and_prep
4301 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4302 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4303 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4304 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4306 echo "Inject failure stub on MDT0 to simulate the case that"
4307 echo "foo's name entry will be removed, but the foo's object"
4308 echo "and its linkEA are kept in the system. And then remove"
4309 echo "another hard link and the parent directory."
# Both unlinks happen while the fail_loc is armed; it is cleared afterwards.
4311 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4312 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4313 rm -f $DIR/$tdir/d0/foo ||
4314 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4315 rm -f $DIR/$tdir/d0/dummy ||
4316 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4317 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4319 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
4320 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
4322 echo "Trigger namespace LFSCK to repair the lost parent"
4323 $START_NAMESPACE -r -A ||
4324 error "(6) Fail to start LFSCK for namespace"
4326 wait_all_targets_blocked namespace completed 7
4328 local repaired=$($SHOW_NAMESPACE |
4329 awk '/^lost_dirent_repaired/ { print $2 }')
4330 [ $repaired -eq 1 ] ||
4331 error "(8) Fail to repair lost dirent: $repaired"
4333 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4334 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4335 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4337 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The -name pattern is quoted so that find receives it literally; unquoted,
# the shell could expand it against files in the current directory first.
4339 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
4340 [ ! -z "$cname" ] ||
4341 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4343 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b (needs >= 2 MDTs, not run when FILESET is set): d0 is created on
# MDT1 and d0/foo on MDT0; foo is removed while fail_loc 0x1624 is set, then
# d0 is removed. After namespace LFSCK, lost_dirent_repaired must be 1 and an
# entry whose name matches "*-P-*" must exist under
# .lustre/lost+found/MDT0001/.
# NOTE(review): the function header and closing brace are not visible in
# this listing.
4346 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4347 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4350 echo "The remote parent referenced by the MDT-object linkEA is lost."
4351 echo "The namespace LFSCK will re-create the lost parent as orphan."
4354 check_mount_and_prep
4356 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4357 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
4359 $LFS path2fid $DIR/$tdir/d0
4361 echo "Inject failure stub on MDT0 to simulate the case that"
4362 echo "foo's name entry will be removed, but the foo's object"
4363 echo "and its linkEA are kept in the system. And then remove"
4364 echo "the parent directory."
4366 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4367 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4368 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4369 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4371 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4372 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
4374 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4375 $START_NAMESPACE -r -A ||
4376 error "(6) Fail to start LFSCK for namespace"
4378 wait_all_targets_blocked namespace completed 7
4380 local repaired=$($SHOW_NAMESPACE |
4381 awk '/^lost_dirent_repaired/ { print $2 }')
4382 [ $repaired -eq 1 ] ||
4383 error "(8) Fail to repair lost dirent: $repaired"
4385 ls -ail $MOUNT/.lustre/lost+found/
4387 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4388 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4389 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4391 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The -name pattern is quoted so that find receives it literally; unquoted,
# the shell could expand it against files in the current directory first.
4393 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
4394 [ ! -z "$cname" ] ||
4395 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4397 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28 (needs >= 2 MDTs): d1 (MDT0) holds a1/a2 on MDT1 and d2 (MDT1)
# holds a1/a2 on MDT0. d1/a1 and d2/a2 are removed while fail_loc 0x1624 is
# set on both mds1 and mds2. A first namespace LFSCK runs with fail_loc
# 0x161c on mds2 and must end "partial" on mds1 and "completed" on mds2,
# with lost_dirent_repaired 0 on mds1 and 1 on mds2. A second run without
# the fail_loc must report 1 on mds1 and 0 on mds2.
# NOTE(review): the function header, closing brace and some short lines are
# not visible in this listing.
4400 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4403 echo "The target name entry is lost. The LFSCK should insert the"
4404 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4405 echo "the MDT (on which the orphan MDT-object resides) has ever"
4406 echo "failed to respond some name entry verification during the"
4407 echo "first stage-scanning, then the LFSCK should skip to handle"
4408 echo "orphan MDT-object on this MDT. But other MDTs should not"
4412 check_mount_and_prep
4413 $LFS mkdir -i 0 $DIR/$tdir/d1
4414 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4415 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
4417 $LFS mkdir -i 1 $DIR/$tdir/d2
4418 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4419 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4421 echo "Inject failure stub on MDT0 to simulate the case that"
4422 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4423 echo "and its linkEA are kept in the system. And the case that"
4424 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4425 echo "and its linkEA are kept in the system."
# The fail_loc is armed on both MDS facets around the two rmdir calls.
4427 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4428 do_facet mds1 $LCTL set_param fail_loc=0x1624
4429 do_facet mds2 $LCTL set_param fail_loc=0x1624
4430 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4431 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4432 do_facet mds1 $LCTL set_param fail_loc=0
4433 do_facet mds2 $LCTL set_param fail_loc=0
4435 cancel_lru_locks mdc
4436 cancel_lru_locks osc
4438 echo "Inject failure, to simulate the MDT0 fail to handle"
4439 echo "MDT1 LFSCK request during the first-stage scanning."
4440 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
4441 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4443 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4444 $START_NAMESPACE -r -A ||
4445 error "(3) Fail to start LFSCK for namespace"
# Expected final status of the first run: "partial" on mds1, "completed" on
# mds2.
4447 wait_update_facet mds1 "$LCTL get_param -n \
4448 mdd.$(facet_svc mds1).lfsck_namespace |
4449 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4450 error "(4) mds1 is not the expected 'partial'"
4453 wait_update_facet mds2 "$LCTL get_param -n \
4454 mdd.$(facet_svc mds2).lfsck_namespace |
4455 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4456 error "(5) mds2 is not the expected 'completed'"
4459 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
4461 local repaired=$(do_facet mds1 $LCTL get_param -n \
4462 mdd.$(facet_svc mds1).lfsck_namespace |
4463 awk '/^lost_dirent_repaired/ { print $2 }')
4464 [ $repaired -eq 0 ] ||
4465 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4467 repaired=$(do_facet mds2 $LCTL get_param -n \
4468 mdd.$(facet_svc mds2).lfsck_namespace |
4469 awk '/^lost_dirent_repaired/ { print $2 }')
4470 [ $repaired -eq 1 ] ||
4471 error "(7) Expect 1 fixed on mds2, but got: $repaired"
4473 echo "Trigger namespace LFSCK on all devices again to cleanup"
4474 $START_NAMESPACE -r -A ||
4475 error "(8) Fail to start LFSCK for namespace"
4477 wait_all_targets_blocked namespace completed 9
# Second run: the expected counts are swapped (1 on mds1, 0 on mds2).
4479 local repaired=$(do_facet mds1 $LCTL get_param -n \
4480 mdd.$(facet_svc mds1).lfsck_namespace |
4481 awk '/^lost_dirent_repaired/ { print $2 }')
4482 [ $repaired -eq 1 ] ||
4483 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4485 repaired=$(do_facet mds2 $LCTL get_param -n \
4486 mdd.$(facet_svc mds2).lfsck_namespace |
4487 awk '/^lost_dirent_repaired/ { print $2 }')
4488 [ $repaired -eq 0 ] ||
4489 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4491 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a: a hard link h1 to d0/foo is created while fail_loc 0x1625 is
# set; the test then requires stat to report 3 links. After namespace LFSCK,
# nlinks_repaired must be 1 and stat must report 2 links.
# This case is currently not registered: its run_test line at the end is
# commented out.
# NOTE(review): the function header and closing brace are not visible in
# this listing.
4495 echo "The object's nlink attribute is larger than the object's known"
4496 echo "name entries count. The LFSCK will repair the object's nlink"
4497 echo "attribute to match the known name entries count"
4500 check_mount_and_prep
4502 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4503 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4505 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4506 echo "nlink attribute is larger than its name entries count."
4508 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
4509 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4510 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4511 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4512 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Verify that the injection took effect: the link count must read 3.
4514 cancel_lru_locks mdc
4515 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4516 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4518 echo "Trigger namespace LFSCK to repair the nlink count"
4519 $START_NAMESPACE -r -A ||
4520 error "(5) Fail to start LFSCK for namespace"
4522 wait_all_targets_blocked namespace completed 6
4524 local repaired=$($SHOW_NAMESPACE |
4525 awk '/^nlinks_repaired/ { print $2 }')
4526 [ $repaired -eq 1 ] ||
4527 error "(7) Fail to repair nlink count: $repaired"
# After the repair the link count must read 2 (foo and h1).
4529 cancel_lru_locks mdc
4530 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4531 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4533 # Disable 29a, we only allow nlink to be updated if the known linkEA
4534 # entries is larger than nlink count.
4536 #run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b: a hard link h1 to d0/foo is created while fail_loc 0x1626 is
# set; the test then requires stat to report 1 link. After namespace LFSCK,
# nlinks_repaired must be 1 and stat must report 2 links.
# NOTE(review): the function header and closing brace are not visible in
# this listing.
4540 echo "The object's nlink attribute is smaller than the object's known"
4541 echo "name entries count. The LFSCK will repair the object's nlink"
4542 echo "attribute to match the known name entries count"
4545 check_mount_and_prep
4547 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4548 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4550 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4551 echo "nlink attribute is smaller than its name entries count."
4553 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
4554 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4555 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4556 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4557 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Verify that the injection took effect: the link count must read 1.
4559 cancel_lru_locks mdc
4560 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4561 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4563 echo "Trigger namespace LFSCK to repair the nlink count"
4564 $START_NAMESPACE -r -A ||
4565 error "(5) Fail to start LFSCK for namespace"
4567 wait_all_targets_blocked namespace completed 6
4569 local repaired=$($SHOW_NAMESPACE |
4570 awk '/^nlinks_repaired/ { print $2 }')
4571 [ $repaired -eq 1 ] ||
4572 error "(7) Fail to repair nlink count: $repaired"
# After the repair the link count must read 2 (foo and h1).
4574 cancel_lru_locks mdc
4575 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4576 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4578 run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c: 150 hard links to guard/f0 are created under foo, then 100 of
# them are removed. With >= 2 MDTs, migrating guard to MDT1 must fail both
# before and after the removal (f0's FID unchanged). After namespace LFSCK,
# linkea_overflow_cleared must be 1 and nlinks_repaired 0; with >= 2 MDTs the
# migration must then succeed and change f0's FID.
# NOTE(review): the function header, closing brace and several short lines
# (e.g. 'fi' of the 'if' statements) are not visible in this listing.
4583 echo "The namespace LFSCK will create many hard links to the target"
4584 echo "file as to exceed the linkEA size limitation. Under such case"
4585 echo "the linkEA will be marked as overflow that will prevent the"
4586 echo "target file to be migrated. Then remove some hard links to"
4587 echo "make the left hard links to be held within the linkEA size"
4588 echo "limitation. But before the namespace LFSCK adding all the"
4589 echo "missed linkEA entries back, the overflow mark (timestamp)"
4590 echo "will not be cleared."
4593 check_mount_and_prep
4595 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
4596 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4597 error "(0.2) Fail to mkdir"
4598 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# oldfid is the reference FID used by the later "was it migrated?" checks.
4599 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4601 # define MAX_LINKEA_SIZE 4096
4602 # sizeof(link_ea_header) = 24
4603 # sizeof(link_ea_entry) = 18
4604 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4605 # (sizeof(link_ea_entry) + name_length))
4606 # If the average name length is 12 bytes, then 150 hard links
4607 # is totally enough to overflow the linkEA
4608 echo "Create 150 hard links should succeed although the linkEA overflow"
4609 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4610 error "(2) Fail to hard link"
4612 cancel_lru_locks mdc
4613 if [ $MDSCOUNT -ge 2 ]; then
4614 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4615 error "(3.1) Migrate should fail"
4617 echo "The object with linkEA overflow should NOT be migrated"
4618 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4619 [ "$newfid" == "$oldfid" ] ||
4620 error "(3.2) Migrate should fail: $newfid != $oldfid"
4623 # Remove 100 hard links, then the linkEA should have space
4624 # to hold the missed linkEA entries.
4625 echo "Remove 100 hard links to save space for the missed linkEA entries"
4626 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
4628 if [ $MDSCOUNT -ge 2 ]; then
4629 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4630 error "(5.1) Migrate should fail"
4632 # The overflow timestamp is still there, so migration will fail.
4633 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4634 [ "$newfid" == "$oldfid" ] ||
4635 error "(5.2) Migrate should fail: $newfid != $oldfid"
4638 # sleep 3 seconds to guarantee that the overflow is recognized
4641 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4642 $START_NAMESPACE -r -A ||
4643 error "(6) Fail to start LFSCK for namespace"
4645 wait_all_targets_blocked namespace completed 7
4647 local repaired=$($SHOW_NAMESPACE |
4648 awk '/^linkea_overflow_cleared/ { print $2 }')
4649 [ $repaired -eq 1 ] ||
4650 error "(8) Fail to clear linkea overflow: $repaired"
4652 repaired=$($SHOW_NAMESPACE |
4653 awk '/^nlinks_repaired/ { print $2 }')
4654 [ $repaired -eq 0 ] ||
4655 error "(9) Unexpected nlink repaired: $repaired"
4657 if [ $MDSCOUNT -ge 2 ]; then
4658 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4659 error "(10.1) Migrate failure"
4661 # Migration should succeed after clear the overflow timestamp.
4662 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4663 [ "$newfid" != "$oldfid" ] ||
4664 error "(10.2) Migrate should succeed"
4666 ls -l $DIR/$tdir/foo > /dev/null ||
4667 error "(11) 'ls' failed after migration"
4670 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4671 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4673 run_test 29c "verify linkEA size limitation"
# Test 30 (ldiskfs only, not run when FILESET is set): foo/d0 is created
# while fail_loc 0x161d is set, then d0/d1, d0/f1, d0 and foo/f0 are removed
# while fail_loc 0x1624 is set. The client is unmounted, $SINGLEMDS is
# stopped, e2fsck is run with "-y" on $MDT_DEVNAME and the MDS is restarted.
# After namespace LFSCK, local_lost_found_moved must be >= 4, foo/f0 must be
# back, and d0 must appear as "<cfid>-<pfid>-D-0" under
# .lustre/lost+found/MDT0000/ still containing d1 and f1.
# NOTE(review): the function header and closing brace are not visible in
# this listing.
4676 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4677 skip "ldiskfs only test" && return
4678 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4681 echo "The namespace LFSCK will move the orphans from backend"
4682 echo "/lost+found directory to normal client visible namespace"
4683 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
4686 check_mount_and_prep
4688 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
# The error text names f0, the file this command actually creates.
4689 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
4691 echo "Inject failure stub on MDT0 to simulate the case that"
4692 echo "directory d0 has no linkEA entry, then the LFSCK will"
4693 echo "move it into .lustre/lost+found/MDTxxxx/ later."
4695 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4696 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4697 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4698 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# pfid/cfid are recorded now; they form the expected orphan name later on.
4700 local pfid=$($LFS path2fid $DIR/$tdir/foo)
4701 local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)
4703 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4704 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4706 echo "Inject failure stub on MDT0 to simulate the case that the"
4707 echo "object's name entry will be removed, but not destroy the"
4708 echo "object. Then backend e2fsck will handle it as orphan and"
4709 echo "add them into the backend /lost+found directory."
4711 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4712 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4713 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4714 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4715 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4716 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4717 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4719 umount_client $MOUNT || error "(10) Fail to stop client!"
4721 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
4724 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
4725 error "(12) Fail to run e2fsck"
4727 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
4728 error "(13) Fail to start MDT0"
4730 echo "Trigger namespace LFSCK to recover backend orphans"
4731 $START_NAMESPACE -r -A ||
4732 error "(14) Fail to start LFSCK for namespace"
4734 wait_all_targets_blocked namespace completed 15
4736 local repaired=$($SHOW_NAMESPACE |
4737 awk '/^local_lost_found_moved/ { print $2 }')
4738 [ $repaired -ge 4 ] ||
4739 error "(16) Fail to recover backend orphans: $repaired"
4741 mount_client $MOUNT || error "(17) Fail to start client!"
4743 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
4745 ls -ail $MOUNT/.lustre/lost+found/
4747 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4748 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4749 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4751 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# cname is a constructed path and therefore never empty, so a non-empty
# test could not fail; check instead that the orphan directory exists.
4753 local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
4754 [ -d "$cname" ] || error "(20) d0 is not recovered"
4756 stat ${cname}/d1 || error "(21) d1 is not recovered"
4757 stat ${cname}/f1 || error "(22) f1 is not recovered"
4759 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test_31a: a name entry stored in the wrong shard of a striped directory (bad
# name hash) must be found and repaired by the namespace LFSCK.
# Needs at least two MDTs.
4762 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4765 echo "For the name entry under a striped directory, if the name"
4766 echo "hash does not match the shard, then the LFSCK will repair"
4767 echo "the bad name entry"
4770 check_mount_and_prep
4772 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4773 error "(1) Fail to create striped directory"
4775 echo "Inject failure stub on client to simulate the case that"
4776 echo "some name entry should be inserted into other non-first"
4777 echo "shard, but inserted into the first shard by wrong"
# The stub is set with a plain local lctl (no do_facet), i.e. on the client.
# NOTE(review): fail_val=0 presumably selects the first shard - confirm.
4779 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4780 $LCTL set_param fail_loc=0x1628 fail_val=0
4781 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4782 error "(2) Fail to create file under striped directory"
4783 $LCTL set_param fail_loc=0 fail_val=0
4785 echo "Trigger namespace LFSCK to repair bad name hash"
4786 $START_NAMESPACE -r -A ||
4787 error "(3) Fail to start LFSCK for namespace"
4789 wait_all_targets_blocked namespace completed 4
# At least one misplaced entry must be reported as repaired.
4791 local repaired=$($SHOW_NAMESPACE |
4792 awk '/^name_hash_repaired/ { print $2 }')
4793 [ $repaired -ge 1 ] ||
4794 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before checking that every entry can be looked up and
# removed again.
4796 umount_client $MOUNT || error "(6) umount failed"
4797 mount_client $MOUNT || error "(7) mount failed"
4799 for ((i = 0; i < $MDSCOUNT; i++)); do
4800 stat $DIR/$tdir/striped_dir/d$i ||
4801 error "(8) Fail to stat d$i after LFSCK"
4802 rmdir $DIR/$tdir/striped_dir/d$i ||
4803 error "(9) Fail to unlink d$i after LFSCK"
4806 rmdir $DIR/$tdir/striped_dir ||
4807 error "(10) Fail to remove the striped directory after LFSCK"
4809 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test_31b: variant of the bad-name-hash test that uses fail_val=1 and reads the
# repair counter from mds2 instead of the default namespace status.
# Needs at least two MDTs.
4812 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4815 echo "For the name entry under a striped directory, if the name"
4816 echo "hash does not match the shard, then the LFSCK will repair"
4817 echo "the bad name entry"
4820 check_mount_and_prep
4822 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4823 error "(1) Fail to create striped directory"
4825 echo "Inject failure stub on client to simulate the case that"
4826 echo "some name entry should be inserted into other non-second"
4827 echo "shard, but inserted into the secod shard by wrong"
# Stub set with a local lctl; MDSCOUNT * 5 directories are created under it.
4829 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4830 $LCTL set_param fail_loc=0x1628 fail_val=1
4831 createmany -d $DIR/$tdir/striped_dir/d $((MDSCOUNT * 5)) ||
4832 error "(2) Fail to create file under striped directory"
4833 $LCTL set_param fail_loc=0 fail_val=0
4835 echo "Trigger namespace LFSCK to repair bad name hash"
4836 $START_NAMESPACE -r -A ||
4837 error "(3) Fail to start LFSCK for namespace"
4839 wait_all_targets_blocked namespace completed 4
# The counter is queried on mds2 for this variant.
4841 local repaired=$(do_facet mds2 $LCTL get_param -n \
4842 mdd.$(facet_svc mds2).lfsck_namespace |
4843 awk '/^name_hash_repaired/ { print $2 }')
4844 echo "repaired $repaired name entries with bad hash"
4845 [ $repaired -ge 1 ] ||
4846 error "(5) Fail to repair bad name hash: $repaired"
# Remount, then verify that all created entries are reachable and removable.
4848 umount_client $MOUNT || error "(6) umount failed"
4849 mount_client $MOUNT || error "(7) mount failed"
4851 for ((i = 0; i < $((MDSCOUNT * 5)); i++)); do
4852 stat $DIR/$tdir/striped_dir/d$i ||
4853 error "(8) Fail to stat d$i after LFSCK"
4854 rmdir $DIR/$tdir/striped_dir/d$i ||
4855 error "(9) Fail to unlink d$i after LFSCK"
4858 rmdir $DIR/$tdir/striped_dir ||
4859 error "(10) Fail to remove the striped directory after LFSCK"
4861 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test_31c: the master object of a striped directory is created without its
# master LMV EA and nothing is created under it afterwards; the namespace LFSCK
# is expected to re-generate the EA. Needs at least two MDTs.
4864 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4867 echo "For some reason, the master MDT-object of the striped directory"
4868 echo "may lost its master LMV EA. If nobody created files under the"
4869 echo "master directly after the master LMV EA lost, then the LFSCK"
4870 echo "should re-generate the master LMV EA."
4873 check_mount_and_prep
4875 echo "Inject failure stub on MDT0 to simulate the case that the"
4876 echo "master MDT-object of the striped directory lost the LMV EA."
# The stub is active only while the striped directory is being created.
4878 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4879 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4880 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4881 error "(1) Fail to create striped directory"
4882 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4884 echo "Trigger namespace LFSCK to re-generate master LMV EA"
4885 $START_NAMESPACE -r -A ||
4886 error "(2) Fail to start LFSCK for namespace"
4888 wait_all_targets_blocked namespace completed 3
# Exactly one striped directory must be counted as repaired.
4890 local repaired=$($SHOW_NAMESPACE |
4891 awk '/^striped_dirs_repaired/ { print $2 }')
4892 [ $repaired -eq 1 ] ||
4893 error "(4) Fail to re-generate master LMV EA: $repaired"
4895 umount_client $MOUNT || error "(5) umount failed"
4896 mount_client $MOUNT || error "(6) mount failed"
# After the remount the directory must list as empty; any output fails (7).
4898 local empty=$(ls $DIR/$tdir/striped_dir/)
4899 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
4901 rmdir $DIR/$tdir/striped_dir ||
4902 error "(8) Fail to remove the striped directory after LFSCK"
4904 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# test_31d: like test_31c, but a file is created under the broken striped
# directory before the LFSCK runs. The LFSCK must then NOT re-generate the
# master LMV EA; the directory is expected to reject new files until the
# immutable flag is cleared with chattr -i. Needs at least two MDTs.
4907 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4910 echo "For some reason, the master MDT-object of the striped directory"
4911 echo "may lost its master LMV EA. If somebody created files under the"
4912 echo "master directly after the master LMV EA lost, then the LFSCK"
4913 echo "should NOT re-generate the master LMV EA, instead, it should"
4914 echo "change the broken striped dirctory as read-only to prevent"
4915 echo "further damage"
4918 check_mount_and_prep
4920 echo "Inject failure stub on MDT0 to simulate the case that the"
4921 echo "master MDT-object of the striped directory lost the LMV EA."
4923 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4924 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4925 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4926 error "(1) Fail to create striped directory"
4927 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# Remount, then modify the broken directory before the LFSCK is started.
4929 umount_client $MOUNT || error "(2) umount failed"
4930 mount_client $MOUNT || error "(3) mount failed"
4932 touch $DIR/$tdir/striped_dir/dummy ||
4933 error "(4) Fail to touch under broken striped directory"
4935 echo "Trigger namespace LFSCK to find out the inconsistency"
4936 $START_NAMESPACE -r -A ||
4937 error "(5) Fail to start LFSCK for namespace"
4939 wait_all_targets_blocked namespace completed 6
# Zero repairs expected: the EA must not be re-generated in this case.
4941 local repaired=$($SHOW_NAMESPACE |
4942 awk '/^striped_dirs_repaired/ { print $2 }')
4943 [ $repaired -eq 0 ] ||
4944 error "(7) Re-generate master LMV EA unexpected: $repaired"
4946 stat $DIR/$tdir/striped_dir/dummy ||
4947 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# Inverted check: a successful touch is the failure condition here.
4949 touch $DIR/$tdir/striped_dir/foo &&
4950 error "(9) The broken striped directory should be read-only"
# Clear the immutable flag so the directory can be removed.
4952 chattr -i $DIR/$tdir/striped_dir ||
4953 error "(10) Fail to chattr on the broken striped directory"
4955 rmdir $DIR/$tdir/striped_dir ||
4956 error "(11) Fail to remove the striped directory after LFSCK"
4958 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test_31e: a slave object of a striped directory is created without its slave
# LMV EA (stub on MDT0 with fail_val=0); the namespace LFSCK must re-generate
# it. Needs at least two MDTs.
4961 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4964 echo "For some reason, the slave MDT-object of the striped directory"
4965 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4966 echo "slave LMV EA."
4969 check_mount_and_prep
4971 echo "Inject failure stub on MDT0 to simulate the case that the"
4972 echo "slave MDT-object (that resides on the same MDT as the master"
4973 echo "MDT-object resides on) lost the LMV EA."
# The stub is active only while the striped directory is being created.
4975 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4976 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
4977 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4978 error "(1) Fail to create striped directory"
4979 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4981 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4982 $START_NAMESPACE -r -A ||
4983 error "(2) Fail to start LFSCK for namespace"
4985 wait_all_targets_blocked namespace completed 3
# Exactly one shard must be counted as repaired.
4987 local repaired=$($SHOW_NAMESPACE |
4988 awk '/^striped_shards_repaired/ { print $2 }')
4989 [ $repaired -eq 1 ] ||
4990 error "(4) Fail to re-generate slave LMV EA: $repaired"
4992 rmdir $DIR/$tdir/striped_dir ||
4993 error "(5) Fail to remove the striped directory after LFSCK"
4995 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test_31f: same scenario as test_31e but with fail_val=1, and the repair
# counter is read from mds2. Needs at least two MDTs.
4998 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5001 echo "For some reason, the slave MDT-object of the striped directory"
5002 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5003 echo "slave LMV EA."
5006 check_mount_and_prep
5008 echo "Inject failure stub on MDT0 to simulate the case that the"
5009 echo "slave MDT-object (that resides on different MDT as the master"
5010 echo "MDT-object resides on) lost the LMV EA."
# The stub is active only while the striped directory is being created.
5012 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5013 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
5014 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5015 error "(1) Fail to create striped directory"
5016 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5018 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5019 $START_NAMESPACE -r -A ||
5020 error "(2) Fail to start LFSCK for namespace"
5022 wait_all_targets_blocked namespace completed 3
# The repaired shard is counted in the namespace status of mds2.
5024 local repaired=$(do_facet mds2 $LCTL get_param -n \
5025 mdd.$(facet_svc mds2).lfsck_namespace |
5026 awk '/^striped_shards_repaired/ { print $2 }')
5027 [ $repaired -eq 1 ] ||
5028 error "(4) Fail to re-generate slave LMV EA: $repaired"
5030 rmdir $DIR/$tdir/striped_dir ||
5031 error "(5) Fail to remove the striped directory after LFSCK"
5033 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test_31g: the slave LMV EA of one shard carries a wrong stripe index; the
# namespace LFSCK must repair it so that the striped directory is usable again.
# Needs at least two MDTs.
5036 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5039 echo "For some reason, the stripe index in the slave LMV EA is"
5040 echo "corrupted. The LFSCK should repair the slave LMV EA."
5043 check_mount_and_prep
5045 echo "Inject failure stub on MDT0 to simulate the case that the"
5046 echo "slave LMV EA on the first shard of the striped directory"
5047 echo "claims the same index as the second shard claims"
# The stub is active only while the striped directory is being created.
5049 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
5050 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
5051 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5052 error "(1) Fail to create striped directory"
5053 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5055 echo "Trigger namespace LFSCK to repair the slave LMV EA"
5056 $START_NAMESPACE -r -A ||
5057 error "(2) Fail to start LFSCK for namespace"
5059 wait_all_targets_blocked namespace completed 3
# Exactly one shard must be counted as repaired.
5061 local repaired=$($SHOW_NAMESPACE |
5062 awk '/^striped_shards_repaired/ { print $2 }')
5063 [ $repaired -eq 1 ] ||
5064 error "(4) Fail to repair slave LMV EA: $repaired"
# Remount, then prove the directory accepts create/unlink and can be removed.
5066 umount_client $MOUNT || error "(5) umount failed"
5067 mount_client $MOUNT || error "(6) mount failed"
5069 touch $DIR/$tdir/striped_dir/foo ||
5070 error "(7) Fail to touch file after the LFSCK"
5072 rm -f $DIR/$tdir/striped_dir/foo ||
5073 error "(8) Fail to unlink file after the LFSCK"
5075 rmdir $DIR/$tdir/striped_dir ||
5076 error "(9) Fail to remove the striped directory after LFSCK"
5078 run_test 31g "Repair the corrupted slave LMV EA"
# test_31h: the name entry of one shard inside a striped directory is created
# with a wrong index; the namespace LFSCK must repair that entry (counted as
# dirent_repaired). Needs at least two MDTs.
5081 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5084 echo "For some reason, the shard's name entry in the striped"
5085 echo "directory may be corrupted. The LFSCK should repair the"
5086 echo "bad shard's name entry."
5089 check_mount_and_prep
5091 echo "Inject failure stub on MDT0 to simulate the case that the"
5092 echo "first shard's name entry in the striped directory claims"
5093 echo "the same index as the second shard's name entry claims."
# The stub is active only while the striped directory is being created.
5095 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
5096 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5097 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5098 error "(1) Fail to create striped directory"
5099 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5101 echo "Trigger namespace LFSCK to repair the shard's name entry"
5102 $START_NAMESPACE -r -A ||
5103 error "(2) Fail to start LFSCK for namespace"
5105 wait_all_targets_blocked namespace completed 3
# Exactly one directory entry must be counted as repaired.
5107 local repaired=$($SHOW_NAMESPACE |
5108 awk '/^dirent_repaired/ { print $2 }')
5109 [ $repaired -eq 1 ] ||
5110 error "(4) Fail to repair shard's name entry: $repaired"
# Remount, then prove the directory accepts create/unlink and can be removed.
5112 umount_client $MOUNT || error "(5) umount failed"
5113 mount_client $MOUNT || error "(6) mount failed"
5115 touch $DIR/$tdir/striped_dir/foo ||
5116 error "(7) Fail to touch file after the LFSCK"
5118 rm -f $DIR/$tdir/striped_dir/foo ||
5119 error "(8) Fail to unlink file after the LFSCK"
5121 rmdir $DIR/$tdir/striped_dir ||
5122 error "(9) Fail to remove the striped directory after LFSCK"
5124 run_test 31h "Repair the corrupted shard's name entry"
# test_32a: the layout LFSCK must still be stoppable after ost1 has been
# stopped while the scan is in phase 1.
5129 umount_client $MOUNT
# NOTE(review): fail_val=3 with ENGINE_DELAY presumably slows the engine so the
# scan is still in phase 1 when checked - confirm the unit against the stub.
5131 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5132 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5133 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
5135 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5136 [ "$STATUS" == "scanning-phase1" ] ||
5137 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
# Take ost1 down while the scan is running, then clear the delay stub.
5140 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
5142 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
5146 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
# Bring ost1 back for the tests that follow.
5148 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5149 error "(5) Fail to start ost1"
5151 run_test 32a "stop LFSCK when some OST failed"
# test_32b: the namespace LFSCK must still be stoppable after mds2 has been
# stopped while the scan is in phase 1. Needs at least two MDTs.
5155 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# A directory on MDT index 1 with two striped sub-directories gives the scan
# cross-MDT objects to handle.
5158 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5159 error "(1) Fail to create $DIR/$tdir/dp"
5160 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5161 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5162 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5163 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5164 umount_client $MOUNT
5166 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5167 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5168 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
# Poll the namespace status on the first MDS until it reports scanning-phase1.
5170 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5171 mdd.${MDT_DEV}.lfsck_namespace |
5172 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5174 error "(5) unexpected status"
# Take mds2 down while the scan is running, then clear the delay stub.
5178 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
5180 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
5184 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
# Bring mds2 back for the tests that follow.
5186 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5187 error "(8) Fail to start MDT2"
5189 run_test 32b "stop LFSCK when some MDT failed"
# test_33: the "param" line of the LFSCK status output must reflect the options
# the scan was started with, for both the layout and the namespace component.
# Layout: --dryrun -o -r is expected to show as "dryrun,all_targets,orphan".
5195 $START_LAYOUT --dryrun -o -r ||
5196 error "(1) Fail to start layout LFSCK"
5197 wait_all_targets_blocked layout completed 2
5199 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5200 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5201 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# Namespace: -e abort -A -r is expected to show as "failout,all_targets".
5203 $START_NAMESPACE -e abort -A -r ||
5204 error "(4) Fail to start namespace LFSCK"
5205 wait_all_targets_blocked namespace completed 5
5207 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5208 [ "$PARAMS" == "failout,all_targets" ] ||
5209 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5211 run_test 33 "check LFSCK paramters"
# test_34: ZFS-only test with at least two MDTs. A remote directory is created
# while the NO_AGENTOBJ stub is set; the first namespace LFSCK must repair one
# dirent and a second run must find nothing left to repair.
5215 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5216 [ $(facet_fstype $SINGLEMDS) != zfs ] &&
5217 skip "Only valid for ZFS backend" && return
5221 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
5222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5223 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5224 error "(1) Fail to create $DIR/$tdir/dummy"
5226 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# First pass: wait for completion on the first MDS, expect exactly one repair.
5227 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
5228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5229 mdd.${MDT_DEV}.lfsck_namespace |
5230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5232 error "(3) unexpected status"
5235 local repaired=$($SHOW_NAMESPACE |
5236 awk '/^dirent_repaired/ { print $2 }')
5237 [ $repaired -eq 1 ] ||
5238 error "(4) Fail to repair the lost agent object: $repaired"
# Second pass: the repair must be persistent, so nothing may be repaired again.
5240 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5241 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5242 mdd.${MDT_DEV}.lfsck_namespace |
5243 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5245 error "(6) unexpected status"
5248 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5249 [ $repaired -eq 0 ] ||
5250 error "(7) Unexpected repairing: $repaired"
5252 run_test 34 "LFSCK can rebuild the lost agent object"
# test_35: with at least two MDTs, a remote directory is created while the
# NO_AGENTENT stub is set on mds2. The first namespace LFSCK must repair one
# agent entry on mds2; after a full restart a second run must repair nothing.
5256 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5260 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
5261 do_facet mds2 $LCTL set_param fail_loc=0x1631
5262 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5263 error "(1) Fail to create $DIR/$tdir/dummy"
5266 do_facet mds2 $LCTL set_param fail_loc=0
# First pass. Only mds2 is polled, so the failure message names MDS2 directly:
# the former ${k} is never assigned in this test.
5267 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
5268 wait_update_facet mds2 "$LCTL get_param -n \
5269 mdd.$(facet_svc mds2).lfsck_namespace |
5270 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5271 error "(3) MDS2 is not the expected 'completed'"
5273 local repaired=$(do_facet mds2 $LCTL get_param -n \
5274 mdd.$(facet_svc mds2).lfsck_namespace |
5275 awk '/^agent_entries_repaired/ { print $2 }')
5276 [ $repaired -eq 1 ] ||
5277 error "(4) Fail to repair the lost agent entry: $repaired"
# Restart so the second pass does not run against cached objects.
5279 echo "stopall to cleanup object cache"
5282 setupall > /dev/null
# Second pass: the repair must be persistent, so the counter has to be zero.
5284 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
5285 wait_update_facet mds2 "$LCTL get_param -n \
5286 mdd.$(facet_svc mds2).lfsck_namespace |
5287 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5288 error "(6) MDS2 is not the expected 'completed'"
5290 repaired=$(do_facet mds2 $LCTL get_param -n \
5291 mdd.$(facet_svc mds2).lfsck_namespace |
5292 awk '/^agent_entries_repaired/ { print $2 }')
5293 [ $repaired -eq 0 ] ||
5294 error "(7) Unexpected repairing: $repaired"
5296 run_test 35 "LFSCK can rebuild the lost agent entry"
# test_36a: three mirrored files each lose one mirror from their LOV EA (mirror
# split under the LOST_MDTOBJ stub); the layout LFSCK in orphan mode must
# rebuild the missing mirror from the OST objects. Needs at least three OSTs.
5299 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5302 echo "The target MDT-object's LOV EA corrupted as to lose one of the "
5303 echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
5304 echo "with the PFID EA of related OST-object(s) belong to the mirror."
5307 check_mount_and_prep
# Grant state is printed now and the same dump is handed to stack_trap.
# NOTE(review): stack_trap presumably runs it when the test ends - confirm.
5311 lctl get_param osc.*.*grant*
5312 stack_trap "lfs df $DIR; lfs df -i $DIR; lctl get_param osc.*.*grant*"
# Each file gets three mirrors; every mirror has two components, one placed on
# two OSTs and one on a single OST.
5314 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5315 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5316 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5317 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5318 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
5319 error "(1) Fail to create mirror file $DIR/$tdir/f1"
5320 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5321 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
5322 error "(2) Fail to create mirror file $DIR/$tdir/f2"
# Write 4 MiB to each file and resync its mirrors.
5324 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5325 error "(3) Fail to write $DIR/$tdir/f0"
5326 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
5327 error "(4) Fail to write $DIR/$tdir/f1"
5328 dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
5329 error "(5) Fail to write $DIR/$tdir/f2"
5331 $LFS mirror resync $DIR/$tdir/f0 ||
5332 error "(6) Fail to resync $DIR/$tdir/f0"
5333 $LFS mirror resync $DIR/$tdir/f1 ||
5334 error "(7) Fail to resync $DIR/$tdir/f1"
5335 $LFS mirror resync $DIR/$tdir/f2 ||
5336 error "(8) Fail to resync $DIR/$tdir/f2"
5338 cancel_lru_locks mdc
5339 cancel_lru_locks osc
5341 $LFS getstripe $DIR/$tdir/f0 ||
5342 error "(9) Fail to getstripe for $DIR/$tdir/f0"
5343 $LFS getstripe $DIR/$tdir/f1 ||
5344 error "(10) Fail to getstripe for $DIR/$tdir/f1"
5345 $LFS getstripe $DIR/$tdir/f2 ||
5346 error "(11) Fail to getstripe for $DIR/$tdir/f2"
5348 echo "Inject failure, to simulate the case of missing one mirror in LOV"
# A different mirror id (1, 2, 3) is split off and deleted (-d) from each file
# while the stub is set on mds1.
5349 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5350 do_facet mds1 $LCTL set_param fail_loc=0x1616
5352 $LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
5353 error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
5354 $LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
5355 error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
5356 $LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
5357 error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"
5361 do_facet mds1 $LCTL set_param fail_loc=0
# Inverted checks: finding the split mirror id in the layout is the failure.
5363 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
5364 error "(15) The 1st of mirror is not destroyed"
5365 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
5366 error "(16) The 2nd of mirror is not destroyed"
5367 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
5368 error "(17) The 3rd of mirror is not destroyed"
# Two mirrors must remain per file before the LFSCK runs.
5372 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5373 [ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
5374 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5375 [ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
5376 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5377 [ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"
5379 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5380 $START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"
5382 for k in $(seq $MDSCOUNT); do
5383 # The LFSCK status query internal is 30 seconds. For the case
5384 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5385 # time to guarantee the status sync up.
5386 wait_update_facet mds${k} "$LCTL get_param -n \
5387 mdd.$(facet_svc mds${k}).lfsck_layout |
5388 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5389 error "(22) MDS${k} is not the expected 'completed'"
# Every OST must report a completed layout scan as well.
5392 for k in $(seq $OSTCOUNT); do
5393 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5394 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5395 awk '/^status/ { print $2 }')
5396 [ "$cur_status" == "completed" ] ||
5397 error "(23) OST${k} Expect 'completed', but got '$cur_status'"
# Nine orphans expected: three files, each with one lost mirror of three
# OST objects (per the setstripe layouts above).
5400 local repaired=$(do_facet mds1 $LCTL get_param -n \
5401 mdd.$(facet_svc mds1).lfsck_layout |
5402 awk '/^repaired_orphan/ { print $2 }')
5403 [ $repaired -eq 9 ] ||
5404 error "(24) Expect 9 fixed on mds1, but got: $repaired"
# All three mirrors must be back on every file.
5406 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5407 [ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
5408 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5409 [ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
5410 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5411 [ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"
# The specific mirror id removed from each file must be present again; the
# full layout is printed before failing to help diagnosis.
5413 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
5414 $LFS getstripe $DIR/$tdir/f0
5415 error "(28) The 1st of mirror is not recovered"
5418 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
5419 $LFS getstripe $DIR/$tdir/f1
5420 error "(29) The 2nd of mirror is not recovered"
5423 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
5424 $LFS getstripe $DIR/$tdir/f2
5425 error "(30) The 3rd of mirror is not recovered"
5428 run_test 36a "rebuild LOV EA for mirrored file (1)"
# test_36b: a fully resynced three-mirror file loses its MDT object (rm under
# the LOST_MDTOBJ stub); the layout LFSCK in orphan mode must re-create the
# file under .lustre/lost+found/MDT0000/ with all three mirrors.
# Needs at least three OSTs and is skipped when FILESET is set.
5431 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5432 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5435 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5436 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5437 echo "with the PFID EA of related OST-object(s) belong to the file. "
5440 check_mount_and_prep
5442 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5443 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5444 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# The FID is saved because the recovered file is looked up by it later.
5446 local fid=$($LFS path2fid $DIR/$tdir/f0)
5448 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5449 error "(1) Fail to write $DIR/$tdir/f0"
5450 $LFS mirror resync $DIR/$tdir/f0 ||
5451 error "(2) Fail to resync $DIR/$tdir/f0"
5453 cancel_lru_locks mdc
5454 cancel_lru_locks osc
5456 $LFS getstripe $DIR/$tdir/f0 ||
5457 error "(3) Fail to getstripe for $DIR/$tdir/f0"
5459 echo "Inject failure, to simulate the case of missing the MDT-object"
5460 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5461 do_facet mds1 $LCTL set_param fail_loc=0x1616
5462 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
5466 do_facet mds1 $LCTL set_param fail_loc=0
5468 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5469 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
5471 for k in $(seq $MDSCOUNT); do
5472 # The LFSCK status query internal is 30 seconds. For the case
5473 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5474 # time to guarantee the status sync up.
5475 wait_update_facet mds${k} "$LCTL get_param -n \
5476 mdd.$(facet_svc mds${k}).lfsck_layout |
5477 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5478 error "(6) MDS${k} is not the expected 'completed'"
# Every OST must report a completed layout scan as well.
5481 for k in $(seq $OSTCOUNT); do
5482 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5483 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5484 awk '/^status/ { print $2 }')
5485 [ "$cur_status" == "completed" ] ||
5486 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
# Nine orphans expected: three mirrors of three OST objects each.
5489 local count=$(do_facet mds1 $LCTL get_param -n \
5490 mdd.$(facet_svc mds1).lfsck_layout |
5491 awk '/^repaired_orphan/ { print $2 }')
5492 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# The rebuilt file is expected at <fid>-R-0 with 3 mirrors and 6 components.
5494 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5495 count=$($LFS getstripe --mirror-count $name)
5496 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5498 count=$($LFS getstripe --component-count $name)
5499 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
# Each mirror id must show up in the rebuilt layout; the full layout is
# printed before failing to help diagnosis.
5501 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5502 $LFS getstripe $name
5503 error "(11) The 1st of mirror is not recovered"
5506 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5507 $LFS getstripe $name
5508 error "(12) The 2nd of mirror is not recovered"
5511 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5512 $LFS getstripe $name
5513 error "(13) The 3rd of mirror is not recovered"
5516 run_test 36b "rebuild LOV EA for mirrored file (2)"
# test_36c: a two-mirror file is written again after its resync, so it is not
# in sync when it loses its MDT object (rm under the LOST_MDTOBJ stub). The
# layout LFSCK in orphan mode must re-create the file under
# .lustre/lost+found/MDT0000/ with the same component flags as before.
# Needs at least three OSTs and is skipped when FILESET is set.
5519 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5520 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5523 echo "The mirrored file has been modified, not resynced yet, then "
5524 echo "lost its MDT-object, but relatd OST-objects are still there. "
5525 echo "The layout LFSCK should rebuild the LOV EA and relatd status "
5526 echo "with the PFID EA of related OST-object(s) belong to the file. "
5529 check_mount_and_prep
5531 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5533 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# The FID is saved because the recovered file is looked up by it later.
5535 local fid=$($LFS path2fid $DIR/$tdir/f0)
5537 # The 1st dd && resync makes all related OST-objects have been written
5538 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5539 error "(1.1) Fail to write $DIR/$tdir/f0"
5540 $LFS mirror resync $DIR/$tdir/f0 ||
5541 error "(1.2) Fail to resync $DIR/$tdir/f0"
5542 # The 2nd dd makes one mirror to be stale
5543 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5544 error "(1.3) Fail to write $DIR/$tdir/f0"
5546 cancel_lru_locks mdc
5547 cancel_lru_locks osc
5549 $LFS getstripe $DIR/$tdir/f0 ||
5550 error "(2) Fail to getstripe for $DIR/$tdir/f0"
# Capture the lcme_flags values found in the first and in the last ten lines
# of the getstripe output; they are compared with the rebuilt layout below.
5552 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5553 awk '/lcme_flags/ { print $2 }')
5554 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5555 awk '/lcme_flags/ { print $2 }')
5557 echo "Inject failure, to simulate the case of missing the MDT-object"
5558 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5559 do_facet mds1 $LCTL set_param fail_loc=0x1616
5560 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
5564 do_facet mds1 $LCTL set_param fail_loc=0
5566 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5567 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
5569 for k in $(seq $MDSCOUNT); do
5570 # The LFSCK status query internal is 30 seconds. For the case
5571 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5572 # time to guarantee the status sync up.
5573 wait_update_facet mds${k} "$LCTL get_param -n \
5574 mdd.$(facet_svc mds${k}).lfsck_layout |
5575 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5576 error "(5) MDS${k} is not the expected 'completed'"
# Every OST must report a completed layout scan as well.
5579 for k in $(seq $OSTCOUNT); do
5580 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5581 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5582 awk '/^status/ { print $2 }')
5583 [ "$cur_status" == "completed" ] ||
5584 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
# Six orphans expected (two mirrors of three OST objects each); the message
# used to say 9, which contradicted the check it belongs to.
5587 local count=$(do_facet mds1 $LCTL get_param -n \
5588 mdd.$(facet_svc mds1).lfsck_layout |
5589 awk '/^repaired_orphan/ { print $2 }')
5590 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
# The rebuilt file is expected at <fid>-R-0 with 2 mirrors and 4 components.
5592 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5593 count=$($LFS getstripe --mirror-count $name)
5594 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5596 count=$($LFS getstripe --component-count $name)
5597 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
# The flags at both ends of the rebuilt layout must match the saved ones.
5599 local flags=$($LFS getstripe $name | head -n 10 |
5600 awk '/lcme_flags/ { print $2 }')
5601 [ "$flags" == "$saved_flags1" ] || {
5602 $LFS getstripe $name
5603 error "(10) expect flags $saved_flags1, got $flags"
5606 flags=$($LFS getstripe $name | tail -n 10 |
5607 awk '/lcme_flags/ { print $2 }')
5608 [ "$flags" == "$saved_flags2" ] || {
5609 $LFS getstripe $name
5610 error "(11) expect flags $saved_flags2, got $flags"
5613 run_test 36c "rebuild LOV EA for mirrored file (3)"
# test_37: runs the namespace LFSCK while a background multiop holds the
# directory d0, and expects the scan to complete.
# NOTE(review): how d0 becomes an orphan is not visible in this listing.
5619 local t_dir="$DIR/$tdir/d0"
5620 check_mount_and_prep
5622 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
5623 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
# NOTE(review): $PID is not assigned on any line shown here; presumably it is
# the background multiop's pid - confirm. The kill sits after error on the
# failure path to release that process.
5627 $START_NAMESPACE -r -A || {
5628 error "(3) Fail to start LFSCK for namespace!"; kill -USR1 $PID; }
5630 wait_all_targets_blocked namespace completed 4
5635 run_test 37 "LFSCK must skip a ORPHAN"
# Check that a foreign file still exposes the foreign LOV EA it was created
# with and still refuses a striping change.
# Arguments: $1 - path of the foreign file
#            $2 - expected lfm_value
# Returns:   always 0; every mismatch is reported through error()
lfsck_38_verify_foreign_file()
{
	local file=$1
	local value=$2

	$LFS getstripe -v $file |
		grep "lfm_magic:.*0x0BD70BD0" ||
		error "$file: invalid LOV EA foreign magic"
	# 73 is the length of the stored value: two 36-character UUIDs
	# joined by '@'
	$LFS getstripe -v $file | grep "lfm_length:.*73" ||
		error "$file: invalid LOV EA foreign size"
	$LFS getstripe -v $file | grep "lfm_type:.*daos" ||
		error "$file: invalid LOV EA foreign type"
	$LFS getstripe -v $file |
		grep "lfm_flags:.*0x0000DA05" ||
		error "$file: invalid LOV EA foreign flags"
	$LFS getstripe $file |
		grep "lfm_value:.*${value}" ||
		error "$file: invalid LOV EA foreign value"

	# modify striping should fail
	$LFS setstripe -c 2 $file &&
		error "$file: setstripe should fail"

	# the expected setstripe failure must not become our exit status
	return 0
}

# test_38: namespace and layout LFSCK must leave a foreign file untouched,
# and the foreign file must not make LFSCK report any repair.
#
# Creates a "daos" foreign file, checks its LOV EA, runs both LFSCK types on
# all targets expecting zero repairs, then re-checks the LOV EA, checks that
# read and write are refused, and removes the file.
# Globals:   MDS1_VERSION, DIR, tdir, tfile, LFS, LCTL, FSNAME,
#            START_NAMESPACE, START_LAYOUT (read)
test_38()
{
	[[ $MDS1_VERSION -le $(version_code 2.12.51) ]] &&
		skip "Need MDS version newer than 2.12.51"

	test_mkdir $DIR/$tdir

	local file=$DIR/$tdir/$tfile
	local uuid1=$(cat /proc/sys/kernel/random/uuid)
	local uuid2=$(cat /proc/sys/kernel/random/uuid)
	local repaired

	# create foreign file
	$LFS setstripe --foreign=daos --flags 0xda05 \
		-x "${uuid1}@${uuid2}" $file ||
		error "$file: create failed"

	lfsck_38_verify_foreign_file $file "${uuid1}@${uuid2}"

	$START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 1

	# check that "global" namespace_repaired == 0 !!!
	repaired=$(do_facet mds1 \
		 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
		 awk '/^namespace_repaired/ { print \\\$2 }'")
	[ "$repaired" -eq 0 ] ||
		error "(2) Expect no namespace repair, but got: $repaired"

	$START_LAYOUT -A -r || error "Fail to start LFSCK for layout"

	wait_all_targets_blocked layout completed 2

	# check that "global" layout_repaired == 0 !!!
	repaired=$(do_facet mds1 \
		 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
		 awk '/^layout_repaired/ { print \\\$2 }'")
	[ "$repaired" -eq 0 ] ||
		error "(2) Expect no layout repair, but got: $repaired"

	echo "post-lfsck checks of foreign file"

	lfsck_38_verify_foreign_file $file "${uuid1}@${uuid2}"

	# R/W should fail
	cat $file && error "$file: read should fail"
	cat /etc/passwd > $file &&
		error "$file: write should fail"

	#remove foreign file
	rm $file ||
		error "$file: remove of foreign file has failed"
}
run_test 38 "LFSCK does not break foreign file and reverse is also true"
# Check that a foreign directory still exposes the foreign LMV EA it was
# created with, refuses file creation, and accepts chmod and chown.
# Arguments: $1 - path of the foreign directory
#            $2 - expected lfm_value
# Globals:   LFS, tfile, RUNAS_ID, RUNAS_GID (read)
# Returns:   always 0; every mismatch is reported through error()
lfsck_39_verify_foreign_dir()
{
	local dir=$1
	local value=$2

	$LFS getdirstripe -v $dir |
		grep "lfm_magic:.*0x0CD50CD0" ||
		error "$dir: invalid LMV EA magic"
	# lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
	# - sizeof(lfm_type) - sizeof(lfm_flags)
	$LFS getdirstripe -v $dir | grep "lfm_length:.*73" ||
		error "$dir: invalid LMV EA size"
	$LFS getdirstripe -v $dir | grep "lfm_type:.*daos" ||
		error "$dir: invalid LMV EA type"
	$LFS getdirstripe -v $dir |
		grep "lfm_flags:.*0x0000DA05" ||
		error "$dir: invalid LMV EA flags"
	$LFS getdirstripe $dir |
		grep "lfm_value.*${value}" ||
		error "$dir: invalid LMV EA value"

	# file create in dir should fail
	touch $dir/$tfile &&
		error "$dir: file create should fail"

	# chmod should work
	chmod 777 $dir ||
		error "$dir: chmod failed"

	# chown should work
	chown $RUNAS_ID:$RUNAS_GID $dir ||
		error "$dir: chown failed"

	return 0
}

# test_39: namespace and layout LFSCK must leave a foreign directory
# untouched, and the foreign directory must not make LFSCK report any repair.
#
# Creates a "daos" foreign directory, checks its LMV EA and basic operations,
# runs both LFSCK types on all targets expecting zero repairs, repeats the
# checks, and removes the directory.
# Globals:   MDS1_VERSION, DIR, tdir, LFS, LCTL, FSNAME,
#            START_NAMESPACE, START_LAYOUT (read)
test_39()
{
	# same version gate as test_38, using the cached MDS1_VERSION
	[[ $MDS1_VERSION -le $(version_code 2.12.51) ]] &&
		skip "Need MDS version newer than 2.12.51"

	test_mkdir $DIR/$tdir

	local dir=$DIR/$tdir/${tdir}2
	local uuid1=$(cat /proc/sys/kernel/random/uuid)
	local uuid2=$(cat /proc/sys/kernel/random/uuid)
	local repaired

	# create foreign dir
	$LFS mkdir --foreign=daos --xattr="${uuid1}@${uuid2}" --flags=0xda05 \
		$dir ||
		error "$dir: create failed"

	lfsck_39_verify_foreign_dir $dir "${uuid1}@${uuid2}"

	$START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 1

	# check that "global" namespace_repaired == 0 !!!
	repaired=$(do_facet mds1 \
		 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
		 awk '/^namespace_repaired/ { print \\\$2 }'")
	[ "$repaired" -eq 0 ] ||
		error "(2) Expect nothing to be repaired, but got: $repaired"

	$START_LAYOUT -A -r || error "Fail to start LFSCK for layout"

	wait_all_targets_blocked layout completed 2

	# check that "global" layout_repaired == 0 !!!
	repaired=$(do_facet mds1 \
		 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
		 awk '/^layout_repaired/ { print \\\$2 }'")
	[ "$repaired" -eq 0 ] ||
		error "(2) Expect no layout repair, but got: $repaired"

	echo "post-lfsck checks of foreign dir"

	lfsck_39_verify_foreign_dir $dir "${uuid1}@${uuid2}"

	#remove foreign dir
	rmdir $dir ||
		error "$dir: remove of foreign dir has failed"
}
run_test 39 "LFSCK does not break foreign dir and reverse is also true"
# test_40a: a composite layout must be unchanged by a layout LFSCK run that
# follows a directory migration between MDTs.
#
# Creates a directory on MDT1 with a three-component default layout, creates
# a file in it, migrates the directory to MDT0, runs layout LFSCK and
# compares the file's layout parameters before and after.
# Globals:   MDSCOUNT, DIR, tdir, LFS, LCTL, SINGLEMDS, MDT_DEV (read)
test_40a()
{
	[[ $MDSCOUNT -ge 2 ]] || skip "needs >= 2 MDTs"

	local dir=$DIR/$tdir/dir1
	local file=$dir/f1

	check_mount_and_prep

	# Every setup step is checked: if one of them fails silently the
	# final layout comparison no longer tests the migrate + LFSCK path.
	$LFS mkdir -i 1 $dir || error "Fail to mkdir $dir on MDT1"
	$LFS setstripe -E 1M -c1 -S 1M -E 128M -c2 -S 4M -E eof $dir ||
		error "Fail to set composite layout on $dir"

	touch $file || error "Fail to create $file"
	local layout1=$(get_layout_param $file)

	echo "Migrate $DIR/$tdir/dir1 from MDT1 to MDT0"
	$LFS migrate -m 0 $dir || error "Fail to migrate $dir to MDT0"

	echo "trigger LFSCK for layout"
	do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -t layout -r ||
		error "(1) Fail to start LFSCK for layout!"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_layout |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(2) unexpected status"
	}

	local layout2=$(get_layout_param $file)

	[[ "$layout1" == "$layout2" ]] || error "layout lost after lfsck"
}
run_test 40a "LFSCK correctly fixes lmm_oi in composite layout"
# Put MDSSIZE, OSTSIZE and OSTCOUNT back to their SAVED_* copies.
MDSSIZE=$SAVED_MDSSIZE
OSTSIZE=$SAVED_OSTSIZE
OSTCOUNT=$SAVED_OSTCOUNT

# Final pass: run cleanup_and_setup_lustre with REFORMAT=yes for that call
# only, then hand over to check_and_cleanup_lustre.
REFORMAT=yes cleanup_and_setup_lustre

check_and_cleanup_lustre