3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
# Locate the Lustre tree: keep a caller-supplied LUSTRE, otherwise fall back
# to the parent of the directory that contains this script.
11 LUSTRE=${LUSTRE:-$(dirname $0)/..}
# Source the shared test framework from that tree into the current shell.
12 . $LUSTRE/tests/test-framework.sh
# Seed the skip list from SANITY_LFSCK_EXCEPT; the trailing space lets later
# lines append further test names to it.
16 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT "
18 # DNE does not support striped directory on zfs-based backend yet.
# When the backend reported for $SINGLEMDS is not ldiskfs, tests are appended
# to ALWAYS_EXCEPT.
# NOTE(review): in the lines visible here the trailing `&&` guards only the
# next command (the 31a-31d append); the 31e-31h append further down looks
# unconditional. Confirm whether both appends are meant to be guarded.
19 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
21 # bug number for skipped test: LU-5855 LU-5855 LU-5855 LU-5855
22 ALWAYS_EXCEPT+=" 31a 31b 31c 31d"
23 # bug number for skipped test: LU-5855 LU-5855 LU-5855 LU-5855
24 ALWAYS_EXCEPT+=" 31e 31f 31g 31h"
25 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
27 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
28 # bug number for skipped test: LU-4165
31 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
32 # bug number for skipped test: LU-1267
34 # bug number for skipped test: LU-3950
36 # bug number for skipped test: LU-3593
38 # bug number for skipped test: LU-3590
40 # bug number for skipped test: LU-3591
42 # bug number for skipped test: LU-3594 LU-3594
43 ALWAYS_EXCEPT+=" 16 17"
44 # bug number for skipped test: LU-3336 LU-3336 LU-3336 LU-3336 LU-3336
45 ALWAYS_EXCEPT+=" 18a 18b 18c 18d 18e"
46 # bug number for skipped test: LU-3951 LU-3951
47 ALWAYS_EXCEPT+=" 19a 19b"
48 # bug number for skipped test: LU-4887 LU-4887
49 ALWAYS_EXCEPT+=" 20 21"
51 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
52 # bug number for skipped test: LU-4788
54 # bug number for skipped test: LU-5511 LU-5511 LU-5511
55 ALWAYS_EXCEPT+=" 2e 22a 22b"
56 # bug number for skipped test: LU-4788
58 # bug number for skipped test: LU-5512 LU-5512 LU-5512
59 ALWAYS_EXCEPT+=" 23a 23b 23c"
60 # bug number for skipped test: LU-5513
62 # bug number for skipped test: LU-5515
64 # bug number for skipped test: LU-5516 LU-5516
65 ALWAYS_EXCEPT+=" 26a 26b"
66 # bug number for skipped test: LU-5516 LU-5516
67 ALWAYS_EXCEPT+=" 27a 27b"
68 # bug number for skipped test: LU-5506
70 # bug number for skipped test: LU-5517 LU-5517 LU-5517
71 ALWAYS_EXCEPT+=" 29a 29b 29c"
72 # bug number for skipped test: LU-5518
74 # bug number for skipped test: LU-5519 LU-5519 LU-5519 LU-5519
75 ALWAYS_EXCEPT+=" 31a 31b 31c 31d"
76 # bug number for skipped test: LU-5519 LU-5519 LU-5519 LU-5519
77 ALWAYS_EXCEPT+=" 31e 31f 31g 31h"
# With SLOW=no, EXCEPT_SLOW is set to the empty string.
79 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
# Leave the script with status 0 when require_dsh_mds fails.
82 require_dsh_mds || exit 0
# When check_versions fails, report a skip; per the message this is the
# interoperation (mixed-version) case.
86 if ! check_versions; then
87 skip "It is NOT necessary to test lfsck under interoperation mode"
# MDS older than 2.3.60: report a skip and exit 0. Note that `exit 0` only
# runs if `skip` itself returns success.
91 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
92 skip "Need MDS version at least 2.3.60" && exit 0
# Snapshot the current MDSSIZE/OSTSIZE/OSTCOUNT before the overrides that
# follow modify them.
# NOTE(review): presumably these are restored at the end of the script; the
# restore is not visible here, so confirm.
96 SAVED_MDSSIZE=${MDSSIZE}
97 SAVED_OSTSIZE=${OSTSIZE}
98 SAVED_OSTCOUNT=${OSTCOUNT}
99 # use small MDS + OST sizes to speed up formatting
100 # do not make MDSSIZE/OSTSIZE too small, since that affects the default journal size
# On a zfs backend, force the MDT size (for $SINGLEMDS) and the OST size
# (for ost1) to 300000. The unit is not stated here.
102 [ $(facet_fstype $SINGLEMDS) == zfs ] && MDSSIZE=300000
104 [ $(facet_fstype ost1) == zfs ] && OSTSIZE=300000
106 # there is no need for many OSTs; cap the count to reduce format/start/stop overhead
# Cap the number of OSTs at 4.
108 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
110 # build up a clean test environment.
# REFORMAT=yes is passed in the environment of this one call only.
111 REFORMAT="yes" check_and_setup_lustre
# Lustre target names of MDT index 0000 and OST index 0000 for $FSNAME.
113 MDT_DEV="${FSNAME}-MDT0000"
114 OST_DEV="${FSNAME}-OST0000"
# Device for the MDS facet: every "mds" substring is removed from $SINGLEMDS
# and the remainder is handed to mdsdevname.
115 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# The strings below are command lines, not functions. They are expanded
# unquoted at the call sites (e.g. `$START_NAMESPACE -r`), so extra lctl
# options can be appended there. $SINGLEMDS, $LCTL and the target names are
# substituted now, at assignment time.
#
# Start a namespace LFSCK on the MDT.
116 START_NAMESPACE="do_facet $SINGLEMDS \
117 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
# Start a layout LFSCK on the MDT.
118 START_LAYOUT="do_facet $SINGLEMDS \
119 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
# Start a layout LFSCK directly on the OST, via facet ost1.
120 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# Stop LFSCK on the MDT.
121 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Dump the namespace LFSCK parameter of the MDT (values only, -n).
122 SHOW_NAMESPACE="do_facet $SINGLEMDS \
123 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
# Dump the layout LFSCK parameter of the MDT.
124 SHOW_LAYOUT="do_facet $SINGLEMDS \
125 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
# Dump the layout LFSCK parameter of the OST (obdfilter namespace).
126 SHOW_LAYOUT_ON_OST="do_facet ost1 \
127 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# MDS mount option sets: user_xattr only, user_xattr plus noscrub, and
# user_xattr plus skip_lfsck. The last one does not include $MDS_MOUNT_OPTS.
128 MOUNT_OPTS_SCRUB="$MDS_MOUNT_OPTS -o user_xattr"
129 MOUNT_OPTS_NOSCRUB="$MDS_MOUNT_OPTS -o user_xattr,noscrub"
130 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
139 echo "preparing... $nfiles * $ndirs files will be created $(date)."
140 if [ ! -z $igif ]; then
141 #define OBD_FAIL_FID_IGIF 0x1504
142 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
145 cp $LUSTRE/tests/*.sh $DIR/$tdir/
146 if [ $ndirs -gt 0 ]; then
147 createmany -d $DIR/$tdir/d $ndirs
148 createmany -m $DIR/$tdir/f $ndirs
149 if [ $nfiles -gt 0 ]; then
150 for ((i = 0; i < $ndirs; i++)); do
151 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
152 /dev/null || error "createmany $nfiles"
155 createmany -d $DIR/$tdir/e $ndirs
158 if [ ! -z $igif ]; then
159 touch $DIR/$tdir/dummy
160 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
163 echo "prepared $(date)."
166 run_e2fsck_on_mdt0() {
167 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
169 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
170 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
172 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
173 error "(2) Detected inconsistency on MDT0"
175 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
176 error "(3) Fail to start MDT0"
179 wait_all_targets_blocked() {
184 local count=$(do_facet mds1 \
185 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
186 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
187 [[ $count -eq $MDSCOUNT ]] || {
188 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
189 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
198 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
199 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
200 "$MDSCOUNT" $LTIME || {
201 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
202 error "($err) some MDTs are not in ${status}"
209 #define OBD_FAIL_LFSCK_DELAY1 0x1600
210 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
211 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
213 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
215 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
216 [ "$STATUS" == "scanning-phase1" ] ||
217 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
219 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
221 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
222 [ "$STATUS" == "stopped" ] ||
223 error "(6) Expect 'stopped', but got '$STATUS'"
225 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
227 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
228 [ "$STATUS" == "scanning-phase1" ] ||
229 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
231 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
232 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
233 mdd.${MDT_DEV}.lfsck_namespace |
234 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
236 error "(9) unexpected status"
239 local repaired=$($SHOW_NAMESPACE |
240 awk '/^updated_phase1/ { print $2 }')
241 [ $repaired -eq 0 ] ||
242 error "(10) Expect nothing to be repaired, but got: $repaired"
244 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
245 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
246 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
247 mdd.${MDT_DEV}.lfsck_namespace |
248 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
250 error "(12) unexpected status"
253 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
254 [ $((scanned1 + 1)) -eq $scanned2 ] ||
255 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
257 echo "stopall, should NOT crash LU-3649"
258 stopall || error "(14) Fail to stopall"
260 run_test 0 "Control LFSCK manually"
265 #define OBD_FAIL_FID_INDIR 0x1501
266 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
267 touch $DIR/$tdir/dummy
269 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
271 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
272 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
273 mdd.${MDT_DEV}.lfsck_namespace |
274 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
276 error "(4) unexpected status"
279 local repaired=$($SHOW_NAMESPACE |
280 awk '/^dirent_repaired/ { print $2 }')
281 # for interop with old server
282 [ -z "$repaired" ] &&
283 repaired=$($SHOW_NAMESPACE |
284 awk '/^updated_phase1/ { print $2 }')
286 [ $repaired -eq 1 ] ||
287 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
291 mount_client $MOUNT || error "(6) Fail to start client!"
293 #define OBD_FAIL_FID_LOOKUP 0x1505
294 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
295 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
297 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
299 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
303 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
304 skip "OI Scrub not implemented for ZFS" && return
308 #define OBD_FAIL_FID_INLMA 0x1502
309 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
310 touch $DIR/$tdir/dummy
312 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
314 #define OBD_FAIL_FID_NOLMA 0x1506
315 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
316 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
317 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
318 mdd.${MDT_DEV}.lfsck_namespace |
319 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
321 error "(4) unexpected status"
324 local repaired=$($SHOW_NAMESPACE |
325 awk '/^dirent_repaired/ { print $2 }')
326 # for interop with old server
327 [ -z "$repaired" ] &&
328 repaired=$($SHOW_NAMESPACE |
329 awk '/^updated_phase1/ { print $2 }')
331 [ $repaired -eq 1 ] ||
332 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
334 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
337 mount_client $MOUNT || error "(6) Fail to start client!"
339 #define OBD_FAIL_FID_LOOKUP 0x1505
340 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
341 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
343 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
345 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
350 #define OBD_FAIL_FID_IGIF 0x1504
351 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
352 touch $DIR/$tdir/dummy
354 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
356 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
357 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
358 mdd.${MDT_DEV}.lfsck_namespace |
359 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
361 error "(4) unexpected status"
364 local repaired=$($SHOW_NAMESPACE |
365 awk '/^dirent_repaired/ { print $2 }')
366 # for interop with old server
367 [ -z "$repaired" ] &&
368 repaired=$($SHOW_NAMESPACE |
369 awk '/^updated_phase1/ { print $2 }')
371 [ $repaired -eq 1 ] ||
372 error "(5) Fail to repair lost FID-in-dirent: $repaired"
376 mount_client $MOUNT || error "(6) Fail to start client!"
378 #define OBD_FAIL_FID_LOOKUP 0x1505
379 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
380 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
382 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
384 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
389 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
390 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
391 touch $DIR/$tdir/dummy
393 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
395 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
396 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
397 mdd.${MDT_DEV}.lfsck_namespace |
398 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
400 error "(4) unexpected status"
403 local repaired=$($SHOW_NAMESPACE |
404 awk '/^linkea_repaired/ { print $2 }')
405 # for interop with old server
406 [ -z "$repaired" ] &&
407 repaired=$($SHOW_NAMESPACE |
408 awk '/^updated_phase2/ { print $2 }')
410 [ $repaired -eq 1 ] ||
411 error "(5) Fail to repair crashed linkEA: $repaired"
415 mount_client $MOUNT || error "(6) Fail to start client!"
417 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
418 error "(7) Fail to stat $DIR/$tdir/dummy"
420 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
421 local dummyname=$($LFS fid2path $DIR $dummyfid)
422 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
423 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
425 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
431 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
432 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
433 touch $DIR/$tdir/dummy
435 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
437 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
438 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
439 mdd.${MDT_DEV}.lfsck_namespace |
440 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
442 error "(4) unexpected status"
445 local repaired=$($SHOW_NAMESPACE |
446 awk '/^updated_phase2/ { print $2 }')
447 [ $repaired -eq 1 ] ||
448 error "(5) Fail to repair crashed linkEA: $repaired"
452 mount_client $MOUNT || error "(6) Fail to start client!"
454 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
455 error "(7) Fail to stat $DIR/$tdir/dummy"
457 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
458 local dummyname=$($LFS fid2path $DIR $dummyfid)
459 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
460 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
462 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
468 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
469 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
470 touch $DIR/$tdir/dummy
472 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
474 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
475 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
476 mdd.${MDT_DEV}.lfsck_namespace |
477 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
479 error "(4) unexpected status"
482 local repaired=$($SHOW_NAMESPACE |
483 awk '/^updated_phase2/ { print $2 }')
484 [ $repaired -eq 1 ] ||
485 error "(5) Fail to repair crashed linkEA: $repaired"
489 mount_client $MOUNT || error "(6) Fail to start client!"
491 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
492 error "(7) Fail to stat $DIR/$tdir/dummy"
494 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
495 local dummyname=$($LFS fid2path $DIR $dummyfid)
496 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
497 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
499 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
505 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
506 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
507 touch $DIR/$tdir/dummy
509 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
511 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
512 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
513 mdd.${MDT_DEV}.lfsck_namespace |
514 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
516 error "(4) unexpected status"
519 local repaired=$($SHOW_NAMESPACE |
520 awk '/^linkea_repaired/ { print $2 }')
521 [ $repaired -eq 1 ] ||
522 error "(5) Fail to repair crashed linkEA: $repaired"
526 mount_client $MOUNT || error "(6) Fail to start client!"
528 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
529 error "(7) Fail to stat $DIR/$tdir/dummy"
531 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
532 local dummyname=$($LFS fid2path $DIR $dummyfid)
533 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
534 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
536 run_test 2d "LFSCK can recover the missing linkEA entry"
540 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
544 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
546 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
547 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
548 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
549 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
551 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
553 wait_all_targets_blocked namespace completed 4
555 local repaired=$($SHOW_NAMESPACE |
556 awk '/^linkea_repaired/ { print $2 }')
557 [ $repaired -eq 1 ] ||
558 error "(5) Fail to repair crashed linkEA: $repaired"
560 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
561 local name=$($LFS fid2path $DIR $fid)
562 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
563 error "(6) Fail to repair linkEA: $fid $name"
565 run_test 2e "namespace LFSCK can verify remote object linkEA"
571 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
572 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
573 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
575 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
576 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
577 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
579 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
580 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
581 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
583 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
584 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
585 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
587 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
589 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
590 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
591 mdd.${MDT_DEV}.lfsck_namespace |
592 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
594 error "(10) unexpected status"
597 local checked=$($SHOW_NAMESPACE |
598 awk '/^checked_phase2/ { print $2 }')
599 [ $checked -ge 4 ] ||
600 error "(11) Fail to check multiple-linked object: $checked"
602 local repaired=$($SHOW_NAMESPACE |
603 awk '/^multiple_linked_repaired/ { print $2 }')
604 [ $repaired -ge 2 ] ||
605 error "(12) Fail to repair multiple-linked object: $repaired"
607 run_test 3 "LFSCK can verify multiple-linked objects"
611 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
612 skip "OI Scrub not implemented for ZFS" && return
615 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
616 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
618 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
619 echo "start $SINGLEMDS with disabling OI scrub"
620 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
621 error "(2) Fail to start MDS!"
623 #define OBD_FAIL_LFSCK_DELAY2 0x1601
624 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
625 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
626 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
627 mdd.${MDT_DEV}.lfsck_namespace |
628 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
630 error "(5) unexpected status"
633 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
634 [ "$STATUS" == "scanning-phase1" ] ||
635 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
637 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
638 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
639 mdd.${MDT_DEV}.lfsck_namespace |
640 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
642 error "(7) unexpected status"
645 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
646 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
648 local repaired=$($SHOW_NAMESPACE |
649 awk '/^dirent_repaired/ { print $2 }')
650 # for interop with old server
651 [ -z "$repaired" ] &&
652 repaired=$($SHOW_NAMESPACE |
653 awk '/^updated_phase1/ { print $2 }')
655 [ $repaired -ge 9 ] ||
656 error "(9) Fail to re-generate FID-in-dirent: $repaired"
660 mount_client $MOUNT || error "(10) Fail to start client!"
662 #define OBD_FAIL_FID_LOOKUP 0x1505
663 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
664 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
665 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
667 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
671 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
672 skip "OI Scrub not implemented for ZFS" && return
675 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
676 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
678 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
679 echo "start $SINGLEMDS with disabling OI scrub"
680 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
681 error "(2) Fail to start MDS!"
683 #define OBD_FAIL_LFSCK_DELAY2 0x1601
684 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
685 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
686 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
687 mdd.${MDT_DEV}.lfsck_namespace |
688 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
690 error "(5) unexpected status"
693 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
694 [ "$STATUS" == "scanning-phase1" ] ||
695 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
697 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
698 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
699 mdd.${MDT_DEV}.lfsck_namespace |
700 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
702 error "(7) unexpected status"
705 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
706 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
708 local repaired=$($SHOW_NAMESPACE |
709 awk '/^dirent_repaired/ { print $2 }')
710 # for interop with old server
711 [ -z "$repaired" ] &&
712 repaired=$($SHOW_NAMESPACE |
713 awk '/^updated_phase1/ { print $2 }')
715 [ $repaired -ge 2 ] ||
716 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
720 mount_client $MOUNT || error "(10) Fail to start client!"
722 #define OBD_FAIL_FID_LOOKUP 0x1505
723 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
724 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
726 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
728 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
729 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
730 local dummyname=$($LFS fid2path $DIR $dummyfid)
731 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
732 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
734 run_test 5 "LFSCK can handle IGIF object upgrading"
739 #define OBD_FAIL_LFSCK_DELAY1 0x1600
740 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
741 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
743 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
744 [ "$STATUS" == "scanning-phase1" ] ||
745 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
747 # Sleep 3 sec to guarantee at least one object is processed by LFSCK
749 # Fail the LFSCK to guarantee there is at least one checkpoint
750 #define OBD_FAIL_LFSCK_FATAL1 0x1608
751 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
752 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
753 mdd.${MDT_DEV}.lfsck_namespace |
754 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
756 error "(4) unexpected status"
759 local POS0=$($SHOW_NAMESPACE |
760 awk '/^last_checkpoint_position/ { print $2 }' |
763 #define OBD_FAIL_LFSCK_DELAY1 0x1600
764 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
765 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
767 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
768 [ "$STATUS" == "scanning-phase1" ] ||
769 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
771 local POS1=$($SHOW_NAMESPACE |
772 awk '/^latest_start_position/ { print $2 }' |
774 [[ $POS0 -lt $POS1 ]] ||
775 error "(7) Expect larger than: $POS0, but got $POS1"
777 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
778 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
779 mdd.${MDT_DEV}.lfsck_namespace |
780 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
782 error "(8) unexpected status"
785 run_test 6a "LFSCK resumes from last checkpoint (1)"
790 #define OBD_FAIL_LFSCK_DELAY2 0x1601
791 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
792 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
794 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
795 [ "$STATUS" == "scanning-phase1" ] ||
796 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
798 # Sleep 5 sec to guarantee that we are in the directory scanning
800 # Fail the LFSCK to guarantee there is at least one checkpoint
801 #define OBD_FAIL_LFSCK_FATAL2 0x1609
802 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
803 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
804 mdd.${MDT_DEV}.lfsck_namespace |
805 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
807 error "(4) unexpected status"
810 local O_POS0=$($SHOW_NAMESPACE |
811 awk '/^last_checkpoint_position/ { print $2 }' |
814 local D_POS0=$($SHOW_NAMESPACE |
815 awk '/^last_checkpoint_position/ { print $4 }')
817 #define OBD_FAIL_LFSCK_DELAY2 0x1601
818 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
819 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
821 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
822 [ "$STATUS" == "scanning-phase1" ] ||
823 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
825 local O_POS1=$($SHOW_NAMESPACE |
826 awk '/^latest_start_position/ { print $2 }' |
828 local D_POS1=$($SHOW_NAMESPACE |
829 awk '/^latest_start_position/ { print $4 }')
831 echo "Additional debug for 6b"
833 if [ "$D_POS0" == "N/A" -o "$D_POS0" == "0x0" \
834 -o "$D_POS1" == "0x0" -o "$D_POS1" == "N/A" ]; then
835 [[ $O_POS0 -lt $O_POS1 ]] ||
836 error "(7.1) $O_POS1 is not larger than $O_POS0"
838 [[ $D_POS0 -lt $D_POS1 ]] ||
839 error "(7.2) $D_POS1 is not larger than $D_POS0"
842 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
843 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
844 mdd.${MDT_DEV}.lfsck_namespace |
845 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
847 error "(8) unexpected status"
850 run_test 6b "LFSCK resumes from last checkpoint (2)"
857 #define OBD_FAIL_LFSCK_DELAY2 0x1601
858 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
859 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
861 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
862 [ "$STATUS" == "scanning-phase1" ] ||
863 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
865 # Sleep 3 sec to guarantee at least one object is processed by LFSCK
867 echo "stop $SINGLEMDS"
868 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
870 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
871 echo "start $SINGLEMDS"
872 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
873 error "(5) Fail to start MDS!"
875 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
876 mdd.${MDT_DEV}.lfsck_namespace |
877 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
879 error "(6) unexpected status"
882 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
888 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
889 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
890 for ((i = 0; i < 20; i++)); do
891 touch $DIR/$tdir/dummy${i}
894 #define OBD_FAIL_LFSCK_DELAY3 0x1602
895 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
896 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
897 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
898 mdd.${MDT_DEV}.lfsck_namespace |
899 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
901 error "(4) unexpected status"
905 echo "stop $SINGLEMDS"
906 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
908 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
909 echo "start $SINGLEMDS"
910 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
911 error "(6) Fail to start MDS!"
913 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
914 mdd.${MDT_DEV}.lfsck_namespace |
915 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
917 error "(7) unexpected status"
920 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
925 formatall > /dev/null
931 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
932 [ "$STATUS" == "init" ] ||
933 error "(2) Expect 'init', but got '$STATUS'"
935 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
936 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
937 mkdir $DIR/$tdir/crashed
939 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
940 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
941 for ((i = 0; i < 5; i++)); do
942 touch $DIR/$tdir/dummy${i}
945 umount_client $MOUNT || error "(3) Fail to stop client!"
947 #define OBD_FAIL_LFSCK_DELAY2 0x1601
948 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
949 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
951 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
952 [ "$STATUS" == "scanning-phase1" ] ||
953 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
955 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
957 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
958 [ "$STATUS" == "stopped" ] ||
959 error "(7) Expect 'stopped', but got '$STATUS'"
961 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
963 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
964 [ "$STATUS" == "scanning-phase1" ] ||
965 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
967 #define OBD_FAIL_LFSCK_FATAL2 0x1609
968 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
969 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
970 mdd.${MDT_DEV}.lfsck_namespace |
971 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
973 error "(10) unexpected status"
976 #define OBD_FAIL_LFSCK_DELAY1 0x1600
977 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
978 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
980 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
981 [ "$STATUS" == "scanning-phase1" ] ||
982 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
984 #define OBD_FAIL_LFSCK_CRASH 0x160a
985 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
988 echo "stop $SINGLEMDS"
989 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
991 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
992 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
994 echo "start $SINGLEMDS"
995 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
996 error "(14) Fail to start MDS!"
998 local timeout=$(max_recovery_time)
1001 while [ $timer -lt $timeout ]; do
1002 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1003 mdt.${MDT_DEV}.recovery_status |
1004 awk '/^status/ { print \\\$2 }'")
1005 [ "$STATUS" != "RECOVERING" ] && break;
1007 timer=$((timer + 1))
1010 [ $timer != $timeout ] ||
1011 error "(14.1) recovery timeout"
1013 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1014 [ "$STATUS" == "crashed" ] ||
1015 error "(15) Expect 'crashed', but got '$STATUS'"
1017 #define OBD_FAIL_LFSCK_DELAY2 0x1601
1018 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
1019 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
1021 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1022 [ "$STATUS" == "scanning-phase1" ] ||
1023 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
1025 echo "stop $SINGLEMDS"
1026 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
1028 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1029 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1031 echo "start $SINGLEMDS"
1032 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
1033 error "(19) Fail to start MDS!"
1036 while [ $timer -lt $timeout ]; do
1037 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1038 mdt.${MDT_DEV}.recovery_status |
1039 awk '/^status/ { print \\\$2 }'")
1040 [ "$STATUS" != "RECOVERING" ] && break;
1042 timer=$((timer + 1))
1045 [ $timer != $timeout ] ||
1046 error "(19.1) recovery timeout"
1048 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1049 [ "$STATUS" == "paused" ] ||
1050 error "(20) Expect 'paused', but got '$STATUS'"
1052 echo "stop $SINGLEMDS"
1053 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1055 echo "start $SINGLEMDS without resume LFSCK"
1056 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
1057 error "(20.2) Fail to start MDS!"
1060 while [ $timer -lt $timeout ]; do
1061 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1062 mdt.${MDT_DEV}.recovery_status |
1063 awk '/^status/ { print \\\$2 }'")
1064 [ "$STATUS" != "RECOVERING" ] && break;
1066 timer=$((timer + 1))
1069 [ $timer != $timeout ] ||
1070 error "(20.3) recovery timeout"
1072 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1073 [ "$STATUS" == "paused" ] ||
1074 error "(20.4) Expect 'paused', but got '$STATUS'"
1076 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1077 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1079 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
1080 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1081 mdd.${MDT_DEV}.lfsck_namespace |
1082 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1084 error "(22) unexpected status"
1087 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1088 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1089 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1091 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1092 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1093 mdd.${MDT_DEV}.lfsck_namespace |
1094 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1096 error "(24) unexpected status"
1099 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1100 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1102 run_test 8 "LFSCK state machine"
# Test 9a, registered below as "LFSCK speed control (1)": checks that the
# average_speed_phase1 value reported by the layout LFSCK stays within bounds
# derived from the configured speed limit, before and after the limit is raised.
# NOTE(review): this listing does not show the function header/closing lines
# or the assignments of RUN_TIME1, RUN_TIME2 and TIME_DIFF used below --
# confirm them against the complete script.
# Skip when /proc/cpuinfo has no line matching "processor.*: 1"; the skip
# message describes that case as a UP system.
1105 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1106 skip "Testing on UP system, the speed may be inaccurate."
1110 check_mount_and_prep
1111 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1112 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1113 createmany -o $DIR/$tdir/lfsck/f 5000
# Phase 1 of the check: start the layout LFSCK with -s set to BASE_SPEED1 and
# require that it is still in scanning-phase1 when the speed is sampled.
1115 local BASE_SPEED1=100
1117 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
1120 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1121 [ "$STATUS" == "scanning-phase1" ] ||
1122 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1124 local SPEED=$($SHOW_LAYOUT |
1125 awk '/^average_speed_phase1/ { print $2 }')
1127 # There may be time error, normally it should be less than 2 seconds.
1128 # We allow another 20% schedule error.
1130 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound = BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 * 13 / 10,
# evaluated in shell integer arithmetic.
# NOTE(review): the comment above mentions 20% while the factor applied here
# is 13/10 -- confirm which figure is intended.
1131 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1132 RUN_TIME1 * 13 / 10))
1133 [ $SPEED -lt $MAX_SPEED ] || {
1135 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1136 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1139 # adjust speed limit
1140 local BASE_SPEED2=300
# The new limit is written to the lfsck_speed_limit parameter of the MDT
# device while the scan started above is expected to still be running.
1142 do_facet $SINGLEMDS \
1143 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1146 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1147 # MIN_MARGIN = 0.7 = 7 / 10
# Lower bound: the two limits weighted by their run times (each shortened by
# TIME_DIFF), divided by the total run time, then scaled by 7/10.
1148 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1149 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1150 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
1151 [ $SPEED -gt $MIN_SPEED ] || {
# On a non-ldiskfs MDS the shortfall is reported through error_ignore with
# ticket LU-5624; the handling for ldiskfs is only partly visible here.
1152 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1153 error_ignore LU-5624 \
1154 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1157 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1161 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound for the combined run: same weighting as MIN_SPEED but with
# TIME_DIFF added to each run time and a 13/10 factor.
1162 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1163 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1164 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1165 [ $SPEED -lt $MAX_SPEED ] || {
1167 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1168 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1169 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on all MDT and OST nodes (presumably "no limit"
# -- TODO confirm), then wait for the layout LFSCK status to read 'completed'.
1172 do_nodes $(comma_list $(mdts_nodes)) \
1173 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1174 do_nodes $(comma_list $(osts_nodes)) \
1175 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1177 wait_update_facet $SINGLEMDS \
1178 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1179 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1180 error "(7) Failed to get expected 'completed'"
1182 run_test 9a "LFSCK speed control (1)"
# Test 9b, registered below as "LFSCK speed control (2)": same bounds check as
# the layout speed test, but against the namespace LFSCK and its
# average_speed_phase2 value.
# NOTE(review): the function header/closing lines and the assignments of
# RUN_TIME1, RUN_TIME2 and TIME_DIFF are not visible in this listing.
# Skip when /proc/cpuinfo has no line matching "processor.*: 1".
1185 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1186 skip "Testing on UP system, the speed may be inaccurate."
# Populate the test directory while fail_loc 0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE) is set on the MDS: 50 directories, 50 files,
# and 50 more files inside each directory.
1192 echo "Preparing another 50 * 50 files (with error) at $(date)."
1193 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1194 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1195 createmany -d $DIR/$tdir/d 50
1196 createmany -m $DIR/$tdir/f 50
1197 for ((i = 0; i < 50; i++)); do
1198 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# With fail_loc 0x160c (OBD_FAIL_LFSCK_NO_DOUBLESCAN) set, run a namespace
# LFSCK and wait until its status reads 'stopped'; the later start (without
# -r) is then expected to be found in scanning-phase2.
1201 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1202 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1203 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1204 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1205 mdd.${MDT_DEV}.lfsck_namespace |
1206 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1208 error "(5) unexpected status"
1211 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1212 echo "Prepared at $(date)."
1214 local BASE_SPEED1=50
1216 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1219 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1220 [ "$STATUS" == "scanning-phase2" ] ||
1221 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1223 local SPEED=$($SHOW_NAMESPACE |
1224 awk '/^average_speed_phase2/ { print $2 }')
1225 # There may be time error, normally it should be less than 2 seconds.
1226 # We allow another 20% schedule error.
1228 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound = BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 * 13 / 10,
# evaluated in shell integer arithmetic.
1229 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1230 RUN_TIME1 * 13 / 10))
1231 [ $SPEED -lt $MAX_SPEED ] || {
1233 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1234 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1237 # adjust speed limit
1238 local BASE_SPEED2=150
1240 do_facet $SINGLEMDS \
1241 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1244 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1245 # MIN_MARGIN = 0.7 = 7 / 10
# Lower bound: the two limits weighted by their run times (each shortened by
# TIME_DIFF), divided by the total run time, then scaled by 7/10.
1246 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1247 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1248 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
1249 [ $SPEED -gt $MIN_SPEED ] || {
# On a non-ldiskfs MDS the shortfall is reported through error_ignore with
# ticket LU-5624; the handling for ldiskfs is only partly visible here.
1250 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1251 error_ignore LU-5624 \
1252 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1255 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1259 # MAX_MARGIN = 1.3 = 13 / 10
1260 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1261 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1262 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1263 [ $SPEED -lt $MAX_SPEED ] || {
1265 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1266 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1267 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on all MDT and OST nodes (presumably "no limit"
# -- TODO confirm), then wait for the namespace LFSCK to read 'completed'.
1270 do_nodes $(comma_list $(mdts_nodes)) \
1271 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1272 do_nodes $(comma_list $(osts_nodes)) \
1273 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1274 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1275 mdd.${MDT_DEV}.lfsck_namespace |
1276 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1278 error "(11) unexpected status"
1281 run_test 9b "LFSCK speed control (2)"
# Test 10, registered below as "System is available during LFSCK scanning":
# starts a rate-limited namespace LFSCK and performs ordinary file operations
# while the scan is still reported as scanning-phase1.
# NOTE(review): the function header and loop/brace closers are not visible in
# this listing.
# Runs only when the MDS backend is ldiskfs; otherwise the test is skipped.
1285 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1286 skip "lookup(..)/linkea on ZFS issue" && return
1290 echo "Preparing more files with error at $(date)."
# Even-numbered d<i>/f<i> entries are created under fail_loc 0x1603
# (OBD_FAIL_LFSCK_LINKEA_CRASH).
1291 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1292 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1294 for ((i = 0; i < 1000; i = $((i+2)))); do
1295 mkdir -p $DIR/$tdir/d${i}
1296 touch $DIR/$tdir/f${i}
1297 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd-numbered entries are created under fail_loc 0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE).
1300 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1301 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1303 for ((i = 1; i < 1000; i = $((i+2)))); do
1304 mkdir -p $DIR/$tdir/d${i}
1305 touch $DIR/$tdir/f${i}
1306 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1309 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1310 echo "Prepared at $(date)."
1312 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1314 umount_client $MOUNT
1315 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the namespace LFSCK with -s 100 and confirm it is in scanning-phase1.
1317 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1320 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1321 [ "$STATUS" == "scanning-phase1" ] ||
1322 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Operations attempted while the scan runs: recursive ls, create, mkdir,
# unlink, recursive remove, rename, hard link and symlink. Each must succeed.
1324 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1326 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1328 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1330 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1332 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1334 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1336 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1338 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1339 error "(14) Fail to softlink!"
# The scan must still be in scanning-phase1 after those operations.
1341 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1342 [ "$STATUS" == "scanning-phase1" ] ||
1343 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 on all MDT and OST nodes, then wait for the
# namespace LFSCK status to read 'completed'.
1345 do_nodes $(comma_list $(mdts_nodes)) \
1346 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1347 do_nodes $(comma_list $(osts_nodes)) \
1348 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1349 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1350 mdd.${MDT_DEV}.lfsck_namespace |
1351 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1353 error "(16) unexpected status"
1356 run_test 10 "System is available during LFSCK scanning"
# ost_remove_lastid: mount the backing filesystem of ost${ost} locally, delete
# O/${idx}/LAST_ID and O/${idx}/d0/0 under that OST's mount point, then
# unmount it again.
# Returns 1 if the local mount fails and 2 if the unmount fails.
# NOTE(review): the assignments of `ost` and `idx` and the closing brace are
# not visible in this listing; presumably they come from $1 and $2 (a caller
# below passes "1 0") -- confirm against the complete script.
1359 ost_remove_lastid() {
1362 local rcmd="do_facet ost${ost}"
1364 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1366 # step 1: local mount
1367 mount_fstype ost${ost} || return 1
1368 # step 2: remove the specified LAST_ID
# The brace expansion yields two paths: .../LAST_ID and .../d0/0.
1369 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1371 unmount_fstype ost${ost} || return 2
# Test 11a, registered below as "LFSCK can rebuild lost last_id": removes the
# LAST_ID file on ost1 and checks that a layout LFSCK started on the OST first
# reports the crashed_lastid flag and then completes with empty flags.
# NOTE(review): the function header/closers and some lines between the file
# creation and the LAST_ID removal are not visible in this listing.
1375 check_mount_and_prep
1376 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1377 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1382 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
# ost1 is started with $MOUNT_OPTS_NOSCRUB.
1384 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1385 error "(2) Fail to start ost1"
# fail_loc 0x160e (OBD_FAIL_LFSCK_DELAY4) with fail_val=3 is set on ost1
# before the scan so the intermediate flag can be observed.
1387 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1388 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1390 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1391 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# Wait for the 'flags' field of the OST's lfsck_layout to read crashed_lastid.
1393 wait_update_facet ost1 "$LCTL get_param -n \
1394 obdfilter.${OST_DEV}.lfsck_layout |
1395 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1397 error "(5) unexpected status"
# Clear the injected delay and wait for the scan status to read 'completed'.
1400 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1402 wait_update_facet ost1 "$LCTL get_param -n \
1403 obdfilter.${OST_DEV}.lfsck_layout |
1404 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1406 error "(6) unexpected status"
1409 echo "the LAST_ID(s) should have been rebuilt"
1410 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1411 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1413 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b, registered below as "LFSCK can rebuild crashed last_id": makes the
# on-disk last_id on ost1 lag behind the id the MDS has used, then checks that
# a layout LFSCK on the OST brings last_id up to at least that value.
# NOTE(review): the function header/closers and the body tail of the polling
# loop are not visible in this listing.
1416 check_mount_and_prep
1417 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1419 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1420 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1421 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than the count returned by precreated_ost_obj_count.
1423 local count=$(precreated_ost_obj_count 0 0)
1425 createmany -o $DIR/$tdir/f $((count + 32))
# Record the sequence and last id from the osp prealloc_last_seq and
# prealloc_last_id parameters on mds1.
1427 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1428 local seq=$(do_facet mds1 $LCTL get_param -n \
1429 osp.${proc_path}.prealloc_last_seq)
1430 local id_used=$(do_facet mds1 $LCTL get_param -n \
1431 osp.${proc_path}.prealloc_last_id)
1433 umount_client $MOUNT
1434 stop ost1 || error "(1) Fail to stop ost1"
# ost1 is restarted with fail_loc 0x215 (OBD_FAIL_OST_ENOSPC) set.
1436 #define OBD_FAIL_OST_ENOSPC 0x215
1437 do_facet ost1 $LCTL set_param fail_loc=0x215
1439 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1440 error "(2) Fail to start ost1"
# Poll up to 60 times for the last_id entry whose line matches $seq; the
# value after the ':' separator is taken as the on-disk id.
1442 for ((i = 0; i < 60; i++)); do
1443 id_ost1=$(do_facet ost1 \
1444 "$LCTL get_param -n obdfilter.$ost1_svc.last_id" |
1445 awk -F: "/$seq/ { print \$2 }")
1446 [ -n "$id_ost1" ] && break
1450 echo "the on-disk LAST_ID should be smaller than the expected one"
1451 [ $id_used -gt $id_ost1 ] ||
1452 error "(4) expect id_used '$id_used' > id_ost1 '$id_ost1'"
1454 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1455 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1457 wait_update_facet ost1 \
1458 "$LCTL get_param -n obdfilter.$ost1_svc.lfsck_layout |
1459 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1461 error "(6) unexpected status"
# Restart ost1 so the check below reads last_id after a fresh start.
1464 stop ost1 || error "(7) Fail to stop ost1"
1466 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1467 error "(8) Fail to start ost1"
1469 echo "the on-disk LAST_ID should have been rebuilt"
1470 # last_id may be larger than $id_used if objects were created/skipped
1471 wait_update_facet_cond ost1 \
1472 "$LCTL get_param -n obdfilter.$ost1_svc.last_id |
1473 awk -F: '/$seq/ { print \\\$2 }'" "-ge" "$id_used" 60 || {
1474 do_facet ost1 $LCTL get_param obdfilter.$ost1_svc.last_id
1475 error "(9) expect last_id >= id_used $seq:$id_used"
1478 do_facet ost1 $LCTL set_param fail_loc=0
1479 stopall || error "(10) Fail to stopall"
1481 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12a, registered below as "single command to trigger LFSCK on all
# devices": drives namespace and then layout LFSCK on every target with single
# lfsck_start/lfsck_stop commands using -A, checking the status after each step.
# NOTE(review): the function header and loop closers are not visible in this
# listing.
# Requires at least two MDTs.
1484 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1486 check_mount_and_prep
# One directory per MDT (index k - 1), each populated with 100 files.
1487 for k in $(seq $MDSCOUNT); do
1488 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1489 createmany -o $DIR/$tdir/${k}/f 100 ||
1490 error "(0) Fail to create 100 files."
# Namespace LFSCK: start with -s 1, expect scanning-phase1, stop, expect
# stopped, restart with -s 0, expect completed.
1493 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1494 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1495 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1497 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1498 wait_all_targets namespace scanning-phase1 3
1500 echo "Stop namespace LFSCK on all targets by single lctl command."
1501 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1502 error "(4) Fail to stop LFSCK on all devices!"
1504 echo "All the LFSCK targets should be in 'stopped' status."
1505 wait_all_targets_blocked namespace stopped 5
1507 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1508 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1509 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1511 echo "All the LFSCK targets should be in 'completed' status."
1512 wait_all_targets_blocked namespace completed 7
# The layout half of the test runs between start_full_debug_logging and
# stop_full_debug_logging.
1514 start_full_debug_logging
1516 echo "Start layout LFSCK on all targets by single command (-s 1)."
1517 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1518 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1520 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1521 wait_all_targets layout scanning-phase1 9
1523 echo "Stop layout LFSCK on all targets by single lctl command."
1524 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1525 error "(10) Fail to stop LFSCK on all devices!"
1527 echo "All the LFSCK targets should be in 'stopped' status."
1528 wait_all_targets_blocked layout stopped 11
# Additionally read the layout LFSCK status on each OST directly and require
# 'stopped'.
1530 for k in $(seq $OSTCOUNT); do
1531 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1532 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1533 awk '/^status/ { print $2 }')
1534 [ "$STATUS" == "stopped" ] ||
1535 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1538 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1539 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1540 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1542 echo "All the LFSCK targets should be in 'completed' status."
1543 wait_all_targets_blocked layout completed 14
1545 stop_full_debug_logging
1547 run_test 12a "single command to trigger LFSCK on all devices"
# Test 12b, registered below as "auto detect Lustre device": starts LFSCK with
# -A but without -M and expects both namespace and layout scans to complete;
# when mds1 hosts more than one MDT device, starting without -M/-A must fail.
# NOTE(review): the function header and the `fi`/brace closers are not visible
# in this listing.
1550 check_mount_and_prep
1552 echo "Start LFSCK without '-M' specified."
1553 do_facet mds1 $LCTL lfsck_start -A -r ||
1554 error "(0) Fail to start LFSCK without '-M'"
1556 wait_all_targets_blocked namespace completed 1
1557 wait_all_targets_blocked layout completed 2
# Count the `lctl dl` entries on mds1 whose third column contains "mdt".
1559 local count=$(do_facet mds1 $LCTL dl |
1560 awk '{ print $3 }' | grep mdt | wc -l)
1561 if [ $count -gt 1 ]; then
1563 echo "Start layout LFSCK on the node with multipe targets,"
1564 echo "but not specify '-M'/'-A' option. Should get failure."
# Success of lfsck_start is the error case here; the trailing `|| true`
# keeps the expected failure from producing a non-zero status.
1566 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1567 error "(3) Start layout LFSCK should fail" || true
1570 run_test 12b "auto detect Lustre device"
# Test 13, registered below as "LFSCK can repair crashed lmm_oi": creates two
# files while a bad-lmm_oi fault is injected on the MDS, runs the layout
# LFSCK, and expects repaired_others to equal 2.
# NOTE(review): the function header/closers are not visible in this listing.
1574 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1575 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1576 echo "MDT-object FID."
1579 check_mount_and_prep
1581 echo "Inject failure stub to simulate bad lmm_oi"
1582 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1583 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
# Two files are created under the fault: one via createmany and f1 via a
# composite setstripe layout (-E ... -E -1).
1584 createmany -o $DIR/$tdir/f 1
1585 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1586 error "(0) Fail to create PFL $DIR/$tdir/f1"
1587 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1589 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1590 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1592 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1593 mdd.${MDT_DEV}.lfsck_layout |
1594 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1596 error "(2) unexpected status"
# The expected repair count matches the two files created above.
1599 local repaired=$($SHOW_LAYOUT |
1600 awk '/^repaired_others/ { print $2 }')
1601 [ $repaired -eq 2 ] ||
1602 error "(3) Fail to repair crashed lmm_oi: $repaired"
1604 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14a, registered below as "LFSCK can repair MDT-object with dangling LOV
# EA reference (1)": creates files whose OST-objects are missing, confirms a
# plain layout LFSCK only counts them, then confirms a run with -c makes the
# guard files stat-able with size 0.
# NOTE(review): the function header, loop/brace closers and the command that
# follows the "stopall" message are not visible in this listing.
1608 echo "The OST-object referenced by the MDT-object should be there;"
1609 echo "otherwise, the LFSCK should re-create the missing OST-object."
1610 echo "without '--delay-create-ostobj' option."
1613 check_mount_and_prep
1614 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1616 echo "Inject failure stub to simulate dangling referenced MDT-object"
1617 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1618 do_facet ost1 $LCTL set_param fail_loc=0x1610
1619 local count=$(precreated_ost_obj_count 0 0)
# Files created while the fault is set: count + 16 plain files, guard0,
# 16 composite-layout files, and guard1.
1621 createmany -o $DIR/$tdir/f $((count + 16)) ||
1622 error "(0.1) Fail to create $DIR/$tdir/fx"
1623 touch $DIR/$tdir/guard0
1625 for ((i = 0; i < 16; i++)); do
1626 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1627 $DIR/$tdir/f_comp${i} ||
1628 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1630 touch $DIR/$tdir/guard1
1632 do_facet ost1 $LCTL set_param fail_loc=0
1634 start_full_debug_logging
1636 # exhaust other pre-created dangling cases
1637 count=$(precreated_ost_obj_count 0 0)
1638 createmany -o $DIR/$tdir/a $count ||
1639 error "(0.5) Fail to create $count files."
1641 echo "'ls' should fail because of dangling referenced MDT-object"
1642 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: -r only. At least 32 dangling references must be counted, yet
# stat on the guard files must still fail afterwards.
1644 echo "Trigger layout LFSCK to find out dangling reference"
1645 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1647 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1648 mdd.${MDT_DEV}.lfsck_layout |
1649 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1651 error "(3) unexpected status"
1654 local repaired=$($SHOW_LAYOUT |
1655 awk '/^repaired_dangling/ { print $2 }')
1656 [ $repaired -ge 32 ] ||
1657 error "(4) Fail to repair dangling reference: $repaired"
1659 echo "'stat' should fail because of not repair dangling by default"
1660 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1661 error "(5.1) stat should fail"
1662 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1663 error "(5.2) stat should fail"
# Second pass: -r -c. After it completes, both guard files are expected to
# become stat-able with Size 0.
1665 echo "Trigger layout LFSCK to repair dangling reference"
1666 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1668 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1669 mdd.${MDT_DEV}.lfsck_layout |
1670 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1672 error "(7) unexpected status"
1675 # There may be some async LFSCK updates in processing, wait for
1676 # a while until the target reparation has been done. LU-4970.
1678 echo "'stat' should success after layout LFSCK repairing"
1679 wait_update_facet client "stat $DIR/$tdir/guard0 |
1680 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1681 stat $DIR/$tdir/guard0
1683 error "(8.1) unexpected size"
1686 wait_update_facet client "stat $DIR/$tdir/guard1 |
1687 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1688 stat $DIR/$tdir/guard1
1690 error "(8.2) unexpected size"
1693 repaired=$($SHOW_LAYOUT |
1694 awk '/^repaired_dangling/ { print $2 }')
1695 [ $repaired -ge 32 ] ||
1696 error "(9) Fail to repair dangling reference: $repaired"
1698 stop_full_debug_logging
1700 echo "stopall to cleanup object cache"
1703 setupall > /dev/null
1705 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Test 14b, registered below as "LFSCK can repair MDT-object with dangling LOV
# EA reference (2)": same dangling-reference scenario as the (1) variant, but
# the layout LFSCK is started with the extra -o and -d options and completion
# is awaited on all targets.
# NOTE(review): the function header/closers and the command that follows the
# "stopall" message are not visible in this listing. The introductory message
# mentions '--delay-create-ostobj'; presumably that is what -d selects --
# confirm against the lctl documentation.
1709 echo "The OST-object referenced by the MDT-object should be there;"
1710 echo "otherwise, the LFSCK should re-create the missing OST-object."
1711 echo "with '--delay-create-ostobj' option."
1714 check_mount_and_prep
1715 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1717 echo "Inject failure stub to simulate dangling referenced MDT-object"
1718 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1719 do_facet ost1 $LCTL set_param fail_loc=0x1610
1720 local count=$(precreated_ost_obj_count 0 0)
# count + 31 files plus the guard file are created while the fault is set.
1722 createmany -o $DIR/$tdir/f $((count + 31))
1723 touch $DIR/$tdir/guard
1724 do_facet ost1 $LCTL set_param fail_loc=0
1726 start_full_debug_logging
1728 # exhaust other pre-created dangling cases
1729 count=$(precreated_ost_obj_count 0 0)
1730 createmany -o $DIR/$tdir/a $count ||
1731 error "(0) Fail to create $count files."
1733 echo "'ls' should fail because of dangling referenced MDT-object"
1734 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: without -c. At least 32 dangling references must be counted,
# yet stat on the guard file must still fail afterwards.
1736 echo "Trigger layout LFSCK to find out dangling reference"
1737 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1739 wait_all_targets_blocked layout completed 3
1741 local repaired=$($SHOW_LAYOUT |
1742 awk '/^repaired_dangling/ { print $2 }')
1743 [ $repaired -ge 32 ] ||
1744 error "(4) Fail to repair dangling reference: $repaired"
1746 echo "'stat' should fail because of not repair dangling by default"
1747 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second pass: with -c added. The guard file is then expected to become
# stat-able with Size 0.
1749 echo "Trigger layout LFSCK to repair dangling reference"
1750 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1752 wait_all_targets_blocked layout completed 7
1754 # There may be some async LFSCK updates in processing, wait for
1755 # a while until the target reparation has been done. LU-4970.
1757 echo "'stat' should success after layout LFSCK repairing"
1758 wait_update_facet client "stat $DIR/$tdir/guard |
1759 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1760 stat $DIR/$tdir/guard
1762 error "(8) unexpected size"
1765 repaired=$($SHOW_LAYOUT |
1766 awk '/^repaired_dangling/ { print $2 }')
1767 [ $repaired -ge 32 ] ||
1768 error "(9) Fail to repair dangling reference: $repaired"
1770 stop_full_debug_logging
1772 echo "stopall to cleanup object cache"
1775 setupall > /dev/null
1777 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Test 15a, registered below as "LFSCK can repair unmatched MDT-object/
# OST-object pairs (1)": writes files while fail_loc 0x1611 is set on every
# OST node, runs the layout LFSCK, and expects repaired_unmatched_pair >= 3.
# NOTE(review): the function header/closers and the continuation line naming
# the setstripe target (f1, per the error message) are not visible in this
# listing.
1781 echo "If the OST-object referenced by the MDT-object back points"
1782 echo "to some non-exist MDT-object, then the LFSCK should repair"
1783 echo "the OST-object to back point to the right MDT-object."
1786 check_mount_and_prep
1787 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1789 echo "Inject failure stub to make the OST-object to back point to"
1790 echo "non-exist MDT-object."
1791 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1793 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1794 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1795 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1797 error "(0) Fail to create PFL $DIR/$tdir/f1"
1798 # 'dd' will trigger punch RPC firstly on every OST-objects.
1799 # So even though some OST-object will not be write by 'dd',
1800 # as long as it is allocated (may be NOT allocated in pfl_3b)
1801 # its layout information will be set also.
1802 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
# The osc locks are cancelled before the fault injection is cleared.
1803 cancel_lru_locks osc
1804 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1806 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1807 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1809 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1810 mdd.${MDT_DEV}.lfsck_layout |
1811 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1813 error "(2) unexpected status"
1816 local repaired=$($SHOW_LAYOUT |
1817 awk '/^repaired_unmatched_pair/ { print $2 }')
1818 [ $repaired -ge 3 ] ||
1819 error "(3) Fail to repair unmatched pair: $repaired"
1821 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b, registered below as "LFSCK can repair unmatched MDT-object/
# OST-object pairs (2)": writes files while fail_loc 0x1612 is set on every
# OST node, runs the layout LFSCK, and expects repaired_unmatched_pair == 4.
# NOTE(review): the function header/closers, the initial assignment of
# `stripes`, and the continuation line naming the setstripe target are not
# visible in this listing.
1825 echo "If the OST-object referenced by the MDT-object back points"
1826 echo "to other MDT-object that doesn't recognize the OST-object,"
1827 echo "then the LFSCK should repair it to back point to the right"
1828 echo "MDT-object (the first one)."
1831 check_mount_and_prep
# A guard file is written before the fault is injected.
1832 mkdir -p $DIR/$tdir/0
1833 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1834 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1835 cancel_lru_locks osc
1837 echo "Inject failure stub to make the OST-object to back point to"
1838 echo "other MDT-object"
# Use two stripes for the first component when at least two OSTs exist.
1841 [ $OSTCOUNT -ge 2 ] && stripes=2
1843 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1844 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1845 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1846 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1848 error "(0) Fail to create PFL $DIR/$tdir/f1"
1849 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1850 cancel_lru_locks osc
1851 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1853 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1854 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1856 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1857 mdd.${MDT_DEV}.lfsck_layout |
1858 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1860 error "(2) unexpected status"
1863 local repaired=$($SHOW_LAYOUT |
1864 awk '/^repaired_unmatched_pair/ { print $2 }')
1865 [ $repaired -eq 4 ] ||
1866 error "(3) Fail to repair unmatched pair: $repaired"
1868 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c, registered below as "LFSCK can repair unmatched MDT-object/
# OST-object pairs (3)": races a layout LFSCK against a delayed directory
# migration and expects one unmatched-pair repair and zero
# multiple-referenced repairs.
# NOTE(review): the function header/closers are not visible in this listing.
# Requires at least two MDTs, and is skipped when the MDS version code is
# 2.7.55 or later (see LU-6437 in the skip message).
1871 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1873 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1874 skip "Skip the test after 2.7.55 see LU-6437" && return
1877 echo "According to current metadata migration implementation,"
1878 echo "before the old MDT-object is removed, both the new MDT-object"
1879 echo "and old MDT-object will reference the same LOV layout. Then if"
1880 echo "the layout LFSCK finds the new MDT-object by race, it will"
1881 echo "regard related OST-object(s) as multiple referenced case, and"
1882 echo "will try to create new OST-object(s) for the new MDT-object."
1883 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1884 echo "MDT-object before confirm the multiple referenced case."
1887 check_mount_and_prep
# Directory a1 is created with `lfs mkdir -i 1` and holds one 1 MiB file.
1888 $LFS mkdir -i 1 $DIR/$tdir/a1
1889 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1890 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1891 cancel_lru_locks osc
1893 echo "Inject failure stub on MDT1 to delay the migration"
1895 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1896 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# The migration is launched in the background so the LFSCK below can start
# while it is still in progress.
1897 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1898 $LFS migrate -m 0 $DIR/$tdir/a1 &
1901 echo "Trigger layout LFSCK to race with the migration"
1902 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1904 wait_all_targets_blocked layout completed 2
1906 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1907 local repaired=$($SHOW_LAYOUT |
1908 awk '/^repaired_unmatched_pair/ { print $2 }')
1909 [ $repaired -eq 1 ] ||
1910 error "(3) Fail to repair unmatched pair: $repaired"
1912 repaired=$($SHOW_LAYOUT |
1913 awk '/^repaired_multiple_referenced/ { print $2 }')
1914 [ $repaired -eq 0 ] ||
1915 error "(4) Unexpectedly repaird multiple references: $repaired"
1917 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16, registered below as "LFSCK can repair inconsistent MDT-object/
# OST-object owner": changes a file's owner while fail_loc 0x1613 is set on
# the MDS, runs the layout LFSCK, and expects repaired_inconsistent_owner == 1.
# NOTE(review): the function header/closers and the creation of directory d1
# are not visible in this listing.
1921 echo "If the OST-object's owner information does not match the owner"
1922 echo "information stored in the MDT-object, then the LFSCK trust the"
1923 echo "MDT-object and update the OST-object's owner information."
1926 check_mount_and_prep
1927 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1928 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1929 cancel_lru_locks osc
1931 # created but no setattr or write to the file.
# 100 files are created under d1 as the $RUNAS user; only the chown of f0
# below happens under the injected fault.
1933 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
1934 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
1936 echo "Inject failure stub to skip OST-object owner changing"
1937 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1938 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1939 chown 1.1 $DIR/$tdir/f0
1940 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1942 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1945 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1947 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1948 mdd.${MDT_DEV}.lfsck_layout |
1949 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1951 error "(2) unexpected status"
1954 local repaired=$($SHOW_LAYOUT |
1955 awk '/^repaired_inconsistent_owner/ { print $2 }')
1956 [ $repaired -eq 1 ] ||
1957 error "(3) Fail to repair inconsistent owner: $repaired"
1959 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17, registered below as "LFSCK can repair multiple references": makes
# f0 and f1 share OST-objects with `guard` via fail_loc 0x1614, runs the
# layout LFSCK, expects repaired_multiple_referenced == 2, and then verifies
# that rewriting f0/f1 no longer changes the size of `guard`.
# NOTE(review): the function header/closers, the end of the introductory
# message, and the continuation line naming the setstripe target (f1, per the
# error message) are not visible in this listing.
1963 echo "If more than one MDT-objects reference the same OST-object,"
1964 echo "and the OST-object only recognizes one MDT-object, then the"
1965 echo "LFSCK should create new OST-objects for such non-recognized"
1969 check_mount_and_prep
1970 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1972 echo "Inject failure stub to make two MDT-objects to refernce"
1973 echo "the OST-object"
1975 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1976 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# `guard` is written first (1 MiB); mdc and osc locks are cancelled after
# each of the three creation steps.
1977 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1978 cancel_lru_locks mdc
1979 cancel_lru_locks osc
1981 createmany -o $DIR/$tdir/f 1
1982 cancel_lru_locks mdc
1983 cancel_lru_locks osc
1985 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
1987 error "(0) Fail to create PFL $DIR/$tdir/f1"
1988 cancel_lru_locks mdc
1989 cancel_lru_locks osc
1990 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Before repair, f0 and f1 are expected to report the 1048576-byte size of
# `guard`; the error messages label that size as "(wrong)".
1992 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1993 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
1994 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1995 [ $size -eq 1048576 ] ||
1996 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
1998 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
1999 [ $size -eq 1048576 ] ||
2000 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
2002 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
2005 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
2007 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2008 mdd.${MDT_DEV}.lfsck_layout |
2009 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2011 error "(3) unexpected status"
2014 local repaired=$($SHOW_LAYOUT |
2015 awk '/^repaired_multiple_referenced/ { print $2 }')
2016 [ $repaired -eq 2 ] ||
2017 error "(4) Fail to repair multiple references: $repaired"
# After repair, writing 2 MiB to f0 and then to f1 must leave the size of
# `guard` at 1048576 bytes.
2019 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
2020 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
2021 error "(5) Fail to write f0."
2022 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2023 [ $size -eq 1048576 ] ||
2024 error "(6) guard size should be 1048576, but got $size"
2026 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
2027 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
2028 error "(7) Fail to write f1."
2029 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2030 [ $size -eq 1048576 ] ||
2031 error "(8) guard size should be 1048576, but got $size"
2033 run_test 17 "LFSCK can repair multiple references"
# Add the "cache" flag to the local debug setting via lctl set_param; the
# command's stdout is discarded.
2035 $LCTL set_param debug=+cache > /dev/null
# Test 18a body (registered by the run_test line that closes this block).
# Scenario, as announced by the banner: the MDT-object still exists but its
# stripe information (layout EA) is lost, and layout LFSCK has to regenerate
# the missing entries.
# Flow: create a1/f1, optionally a2/f2 when MDSCOUNT >= 2, and the PFL file
# f3; remember their sizes; set fail_loc=0x1615 while chown-ing them; check
# that the reported sizes changed; run "$START_LAYOUT -r -o"; wait for the
# MDTs and check the OSTs for status "completed"; then verify the
# repaired_orphan counters and that the saved sizes are reported again.
2039 echo "The target MDT-object is there, but related stripe information"
2040 echo "is lost or partly lost. The LFSCK should regenerate the missing"
2041 echo "layout EA entries."
# a1 is created with "lfs mkdir -i 0" and given a 1-stripe layout at index 0;
# f1 receives 2 x 1M of zeroes.
2044 check_mount_and_prep
2045 $LFS mkdir -i 0 $DIR/$tdir/a1
2046 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2047 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# Field 6 of "ls -il" is kept as the reference size of f1 (and is also the
# value f2 is compared against further down).
2049 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
# FID and layout are printed for the test log only.
2051 $LFS path2fid $DIR/$tdir/a1/f1
2052 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDTs, a second directory is created with "-i 1" and a
# 2-stripe, 1M stripe-size layout.
2054 if [ $MDSCOUNT -ge 2 ]; then
2055 $LFS mkdir -i 1 $DIR/$tdir/a2
2056 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2057 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2058 $LFS path2fid $DIR/$tdir/a2/f2
2059 $LFS getstripe $DIR/$tdir/a2/f2
# f3 is a two-component PFL file: [0, 1M) and [1M, EOF), both with 1M stripes.
2062 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2063 error "(0) Fail to create PFL $DIR/$tdir/f3"
2065 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2067 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2069 $LFS path2fid $DIR/$tdir/f3
2070 $LFS getstripe $DIR/$tdir/f3
2072 cancel_lru_locks osc
# The chown calls below run while fail_loc 0x1615 is set on the MDS; per the
# banner this makes the MDT-object lose its layout EA.
2074 echo "Inject failure, to make the MDT-object lost its layout EA"
2075 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2076 do_facet mds1 $LCTL set_param fail_loc=0x1615
2077 chown 1.1 $DIR/$tdir/a1/f1
2079 if [ $MDSCOUNT -ge 2 ]; then
2080 do_facet mds2 $LCTL set_param fail_loc=0x1615
2081 chown 1.1 $DIR/$tdir/a2/f2
2084 chown 1.1 $DIR/$tdir/f3
# Clear the fault injection on every MDS that had it set.
2089 do_facet mds1 $LCTL set_param fail_loc=0
2090 if [ $MDSCOUNT -ge 2 ]; then
2091 do_facet mds2 $LCTL set_param fail_loc=0
2094 cancel_lru_locks mdc
2095 cancel_lru_locks osc
# Pre-repair check: each damaged file must now report a size different from
# the one saved before the injection.
2097 echo "The file size should be incorrect since layout EA is lost"
2098 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2099 [ "$cur_size" != "$saved_size1" ] ||
2100 error "(1) Expect incorrect file1 size"
2102 if [ $MDSCOUNT -ge 2 ]; then
2103 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2104 [ "$cur_size" != "$saved_size1" ] ||
2105 error "(2) Expect incorrect file2 size"
2108 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2109 [ "$cur_size" != "$saved_size2" ] ||
2110 error "(1.2) Expect incorrect file3 size"
2112 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2113 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2115 for k in $(seq $MDSCOUNT); do
2116 # The LFSCK status query internal is 30 seconds. For the case
2117 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2118 # time to guarantee the status sync up.
2119 wait_update_facet mds${k} "$LCTL get_param -n \
2120 mdd.$(facet_svc mds${k}).lfsck_layout |
2121 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2122 error "(4) MDS${k} is not the expected 'completed'"
# OST status is read once per OST (no waiting) and must already be
# "completed".
2125 for k in $(seq $OSTCOUNT); do
2126 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2127 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2128 awk '/^status/ { print $2 }')
2129 [ "$cur_status" == "completed" ] ||
2130 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# repaired_orphan must be 3 on mds1 and, when present, 2 on mds2.
2133 local repaired=$(do_facet mds1 $LCTL get_param -n \
2134 mdd.$(facet_svc mds1).lfsck_layout |
2135 awk '/^repaired_orphan/ { print $2 }')
2136 [ $repaired -eq 3 ] ||
2137 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2139 if [ $MDSCOUNT -ge 2 ]; then
2140 repaired=$(do_facet mds2 $LCTL get_param -n \
2141 mdd.$(facet_svc mds2).lfsck_layout |
2142 awk '/^repaired_orphan/ { print $2 }')
2143 [ $repaired -eq 2 ] ||
2144 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Post-repair FIDs and layouts, printed for the test log only.
2147 $LFS path2fid $DIR/$tdir/a1/f1
2148 $LFS getstripe $DIR/$tdir/a1/f1
2150 if [ $MDSCOUNT -ge 2 ]; then
2151 $LFS path2fid $DIR/$tdir/a2/f2
2152 $LFS getstripe $DIR/$tdir/a2/f2
2155 $LFS path2fid $DIR/$tdir/f3
2156 $LFS getstripe $DIR/$tdir/f3
# Post-repair check: every file must report its saved size again.
2158 echo "The file size should be correct after layout LFSCK scanning"
2159 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2160 [ "$cur_size" == "$saved_size1" ] ||
2161 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2163 if [ $MDSCOUNT -ge 2 ]; then
2164 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2165 [ "$cur_size" == "$saved_size1" ] ||
2166 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is about f3, so the failure message names file3.
2169 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2170 [ "$cur_size" == "$saved_size2" ] ||
2171 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2173 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test 18b body (registered by the run_test line that closes this block).
# Scenario, as announced by the banner: the MDT-object itself is lost; layout
# LFSCK must re-create it under .lustre/lost+found/MDTxxxx, from where it can
# be moved back into the namespace by hand.
# Flow: create a1/f1, optionally a2/f2, and the PFL file f3, remembering
# sizes and FIDs; remove f1 (and f2) while fail_loc=0x1616 is set; run LFSCK
# once with --dryrun and check that orphans are only counted; run it again
# for real; move the "<fid>-R-0" entries back and compare sizes.
# The test is skipped when $FILESET is non-empty.
2176 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2179 echo "The target MDT-object is lost. The LFSCK should re-create the"
2180 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2181 echo "can move it back to normal namespace manually."
2184 check_mount_and_prep
2185 $LFS mkdir -i 0 $DIR/$tdir/a1
2186 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2187 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# Field 6 of "ls -il" is the reference size; the FID is needed later to name
# the entry expected under lost+found.
2188 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2189 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2191 $LFS getstripe $DIR/$tdir/a1/f1
2193 if [ $MDSCOUNT -ge 2 ]; then
2194 $LFS mkdir -i 1 $DIR/$tdir/a2
2195 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2196 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
# NOTE(review): fid2 is assigned without "local" on this line; presumably it
# is declared on a line not visible here -- confirm.
2197 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2199 $LFS getstripe $DIR/$tdir/a2/f2
# f3 is a two-component PFL file: [0, 1M) and [1M, EOF), both with 1M stripes.
2202 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2203 error "(0) Fail to create PFL $DIR/$tdir/f3"
2205 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2207 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2208 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2210 $LFS getstripe $DIR/$tdir/f3
2212 cancel_lru_locks osc
# The rm calls below run while fail_loc 0x1616 is set on the MDS; per the
# banner this simulates a missing MDT-object.
2214 echo "Inject failure, to simulate the case of missing the MDT-object"
2215 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2216 do_facet mds1 $LCTL set_param fail_loc=0x1616
2217 rm -f $DIR/$tdir/a1/f1
2219 if [ $MDSCOUNT -ge 2 ]; then
2220 do_facet mds2 $LCTL set_param fail_loc=0x1616
2221 rm -f $DIR/$tdir/a2/f2
# Clear the fault injection on every MDS that had it set.
2229 do_facet mds1 $LCTL set_param fail_loc=0
2230 if [ $MDSCOUNT -ge 2 ]; then
2231 do_facet mds2 $LCTL set_param fail_loc=0
2234 cancel_lru_locks mdc
2235 cancel_lru_locks osc
2237 # dryrun mode only checks for orphans, it does not repair them
2238 echo "Trigger layout LFSCK --dryrun to find out orphan OST-object"
2239 $START_LAYOUT --dryrun -o -r ||
2240 error "Fail to start layout LFSCK in dryrun mode"
2241 wait_all_targets_blocked layout completed 2
# The reported "param" line must list exactly the options passed above.
2243 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
2244 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
2245 error "Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# In dryrun mode the orphans are only counted (inconsistent_orphan).
2247 local orphans=$(do_facet mds1 $LCTL get_param -n \
2248 mdd.$(facet_svc mds1).lfsck_layout |
2249 awk '/^inconsistent_orphan/ { print $2 }')
2250 [ $orphans -eq 3 ] ||
2251 error "Expect 3 found on mds1, but got: $orphans"
2253 # orphan parents should not be created
2255 for subdir in $MOUNT/.lustre/lost+found/*; do
2256 [ ! "$(ls -A $subdir)" ] || error "$subdir not empty"
# Second pass, without --dryrun, so repairs are actually performed.
2259 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2260 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2262 for k in $(seq $MDSCOUNT); do
2263 # The LFSCK status query internal is 30 seconds. For the case
2264 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2265 # time to guarantee the status sync up.
2266 wait_update_facet mds${k} "$LCTL get_param -n \
2267 mdd.$(facet_svc mds${k}).lfsck_layout |
2268 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2269 error "(2) MDS${k} is not the expected 'completed'"
# OST status is read once per OST (no waiting) and must already be
# "completed".
2272 for k in $(seq $OSTCOUNT); do
2273 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2274 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2275 awk '/^status/ { print $2 }')
2276 [ "$cur_status" == "completed" ] ||
2277 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# repaired_orphan must be 3 on mds1 and, when present, 2 on mds2.
2280 local repaired=$(do_facet mds1 $LCTL get_param -n \
2281 mdd.$(facet_svc mds1).lfsck_layout |
2282 awk '/^repaired_orphan/ { print $2 }')
2283 [ $repaired -eq 3 ] ||
2284 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2286 if [ $MDSCOUNT -ge 2 ]; then
2287 repaired=$(do_facet mds2 $LCTL get_param -n \
2288 mdd.$(facet_svc mds2).lfsck_layout |
2289 awk '/^repaired_orphan/ { print $2 }')
2290 [ $repaired -eq 2 ] ||
2291 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# The re-created objects are expected under lost+found as "<fid>-R-0".
2294 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2295 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2296 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2298 if [ $MDSCOUNT -ge 2 ]; then
2299 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2300 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Distinct failure tag so an f3 move failure cannot be mistaken for the f1
# move above.
2303 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2304 error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
# FIDs and layouts after the move, printed for the test log only.
2306 $LFS path2fid $DIR/$tdir/a1/f1
2307 $LFS getstripe $DIR/$tdir/a1/f1
2309 if [ $MDSCOUNT -ge 2 ]; then
2310 $LFS path2fid $DIR/$tdir/a2/f2
2311 $LFS getstripe $DIR/$tdir/a2/f2
2314 $LFS path2fid $DIR/$tdir/f3
2315 $LFS getstripe $DIR/$tdir/f3
# Every restored file must report the size saved before the injection.
2317 echo "The file size should be correct after layout LFSCK scanning"
2318 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2319 [ "$cur_size" == "$saved_size1" ] ||
2320 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2322 if [ $MDSCOUNT -ge 2 ]; then
2323 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2324 [ "$cur_size" == "$saved_size1" ] ||
2325 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is about f3, so the failure message names file3.
2328 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2329 [ "$cur_size" == "$saved_size2" ] ||
2330 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2332 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test 18c body (registered by the run_test line that closes this block).
# Scenario, as announced by the banner: the MDT-object is lost and the
# OST-object carries no parent FID, so layout LFSCK must re-create the
# MDT-object with a new FID under .lustre/lost+found/MDTxxxx.
# Flow: write the files while fail_loc=0x1617 is set on the OST nodes, remove
# a1/f1 (and a2/f2) while fail_loc=0x1616 is set on the MDS, run
# "$START_LAYOUT -r -o", check the repaired_orphan counters, then look for
# "*-N-*" entries under lost+found.
# The test is skipped when $FILESET is non-empty.
2335 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2338 echo "The target MDT-object is lost, and the OST-object FID is missing."
2339 echo "The LFSCK should re-create the MDT-object with new FID under the "
2340 echo "directory .lustre/lost+found/MDTxxxx."
2343 check_mount_and_prep
2344 $LFS mkdir -i 0 $DIR/$tdir/a1
2345 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
# Every file written between here and the fail_loc=0 reset below is created
# with the fault active on all OST nodes.
2347 echo "Inject failure, to simulate the case of missing parent FID"
2348 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2349 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2351 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2352 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDTs, a2 is created with "lfs mkdir -i 1"; its files use
# the same 1-stripe layout starting at index 0 as a1.
2354 if [ $MDSCOUNT -ge 2 ]; then
2355 $LFS mkdir -i 1 $DIR/$tdir/a2
2356 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2357 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2358 $LFS getstripe $DIR/$tdir/a2/f2
# f3 is a two-component PFL file: [0, 1M) and [1M, EOF), both with 1M stripes.
2361 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2362 error "(0) Fail to create PFL $DIR/$tdir/f3"
2364 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2365 $LFS getstripe $DIR/$tdir/f3
2367 cancel_lru_locks osc
2368 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Second injection: the rm calls run while fail_loc 0x1616 is set on the MDS.
2370 echo "Inject failure, to simulate the case of missing the MDT-object"
2371 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2372 do_facet mds1 $LCTL set_param fail_loc=0x1616
2373 rm -f $DIR/$tdir/a1/f1
2375 if [ $MDSCOUNT -ge 2 ]; then
2376 do_facet mds2 $LCTL set_param fail_loc=0x1616
2377 rm -f $DIR/$tdir/a2/f2
# Clear the fault injection on every MDS that had it set.
2385 do_facet mds1 $LCTL set_param fail_loc=0
2386 if [ $MDSCOUNT -ge 2 ]; then
2387 do_facet mds2 $LCTL set_param fail_loc=0
2390 cancel_lru_locks mdc
2391 cancel_lru_locks osc
2393 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2394 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2396 for k in $(seq $MDSCOUNT); do
2397 # The LFSCK status query internal is 30 seconds. For the case
2398 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2399 # time to guarantee the status sync up.
2400 wait_update_facet mds${k} "$LCTL get_param -n \
2401 mdd.$(facet_svc mds${k}).lfsck_layout |
2402 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2403 error "(2) MDS${k} is not the expected 'completed'"
# OST status is read once per OST (no waiting) and must already be
# "completed".
2406 for k in $(seq $OSTCOUNT); do
2407 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2408 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2409 awk '/^status/ { print $2 }')
2410 [ "$cur_status" == "completed" ] ||
2411 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): $expected is assigned on lines not visible in this listing
# (presumably inside this MDSCOUNT branch) -- confirm before relying on it.
2414 if [ $MDSCOUNT -ge 2 ]; then
2420 local repaired=$(do_facet mds1 $LCTL get_param -n \
2421 mdd.$(facet_svc mds1).lfsck_layout |
2422 awk '/^repaired_orphan/ { print $2 }')
2423 [ $repaired -eq $expected ] ||
2424 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# mds2, when present, must report no repaired orphans.
2426 if [ $MDSCOUNT -ge 2 ]; then
2427 repaired=$(do_facet mds2 $LCTL get_param -n \
2428 mdd.$(facet_svc mds2).lfsck_layout |
2429 awk '/^repaired_orphan/ { print $2 }')
2430 [ $repaired -eq 0 ] ||
2431 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2434 ls -ail $MOUNT/.lustre/lost+found/
# The -name patterns below are quoted so that find, not the shell, matches
# them; unquoted, a file named like "*-N-*" in the current directory would
# replace the pattern before find ever saw it.
2436 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2437 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2438 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
2440 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2443 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2444 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2445 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2447 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
2448 [ ! -z "$cname" ] ||
2449 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2451 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d body (registered by the run_test line that closes this block).
# Scenario, as announced by the banner: the layout EA of the MDT-object is
# corrupted while the right OST-object survives as an orphan; layout LFSCK
# must not create a new OST-object for that slot.
# Flow: create a1/f1..f4 (f4 is a PFL file) and remember the sizes of f2 and
# f4; with fail_loc=0x1618 set, chown f2/f4 and remove f1/f3; run setupall;
# run "$START_LAYOUT -r -o -c -d"; expect repaired_orphan == 2 and
# repaired_dangling == 0 on $SINGLEMDS, with f2/f4 reporting the saved sizes.
2455 echo "The target MDT-object layout EA is corrupted, but the right"
2456 echo "OST-object is still alive as orphan. The layout LFSCK will"
2457 echo "not create new OST-object to occupy such slot."
2460 check_mount_and_prep
2462 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2463 echo "guard" > $DIR/$tdir/a1/f1
2464 echo "foo" > $DIR/$tdir/a1/f2
2466 echo "guard" > $DIR/$tdir/a1/f3
# f4 is a two-component PFL file: [0, 1M) and [1M, EOF), both with 1M stripes.
2467 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2468 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2469 echo "foo" > $DIR/$tdir/a1/f4
# Field 6 of "ls -il" is kept as the reference size of f2 and f4.
2471 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2472 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# FIDs and layouts of all four files, printed for the test log only.
2473 $LFS path2fid $DIR/$tdir/a1/f1
2474 $LFS getstripe $DIR/$tdir/a1/f1
2475 $LFS path2fid $DIR/$tdir/a1/f2
2476 $LFS getstripe $DIR/$tdir/a1/f2
2477 $LFS path2fid $DIR/$tdir/a1/f3
2478 $LFS getstripe $DIR/$tdir/a1/f3
2479 $LFS path2fid $DIR/$tdir/a1/f4
2480 $LFS getstripe $DIR/$tdir/a1/f4
2481 cancel_lru_locks osc
2483 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2484 echo "to reference the same OST-object (which is f1's OST-obejct)."
2485 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2486 echo "dangling reference case, but f2's old OST-object is there."
2488 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2489 echo "to reference the same OST-object (which is f3's OST-obejct)."
2490 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2491 echo "dangling reference case, but f4's old OST-object is there."
# The chown and rm calls below run while fail_loc 0x1618 is set on
# $SINGLEMDS.
2494 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2495 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2496 chown 1.1 $DIR/$tdir/a1/f2
2497 chown 1.1 $DIR/$tdir/a1/f4
2498 rm -f $DIR/$tdir/a1/f1
2499 rm -f $DIR/$tdir/a1/f3
2502 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): only setupall is visible here; the stopall announced by the
# banner is presumably on a line missing from this listing -- confirm.
2504 echo "stopall to cleanup object cache"
2507 setupall > /dev/null
2509 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2510 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
2512 for k in $(seq $MDSCOUNT); do
2513 # The LFSCK status query internal is 30 seconds. For the case
2514 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2515 # time to guarantee the status sync up.
2516 wait_update_facet mds${k} "$LCTL get_param -n \
2517 mdd.$(facet_svc mds${k}).lfsck_layout |
2518 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2519 error "(3) MDS${k} is not the expected 'completed'"
# OST status is read once per OST (no waiting) and must already be
# "completed".
2522 for k in $(seq $OSTCOUNT); do
2523 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2524 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2525 awk '/^status/ { print $2 }')
2526 [ "$cur_status" == "completed" ] ||
2527 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Two orphans must have been repaired ...
2530 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2531 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2532 awk '/^repaired_orphan/ { print $2 }')
2533 [ $repaired -eq 2 ] ||
2534 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
# ... and no dangling reference may be counted as repaired.
2536 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2537 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2538 awk '/^repaired_dangling/ { print $2 }')
2539 [ $repaired -eq 0 ] ||
2540 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
2542 echo "The file size should be correct after layout LFSCK scanning"
2543 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2544 [ "$cur_size" == "$saved_size1" ] ||
2545 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2547 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2548 [ "$cur_size" == "$saved_size2" ] ||
2549 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# Content, FID and layout are printed for the test log; the content itself
# is not compared here.
2551 echo "The LFSCK should find back the original data."
2552 cat $DIR/$tdir/a1/f2
2553 $LFS path2fid $DIR/$tdir/a1/f2
2554 $LFS getstripe $DIR/$tdir/a1/f2
2555 cat $DIR/$tdir/a1/f4
2556 $LFS path2fid $DIR/$tdir/a1/f4
2557 $LFS getstripe $DIR/$tdir/a1/f4
2559 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e body (registered by the run_test line that closes this block).
# Scenario, as announced by the banner: while repairing a dangling reference
# LFSCK creates a new OST-object in the layout slot; that object is then
# modified by a client, so the old orphan OST-object must be attached to a
# new file under lost+found instead of replacing the new data.
# Flow: same setup and fail_loc=0x1618 injection as in 18d; LFSCK is started
# with fail_val=10 fail_loc=0x1602 set on $SINGLEMDS; once mds1 reports
# "scanning-phase2", "dummy" is appended to f2/f4; after completion the test
# expects repaired_orphan == 2 and two "*-C-*" entries under
# lost+found/MDT0000 whose sizes match the saved f2/f4 sizes.
# The test is skipped when $FILESET is non-empty.
2562 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2565 echo "The target MDT-object layout EA slot is occpuied by some new"
2566 echo "created OST-object when repair dangling reference case. Such"
2567 echo "conflict OST-object has been modified by others. To keep the"
2568 echo "new data, the LFSCK will create a new file to refernece this"
2569 echo "old orphan OST-object."
2572 check_mount_and_prep
2574 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2575 echo "guard" > $DIR/$tdir/a1/f1
2576 echo "foo" > $DIR/$tdir/a1/f2
2578 echo "guard" > $DIR/$tdir/a1/f3
# f4 is a two-component PFL file: [0, 1M) and [1M, EOF), both with 1M stripes.
2579 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2580 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2581 echo "foo" > $DIR/$tdir/a1/f4
# Field 6 of "ls -il" is kept as the reference size of f2 and f4.
2583 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2584 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# FIDs and layouts of all four files, printed for the test log only.
2586 $LFS path2fid $DIR/$tdir/a1/f1
2587 $LFS getstripe $DIR/$tdir/a1/f1
2588 $LFS path2fid $DIR/$tdir/a1/f2
2589 $LFS getstripe $DIR/$tdir/a1/f2
2590 $LFS path2fid $DIR/$tdir/a1/f3
2591 $LFS getstripe $DIR/$tdir/a1/f3
2592 $LFS path2fid $DIR/$tdir/a1/f4
2593 $LFS getstripe $DIR/$tdir/a1/f4
2594 cancel_lru_locks osc
2596 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2597 echo "to reference the same OST-object (which is f1's OST-obejct)."
2598 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2599 echo "dangling reference case, but f2's old OST-object is there."
2601 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2602 echo "to reference the same OST-object (which is f3's OST-obejct)."
2603 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2604 echo "dangling reference case, but f4's old OST-object is there."
# The chown and rm calls below run while fail_loc 0x1618 is set on
# $SINGLEMDS.
2607 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2608 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2609 chown 1.1 $DIR/$tdir/a1/f2
2610 chown 1.1 $DIR/$tdir/a1/f4
2611 rm -f $DIR/$tdir/a1/f1
2612 rm -f $DIR/$tdir/a1/f3
2615 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): only setupall is visible here; the stopall announced by the
# banner is presumably on a line missing from this listing -- confirm.
2617 echo "stopall to cleanup object cache"
2620 setupall > /dev/null
# fail_val=10 with fail_loc 0x1602 stays set until the reset after the
# writes below, giving the test a window to modify f2/f4 during phase 2.
2622 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2623 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2625 start_full_debug_logging
2627 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2628 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait until mds1 reports "scanning-phase2" before touching the files.
2630 wait_update_facet mds1 "$LCTL get_param -n \
2631 mdd.$(facet_svc mds1).lfsck_layout |
2632 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2633 error "(3) MDS1 is not the expected 'scanning-phase2'"
2635 # to guarantee all updates are synced.
2639 echo "Write new data to f2/f4 to modify the new created OST-object."
2640 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2641 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
2643 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2645 for k in $(seq $MDSCOUNT); do
2646 # The LFSCK status query internal is 30 seconds. For the case
2647 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2648 # time to guarantee the status sync up.
2649 wait_update_facet mds${k} "$LCTL get_param -n \
2650 mdd.$(facet_svc mds${k}).lfsck_layout |
2651 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2652 error "(4) MDS${k} is not the expected 'completed'"
# OST status is read once per OST (no waiting) and must already be
# "completed".
2655 for k in $(seq $OSTCOUNT); do
2656 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2657 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2658 awk '/^status/ { print $2 }')
2659 [ "$cur_status" == "completed" ] ||
2660 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2663 stop_full_debug_logging
2665 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2666 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2667 awk '/^repaired_orphan/ { print $2 }')
2668 [ $repaired -eq 2 ] ||
2669 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2671 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2672 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2673 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Exactly two "*-C-*" entries are expected (one per damaged file).
2675 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2676 if [ $count -ne 2 ]; then
2677 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2678 error "(8) Expect 2 stubs under lost+found, but got $count"
# The -name patterns below are quoted so that find, not the shell, matches
# them; unquoted, a file named like "*-C-*" in the current directory would
# replace the pattern before find ever saw it.
# The first and the last match are checked; each must have one of the two
# saved sizes.
2681 echo "The stub file should keep the original f2 or f4 data"
2682 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | head -n 1)
2683 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2684 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2685 error "(9) Got unexpected $cur_size"
2688 $LFS path2fid $cname
2689 $LFS getstripe $cname
2691 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | tail -n 1)
2692 cur_size=$(ls -il $cname | awk '{ print $6 }')
2693 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2694 error "(10) Got unexpected $cur_size"
2697 $LFS path2fid $cname
2698 $LFS getstripe $cname
# Content, FID and layout of f2/f4 are printed for the test log; the content
# itself is not compared here.
2700 echo "The f2/f4 should contains new data."
2701 cat $DIR/$tdir/a1/f2
2702 $LFS path2fid $DIR/$tdir/a1/f2
2703 $LFS getstripe $DIR/$tdir/a1/f2
2704 cat $DIR/$tdir/a1/f4
2705 $LFS path2fid $DIR/$tdir/a1/f4
2706 $LFS getstripe $DIR/$tdir/a1/f4
2708 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f body (registered by the run_test line that closes this block).
# Scenario, as announced by the banner: MDT-objects are lost and one OST
# fails to verify objects during first-stage scanning; LFSCK must skip
# orphan handling for that OST only and leave the others unaffected.
# Flow: create 1-stripe and 2-stripe files on mds1 (and on mds2 when
# MDSCOUNT >= 2), remove them with fail_loc=0x1616 set, run LFSCK with
# fail_loc=0x161c set on mds1 and expect status "partial" on the MDTs and
# ost1 but "completed" on ost2, with 1 repaired orphan per MDT; then run
# LFSCK again without the fault and expect "completed" everywhere with 2
# repaired orphans per MDT.
# Skipped (and returns) when fewer than two OSTs are configured.
2711 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2714 echo "The target MDT-object is lost. The LFSCK should re-create the"
2715 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2716 echo "to verify some OST-object(s) during the first stage-scanning,"
2717 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2718 echo "should not be affected."
# a1 holds 1-stripe files (guard and f1); a2 holds the 2-stripe file f2.
# Both directories are created with "lfs mkdir -i 0".
2721 check_mount_and_prep
2722 $LFS mkdir -i 0 $DIR/$tdir/a1
2723 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2724 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2725 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2726 $LFS mkdir -i 0 $DIR/$tdir/a2
2727 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2728 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2729 $LFS getstripe $DIR/$tdir/a1/f1
2730 $LFS getstripe $DIR/$tdir/a2/f2
# With two or more MDTs the same pair of directories is repeated as a3/a4,
# created with "lfs mkdir -i 1".
2732 if [ $MDSCOUNT -ge 2 ]; then
2733 $LFS mkdir -i 1 $DIR/$tdir/a3
2734 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2735 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2736 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2737 $LFS mkdir -i 1 $DIR/$tdir/a4
2738 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2739 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2740 $LFS getstripe $DIR/$tdir/a3/f3
2741 $LFS getstripe $DIR/$tdir/a4/f4
2744 cancel_lru_locks osc
# The rm calls below run while fail_loc 0x1616 is set on the owning MDS; the
# guard files are left untouched.
2746 echo "Inject failure, to simulate the case of missing the MDT-object"
2747 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2748 do_facet mds1 $LCTL set_param fail_loc=0x1616
2749 rm -f $DIR/$tdir/a1/f1
2750 rm -f $DIR/$tdir/a2/f2
2752 if [ $MDSCOUNT -ge 2 ]; then
2753 do_facet mds2 $LCTL set_param fail_loc=0x1616
2754 rm -f $DIR/$tdir/a3/f3
2755 rm -f $DIR/$tdir/a4/f4
# Clear the fault injection on every MDS that had it set.
2761 do_facet mds1 $LCTL set_param fail_loc=0
2762 if [ $MDSCOUNT -ge 2 ]; then
2763 do_facet mds2 $LCTL set_param fail_loc=0
2766 cancel_lru_locks mdc
2767 cancel_lru_locks osc
# This fault stays set on mds1 for the whole first LFSCK run and is cleared
# only after the status checks below.
2769 echo "Inject failure, to simulate the OST0 fail to handle"
2770 echo "MDT0 LFSCK request during the first-stage scanning."
2771 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2772 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2774 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2775 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2777 for k in $(seq $MDSCOUNT); do
2778 # The LFSCK status query internal is 30 seconds. For the case
2779 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2780 # time to guarantee the status sync up.
2781 wait_update_facet mds${k} "$LCTL get_param -n \
2782 mdd.$(facet_svc mds${k}).lfsck_layout |
2783 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2784 error "(2) MDS${k} is not the expected 'partial'"
# ost1 is expected to end as "partial" ...
2787 wait_update_facet ost1 "$LCTL get_param -n \
2788 obdfilter.$(facet_svc ost1).lfsck_layout |
2789 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2790 error "(3) OST1 is not the expected 'partial'"
# ... while ost2 is expected to finish normally.
2793 wait_update_facet ost2 "$LCTL get_param -n \
2794 obdfilter.$(facet_svc ost2).lfsck_layout |
2795 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2796 error "(4) OST2 is not the expected 'completed'"
2799 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the partial run each MDT must report exactly one repaired orphan.
2801 local repaired=$(do_facet mds1 $LCTL get_param -n \
2802 mdd.$(facet_svc mds1).lfsck_layout |
2803 awk '/^repaired_orphan/ { print $2 }')
2804 [ $repaired -eq 1 ] ||
2805 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2807 if [ $MDSCOUNT -ge 2 ]; then
2808 repaired=$(do_facet mds2 $LCTL get_param -n \
2809 mdd.$(facet_svc mds2).lfsck_layout |
2810 awk '/^repaired_orphan/ { print $2 }')
2811 [ $repaired -eq 1 ] ||
2812 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second run, with no fault injected.
2815 echo "Trigger layout LFSCK on all devices again to cleanup"
2816 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2818 for k in $(seq $MDSCOUNT); do
2819 # The LFSCK status query internal is 30 seconds. For the case
2820 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2821 # time to guarantee the status sync up.
2822 wait_update_facet mds${k} "$LCTL get_param -n \
2823 mdd.$(facet_svc mds${k}).lfsck_layout |
2824 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2825 error "(8) MDS${k} is not the expected 'completed'"
# OST status is read once per OST (no waiting) and must already be
# "completed".
2828 for k in $(seq $OSTCOUNT); do
2829 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2830 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2831 awk '/^status/ { print $2 }')
2832 [ "$cur_status" == "completed" ] ||
2833 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# After the clean run each MDT must report two repaired orphans.
2837 local repaired=$(do_facet mds1 $LCTL get_param -n \
2838 mdd.$(facet_svc mds1).lfsck_layout |
2839 awk '/^repaired_orphan/ { print $2 }')
2840 [ $repaired -eq 2 ] ||
2841 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2843 if [ $MDSCOUNT -ge 2 ]; then
2844 repaired=$(do_facet mds2 $LCTL get_param -n \
2845 mdd.$(facet_svc mds2).lfsck_layout |
2846 awk '/^repaired_orphan/ { print $2 }')
2847 [ $repaired -eq 2 ] ||
2848 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2851 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Test 18g body (registered by the run_test line that closes this block).
# Scenario, as announced by the banner: the MDT-object is lost but its OI
# mapping remains; layout LFSCK must re-create the object without being
# misled by the stale mapping.
# Flow: create a1/f1 with "-c -1" and 1M stripes and write $OSTCOUNT MiB,
# remove it with fail_loc=0x162e set on mds1, run "$START_LAYOUT -r -o",
# expect repaired_orphan == $OSTCOUNT on mds1, then move the "<fid>-R-0"
# entry from lost+found/MDT0000 back to a1/f1.
# The test is skipped when $FILESET is non-empty.
2854 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2857 echo "The target MDT-object is lost, but related OI mapping is there"
2858 echo "The LFSCK should recreate the lost MDT-object without affected"
2859 echo "by the stale OI mapping."
2862 check_mount_and_prep
2863 $LFS mkdir -i 0 $DIR/$tdir/a1
2864 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2865 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
# The FID is needed later to name the entry expected under lost+found.
2866 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2868 $LFS getstripe $DIR/$tdir/a1/f1
2869 cancel_lru_locks osc
# The rm below runs while fail_loc 0x162e is set on mds1.
2871 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2872 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2873 do_facet mds1 $LCTL set_param fail_loc=0x162e
2874 rm -f $DIR/$tdir/a1/f1
2876 do_facet mds1 $LCTL set_param fail_loc=0
2877 cancel_lru_locks mdc
2878 cancel_lru_locks osc
2880 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2881 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2883 for k in $(seq $MDSCOUNT); do
2884 # The LFSCK status query internal is 30 seconds. For the case
2885 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2886 # time to guarantee the status sync up.
2887 wait_update_facet mds${k} "$LCTL get_param -n \
2888 mdd.$(facet_svc mds${k}).lfsck_layout |
2889 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2890 error "(2) MDS${k} is not the expected 'completed'"
# OST status is read once per OST (no waiting) and must already be
# "completed".
2893 for k in $(seq $OSTCOUNT); do
2894 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2895 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2896 awk '/^status/ { print $2 }')
2897 [ "$cur_status" == "completed" ] ||
2898 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# The repaired_orphan counter on mds1 must equal $OSTCOUNT.
2901 local repaired=$(do_facet mds1 $LCTL get_param -n \
2902 mdd.$(facet_svc mds1).lfsck_layout |
2903 awk '/^repaired_orphan/ { print $2 }')
2904 [ $repaired -eq $OSTCOUNT ] ||
2905 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
# The re-created object is expected under lost+found as "<fid>-R-0".
2907 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2908 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2909 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
# FID and layout after the move, printed for the test log only.
2911 $LFS path2fid $DIR/$tdir/a1/f1
2912 $LFS getstripe $DIR/$tdir/a1/f1
2914 run_test 18g "Find out orphan OST-object and repair it (7)"
# Test 18h body (registered by the run_test line that closes this block).
# Scenario, as announced by the banner: a PFL extent range is corrupted;
# LFSCK leaves the bad file alone in the first scanning cycle and rebuilds
# the extent in the second stage from the OST-objects reported as orphans.
# Flow: create the PFL file f0, fill it from test-framework.sh (from offset
# 0 and again at a 2M seek), copy it to "guard", chown f0 with
# fail_loc=0x162f set, check that a write to f0 now fails, run
# "$START_LAYOUT -r -o", expect repaired_orphan == 2, then check that f0
# still equals guard and is writable again.
2918 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
2919 echo "the layout LFSCK will keep the bad PFL file(s) there without"
2920 echo "scanning its OST-object(s). Then in the second stage scanning,"
2921 echo "the OST will return related OST-object(s) to the MDT as orphan."
2922 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
2923 echo "the 'orphan(s)' stripe information."
2926 check_mount_and_prep
# f0 has two components: [0, 2M) with 1M stripes and a stripe count of 1,
# and [2M, EOF).
2928 $LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
2929 error "(0) Fail to create PFL $DIR/$tdir/f0"
2931 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
2932 error "(1.1) Fail to write $DIR/$tdir/f0"
# seek=2 with bs=1M places the second copy at the 2M offset.
2934 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
2935 error "(1.2) Fail to write $DIR/$tdir/f0"
# guard is the reference copy used by the diff near the end of the test.
2937 cp $DIR/$tdir/f0 $DIR/$tdir/guard
# The chown below runs while fail_loc 0x162f is set on $SINGLEMDS.
2939 echo "Inject failure stub to simulate bad PFL extent range"
2940 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
2941 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
2943 chown 1.1 $DIR/$tdir/f0
2945 cancel_lru_locks mdc
2946 cancel_lru_locks osc
2947 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Negative check: the test fails if this write succeeds.
2949 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
2950 error "(2) Write to bad PFL file should fail"
2952 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
2953 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2955 for k in $(seq $MDSCOUNT); do
2956 # The LFSCK status query internal is 30 seconds. For the case
2957 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2958 # time to guarantee the status sync up.
2959 wait_update_facet mds${k} "$LCTL get_param -n \
2960 mdd.$(facet_svc mds${k}).lfsck_layout |
2961 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2962 error "(4.1) MDS${k} is not the expected 'completed'"
# OST status is read once per OST (no waiting) and must already be
# "completed".
2965 for k in $(seq $OSTCOUNT); do
2966 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2967 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2968 awk '/^status/ { print $2 }')
2969 [ "$cur_status" == "completed" ] ||
2970 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
# The repaired_orphan counter shown by $SHOW_LAYOUT must be 2.
2974 local repaired=$($SHOW_LAYOUT |
2975 awk '/^repaired_orphan/ { print $2 }')
2976 [ $repaired -eq 2 ] ||
2977 error "(5) Fail to repair crashed PFL range: $repaired"
2979 echo "Data in $DIR/$tdir/f0 should not be broken"
2980 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
2981 error "(6) Data in $DIR/$tdir/f0 is broken"
2983 echo "Write should succeed after LFSCK repairing the bad PFL range"
2984 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
2985 error "(7) Write should succeed after LFSCK"
2987 run_test 18h "LFSCK can repair crashed PFL extent range"
# Adjust the local debug parameter before the parent-FID self-check tests.
$LCTL set_param debug=-cache > /dev/null

# test_19a: with lfsck_verify_pfid enabled on OST0000 and the INVALID_PFID
# fail_loc armed locally, reads of a plain file (a0) and of a PFL file (a1)
# are both expected to be rejected.
#
# NOTE(review): the "test_19a() {" wrapper is reconstructed; the name is
# taken from the "run_test 19a" line below.
test_19a() {
	local workdir=$DIR/$tdir
	local pfid_knob=obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid

	check_mount_and_prep
	$LFS setstripe -c 1 -i 0 $workdir

	# Create the files while parent-FID verification is switched off.
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n $pfid_knob 0

	echo "foo1" > $workdir/a0
	if ! $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $workdir/a1; then
		error "(0) Fail to create PFL $DIR/$tdir/a1"
	fi
	echo "foo2" > $workdir/a1
	echo "guard" > $workdir/a2
	cancel_lru_locks osc

	echo "Inject failure, then client will offer wrong parent FID when read"
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n $pfid_knob 1

	#define OBD_FAIL_LFSCK_INVALID_PFID	0x1619
	$LCTL set_param fail_loc=0x1619

	echo "Read RPC with wrong parent FID should be denied"
	if cat $workdir/a0; then
		error "(3.1) Read a0 should be denied!"
	fi
	if cat $workdir/a1; then
		error "(3.2) Read a1 should be denied!"
	fi
	$LCTL set_param fail_loc=0
}
run_test 19a "OST-object inconsistency self detect"
# test_19b: create a plain file (f0) and a PFL file (f1) while the
# UNMATCHED_PAIR1 fail_loc is armed on the OSTs, then check the "repaired"
# counter of lfsck_verify_pfid on OST0000:
#   - it must be 0 while verification is disabled;
#   - after enabling verification and reading both files it must be 2
#     (one per file).
#
# NOTE(review): the "test_19b() {" wrapper is reconstructed; the name is
# taken from the "run_test 19b" line below.
test_19b() {
	local pfid_param=obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid
	local repaired

	check_mount_and_prep
	$LFS setstripe -c 1 -i 0 $DIR/$tdir

	echo "Inject failure stub to make the OST-object to back point to"
	echo "non-exist MDT-object"

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		$pfid_param 0

	#define OBD_FAIL_LFSCK_UNMATCHED_PAIR1	0x1611
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
	echo "foo1" > $DIR/$tdir/f0
	$LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
		error "(0) Fail to create PFL $DIR/$tdir/f1"
	echo "foo2" > $DIR/$tdir/f1
	cancel_lru_locks osc
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0

	# Written again on ost1 right before the counter is sampled.
	do_facet ost1 $LCTL set_param -n $pfid_param 0
	echo "Nothing should be fixed since self detect and repair is disabled"
	repaired=$(do_facet ost1 $LCTL get_param -n $pfid_param |
		   awk '/^repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(1) Expected 0 repaired, but got $repaired"

	echo "Read RPC with right parent FID should be accepted,"
	echo "and cause parent FID on OST to be fixed"

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		$pfid_param 1

	cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
	cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"

	repaired=$(do_facet ost1 $LCTL get_param -n $pfid_param |
		   awk '/^repaired/ { print $2 }')
	# Two files were read, so two repairs are expected; the message now
	# states the same number the condition checks.
	[ "$repaired" -eq 2 ] ||
		error "(3) Expected 2 repaired, but got $repaired"
}
run_test 19b "OST-object inconsistency self repair"
# Values that "lfs getstripe -L" output is compared against in the tests
# below: one for a layout flagged with a hole, one for a layout without.
PATTERN_WITH_HOLE="40000001"
PATTERN_WITHOUT_HOLE="raid0"

# test_20a: remove files under fail_loc/fail_val stubs so that the
# MDT-object (and, for f1..f3, one chosen OST-object) is lost, run the
# layout LFSCK, and then inspect every re-created file under
# .lustre/lost+found/MDT0000/: stripe pattern, stripe count, size, and
# which reads and writes succeed or fail.
#
# NOTE(review): the "test_20a() {" wrapper and the else/fi/done lines are
# reconstructed; the name is taken from the "run_test 20a" line below.
test_20a() {
	[ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
	# Return after skip, like the check above, so the body never runs
	# with FILESET set even if skip() returns to the caller.
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	local k
	local cur_status
	local bcount
	local fid3

	echo "The target MDT-object and some of its OST-object are lost."
	echo "The LFSCK should find out the left OST-objects and re-create"
	echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
	echo "with the partial OST-objects (LOV EA hole)."

	echo "New client can access the file with LOV EA hole via normal"
	echo "system tools or commands without crash the system."

	echo "For old client, even though it cannot access the file with"
	echo "LOV EA hole, it should not cause the system crash."

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/a1
	# NOTE(review): the bcount values are reconstructed from the block
	# layout described in the comment below (256 + 256 + 1 and 256 + 1)
	# and from the size checks (7.4.2) and (8.4) - confirm.
	if [ $OSTCOUNT -gt 2 ]; then
		$LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
		bcount=513
	else
		$LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
		bcount=257
	fi

	# 256 blocks on the stripe0.
	# 1 block on the stripe1 for 2 OSTs case.
	# 256 blocks on the stripe1 for other cases.
	# 1 block on the stripe2 if OSTs > 2
	dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount

	local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
	local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
	local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)

	$LFS getstripe $DIR/$tdir/a1/f0
	$LFS getstripe $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a1/f2

	if [ $OSTCOUNT -gt 2 ]; then
		dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
		fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
		$LFS getstripe $DIR/$tdir/a1/f3
	fi

	cancel_lru_locks osc

	echo "Inject failure..."
	echo "To simulate f0 lost MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/a1/f0

	echo "To simulate f1 lost MDT-object and OST-object0"
	#define OBD_FAIL_LFSCK_LOST_SPEOBJ	0x161a
	do_facet mds1 $LCTL set_param fail_loc=0x161a
	rm -f $DIR/$tdir/a1/f1

	echo "To simulate f2 lost MDT-object and OST-object1"
	do_facet mds1 $LCTL set_param fail_val=1
	rm -f $DIR/$tdir/a1/f2

	if [ $OSTCOUNT -gt 2 ]; then
		echo "To simulate f3 lost MDT-object and OST-object2"
		do_facet mds1 $LCTL set_param fail_val=2
		rm -f $DIR/$tdir/a1/f3
	fi

	umount_client $MOUNT

	do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(2) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
			     obdfilter.$(facet_svc ost${k}).lfsck_layout |
			     awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(3) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	if [ $OSTCOUNT -gt 2 ]; then
		[ $repaired -eq 9 ] ||
			error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
	else
		[ $repaired -eq 4 ] ||
			error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
	fi

	mount_client $MOUNT || error "(5.0) Fail to start client!"

	# Not referenced again inside this function; left in place because it
	# is assigned as a global.
	LOV_PATTERN_F_HOLE=0x40000000

	#
	# ${fid0}-R-0 is the old f0
	#
	local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
	echo "Check $name, which is the old f0"

	$LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"

	local pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(5.2) NOT expect pattern flag hole, but got $pattern"

	local stripes=$($LFS getstripe -c $name)
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(5.3.1) expect the stripe count is 3, but got $stripes"
	else
		[ $stripes -eq 2 ] ||
		error "(5.3.2) expect the stripe count is 2, but got $stripes"
	fi

	local size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(5.4) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null || error "(5.5) cannot read $name"

	echo "dummy" >> $name || error "(5.6) cannot write $name"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"

	touch $name || error "(5.8) cannot touch $name"

	rm -f $name || error "(5.9) cannot unlink $name"

	#
	# ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
	if [ $OSTCOUNT -gt 2 ]; then
		echo "Check $name, it contains the old f1's stripe1 and stripe2"
	else
		echo "Check $name, it contains the old f1's stripe1"
	fi

	$LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(6.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(6.3.1) expect the stripe count is 3, but got $stripes"
	else
		[ $stripes -eq 2 ] ||
		error "(6.3.2) expect the stripe count is 2, but got $stripes"
	fi

	size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(6.4) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null && error "(6.5) normal read $name should fail"

	# Copy with noerror so that every failed 4K block is counted.
	local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			 bs=4096 2>&1 | grep "Input/output error" | wc -l)

	[ $failures -eq 256 ] ||
		error "(6.6) expect 256 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(6.7) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
		error "(6.8) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
		error "(6.9) write to normal stripe should NOT fail"

	echo "foo" >> $name && error "(6.10) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"

	touch $name || error "(6.12) cannot touch $name"

	rm -f $name || error "(6.13) cannot unlink $name"

	#
	# ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
	if [ $OSTCOUNT -gt 2 ]; then
		echo "Check $name, it contains the old f2's stripe0 and stripe2"
	else
		echo "Check $name, it contains the old f2's stripe0"
	fi

	$LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(7.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	size=$(stat $name | awk '/Size:/ { print $2 }')
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(7.3.1) expect the stripe count is 3, but got $stripes"

		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.4.1) expect size $((4096 * $bcount)), but got $size"

		cat $name > /dev/null &&
			error "(7.5.1) normal read $name should fail"

		failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			   bs=4096 2>&1 | grep "Input/output error" | wc -l)

		[ $failures -eq 256 ] ||
			error "(7.6) expect 256 IO failures, but get $failures"

		size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.7) expect the size $((4096 * $bcount)), but got $size"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=300 && error "(7.8.0) write to the LOV EA hole should fail"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
		error "(7.8.1) write to normal stripe should NOT fail"

		echo "foo" >> $name &&
			error "(7.8.3) append write $name should fail"

		chown $RUNAS_ID:$RUNAS_GID $name ||
			error "(7.9.1) cannot chown on $name"

		touch $name || error "(7.10.1) cannot touch $name"
	else
		[ $stripes -eq 2 ] ||
		error "(7.3.2) expect the stripe count is 2, but got $stripes"

		# Only the 256 blocks of the first stripe contribute to the
		# size here.
		[ $size -eq $((4096 * (256 + 0))) ] ||
		error "(7.4.2) expect the size $((4096 * 256)), but got $size"

		cat $name > /dev/null &&
			error "(7.5.2) normal read $name should fail"

		failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			   bs=4096 2>&1 | grep "Input/output error" | wc -l)
		[ $failures -eq 256 ] ||
		error "(7.6.2) expect 256 IO failures, but get $failures"

		size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.7.2) expect the size $((4096 * $bcount)), got $size"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=256 && error "(7.8.2) write to the LOV EA hole should fail"

		chown $RUNAS_ID:$RUNAS_GID $name ||
			error "(7.9.2) cannot chown on $name"

		touch $name || error "(7.10.2) cannot touch $name"
	fi

	rm -f $name || error "(7.11) cannot unlink $name"

	# f3 only exists when more than two OSTs are available.
	[ $OSTCOUNT -le 2 ] && return

	#
	# ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
	echo "Check $name, which contains the old f3's stripe0 and stripe1"

	$LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	[ $stripes -eq 3 ] ||
		error "(8.3) expect the stripe count is 3, but got $stripes"

	size=$(stat $name | awk '/Size:/ { print $2 }')
	# The single block on the last stripe does not contribute to the
	# size here.
	[ $size -eq $((4096 * (256 + 256 + 0))) ] ||
		error "(8.4) expect the size $((4096 * 512)), but got $size"

	cat $name > /dev/null &&
		error "(8.5) normal read $name should fail"

	failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
		   bs=4096 2>&1 | grep "Input/output error" | wc -l)

	[ $failures -eq 256 ] ||
		error "(8.6) expect 256 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(8.7) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=512 && error "(8.8) write to the LOV EA hole should fail"

	chown $RUNAS_ID:$RUNAS_GID $name ||
		error "(8.9) cannot chown on $name"

	touch $name || error "(8.10) cannot touch $name"

	rm -f $name || error "(8.11) cannot unlink $name"
}
run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# Check the two-component PFL layout of a file re-created by LFSCK.
#   $1 - path of the file
#   $2 - numeric prefix for the error labels (7, 8 or 9)
#   $3 - pattern string expected from "lfs getstripe -L" on each component
#   $4 - "NOT " when no hole flag is expected, empty otherwise (wording only)
# Each component must have 2 stripes; the extents must be [0, 2097152) and
# [2097152, EOF).
_test_20b_check_layout() {
	local name=$1
	local tag=$2
	local expect=$3
	local wording=$4
	local comp pattern stripes e_start e_end

	$LFS getstripe -v $name || error "($tag.1) cannot getstripe on $name"

	for comp in 1 2; do
		pattern=$($LFS getstripe -L -I$comp $name)
		[[ "$pattern" = "$expect" ]] ||
		error "($tag.2.$comp) ${wording}expect pattern flag hole, but got $pattern"
	done

	for comp in 1 2; do
		stripes=$($LFS getstripe -c -I$comp $name)
		[ "$stripes" -eq 2 ] ||
			error "($tag.3.$comp) expect 2 stripes, but got $stripes"
	done

	e_start=$($LFS getstripe -I1 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ "$e_start" -eq 0 ] ||
		error "($tag.4.1) expect the COMP1 start at 0, got $e_start"

	e_end=$($LFS getstripe -I1 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" -eq 2097152 ] ||
		error "($tag.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ "$e_start" -eq 2097152 ] ||
		error "($tag.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "($tag.5.2) expect the COMP2 end at (EOF), got $e_end"
}

# Exercise I/O on a re-created file that has one lost stripe per component.
#   $1 - path of the file
#   $2 - numeric prefix for the error labels (8 or 9)
#   $3 - 4K block index (dd seek=) that falls into the lost stripe
#   $4 - 4K block index (dd seek=) that falls into a surviving stripe
#   $5 - number of 4K blocks originally written to the file
# Plain read, write into the hole and append must fail; a noerror copy must
# report 512 failed blocks; write to the surviving stripe, chown, touch and
# unlink must succeed. The file is removed at the end.
_test_20b_check_hole_io() {
	local name=$1
	local tag=$2
	local hole_blk=$3
	local good_blk=$4
	local bcount=$5
	local failures size

	cat $name > /dev/null &&
		error "($tag.7) normal read $name should fail"

	failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
		   bs=4096 2>&1 | grep "Input/output error" | wc -l)
	# One stripe in each COMP was lost
	[ "$failures" -eq 512 ] ||
		error "($tag.8) expect 512 IO failures, but get $failures"

	# dd skips over the failures, so the dump has the full length.
	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ "$size" -eq $((4096 * bcount)) ] ||
		error "($tag.9) expect the size $((4096 * bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=$hole_blk &&
		error "($tag.10) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=$good_blk ||
		error "($tag.11) write to normal stripe should NOT fail"

	echo "foo" >> $name &&
		error "($tag.12) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name ||
		error "($tag.13) cannot chown on $name"

	touch $name || error "($tag.14) cannot touch $name"

	rm -f $name || error "($tag.15) cannot unlink $name"
}

# test_20b: PFL variant of the LOV-EA-hole test. Three two-component PFL
# files lose their MDT-object (f0), plus the first (f1) or the second (f2)
# OST-object of each component; the layout LFSCK must report 8 repaired
# orphans, and the files re-created under .lustre/lost+found/MDT0000/ are
# then inspected.
#
# NOTE(review): the "test_20b() {" wrapper and the loop terminators are
# reconstructed; the name is taken from the "run_test 20b" line below.
test_20b() {
	[ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
	# Return after skip, like the check above, so the body never runs
	# with FILESET set even if skip() returns to the caller.
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	local k
	local cur_status

	echo "The target MDT-object and some of its OST-object are lost."
	echo "The LFSCK should find out the left OST-objects and re-create"
	echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
	echo "with the partial OST-objects (LOV EA hole)."

	echo "New client can access the file with LOV EA hole via normal"
	echo "system tools or commands without crash the system - PFL case."

	check_mount_and_prep

	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
		error "(0) Fail to create PFL file $DIR/$tdir/f0"
	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
		error "(1) Fail to create PFL file $DIR/$tdir/f1"
	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
		error "(2) Fail to create PFL file $DIR/$tdir/f2"

	local bcount=$((256 * 3 + 1))

	dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount

	local fid0=$($LFS path2fid $DIR/$tdir/f0)
	local fid1=$($LFS path2fid $DIR/$tdir/f1)
	local fid2=$($LFS path2fid $DIR/$tdir/f2)

	$LFS getstripe $DIR/$tdir/f0
	$LFS getstripe $DIR/$tdir/f1
	$LFS getstripe $DIR/$tdir/f2

	cancel_lru_locks mdc
	cancel_lru_locks osc

	# NOTE(review): the three "rm -f" lines below are reconstructed by
	# analogy with the injection sequence of test 20a - confirm.
	echo "Inject failure..."
	echo "To simulate f0 lost MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/f0

	echo "To simulate the case of f1 lost MDT-object and "
	echo "the first OST-object in each PFL component"
	#define OBD_FAIL_LFSCK_LOST_SPEOBJ	0x161a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
	rm -f $DIR/$tdir/f1

	echo "To simulate the case of f2 lost MDT-object and "
	echo "the second OST-object in each PFL component"
	do_facet $SINGLEMDS $LCTL set_param fail_val=1
	rm -f $DIR/$tdir/f2

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(4) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
			     obdfilter.$(facet_svc ost${k}).lfsck_layout |
			     awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(5) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	[ "$repaired" -eq 8 ] ||
		error "(6) Expect 8 fixed on mds1, but got: $repaired"

	#
	# ${fid0}-R-0 is the old f0
	#
	local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
	echo "Check $name, which is the old f0"

	_test_20b_check_layout "$name" 7 "$PATTERN_WITHOUT_HOLE" "NOT "

	local size=$(stat $name | awk '/Size:/ { print $2 }')
	[ "$size" -eq $((4096 * $bcount)) ] ||
		error "(7.6) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null || error "(7.7) cannot read $name"

	echo "dummy" >> $name || error "(7.8) cannot write $name"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"

	touch $name || error "(7.10) cannot touch $name"

	rm -f $name || error "(7.11) cannot unlink $name"

	#
	# ${fid1}-R-0 contains the old f1's second stripe in each COMP
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
	echo "Check $name, it contains f1's second OST-object in each COMP"

	_test_20b_check_layout "$name" 8 "$PATTERN_WITH_HOLE" ""

	size=$(stat $name | awk '/Size:/ { print $2 }')
	[ "$size" -eq $((4096 * $bcount)) ] ||
		error "(8.6) expect the size $((4096 * $bcount)), but got $size"

	# The first stripe in each COMP was lost: block 0 is in the hole,
	# block 300 is on the surviving stripe.
	_test_20b_check_hole_io "$name" 8 0 300 $bcount

	#
	# ${fid2}-R-0 contains the old f2's first stripe in each COMP
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
	echo "Check $name, it contains f2's first stripe in each COMP"

	_test_20b_check_layout "$name" 9 "$PATTERN_WITH_HOLE" ""

	size=$(stat $name | awk '/Size:/ { print $2 }')
	# The second stripe in COMP was lost, so we do not know there
	# have ever been some data before. 'stat' will regard it as
	# no data on the lost stripe.
	[ "$size" -eq $((4096 * $bcount)) ] ||
		error "(9.6) expect size $((4096 * $bcount)), but got $size"

	# The second stripe in each COMP was lost: block 300 is in the hole,
	# block 0 is on the surviving stripe.
	_test_20b_check_hole_io "$name" 9 300 0 $bcount
}
run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# test_21: start LFSCK on MDT0000 without naming a component and confirm
# that both the namespace and the layout component report
# 'scanning-phase1', then stop LFSCK again.
#
# NOTE(review): the "test_21() {" wrapper is reconstructed; the name is
# taken from the "run_test 21" line below.
test_21() {
	if [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]]
	then
		skip "ignore the test if MDS is older than 2.5.59" && return
	fi

	local mdt_dev=${FSNAME}-MDT0000
	local expected="scanning-phase1"
	local ns_state
	local layout_state

	check_mount_and_prep
	if ! createmany -o $DIR/$tdir/f 100; then
		error "(0) Fail to create 100 files"
	fi

	echo "Start all LFSCK components by default (-s 1)"
	if ! do_facet mds1 $LCTL lfsck_start -M $mdt_dev -s 1 -r; then
		error "Fail to start LFSCK"
	fi

	echo "namespace LFSCK should be in 'scanning-phase1' status"
	ns_state=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
	if [ "$ns_state" != "$expected" ]; then
		error "Expect namespace 'scanning-phase1', but got '$ns_state'"
	fi

	echo "layout LFSCK should be in 'scanning-phase1' status"
	layout_state=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
	if [ "$layout_state" != "$expected" ]; then
		error "Expect layout 'scanning-phase1', but got '$layout_state'"
	fi

	echo "Stop all LFSCK components by default"
	if ! do_facet mds1 $LCTL lfsck_stop -M $mdt_dev; then
		error "Fail to stop LFSCK"
	fi
}
run_test 21 "run all LFSCK components by default"
# test_22a: create foo/dummy on MDT0 while the BAD_PARENT fail_loc is armed,
# remove the "guard" directory, run the namespace LFSCK on all targets and
# check that exactly one unmatched pair was repaired and that the directory
# can be listed afterwards.
#
# NOTE(review): the "test_22a() {" wrapper is reconstructed; the name is
# taken from the "run_test 22a" line below.
test_22a() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	local repaired

	# The inner quotes are escaped so the banner really prints "..";
	# unescaped, they ended the string and a bare .. was printed.
	echo "The parent_A references the child directory via some name entry,"
	echo "but the child directory back references another parent_B via its"
	echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
	echo "LFSCK will repair the child directory's \"..\" name entry."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "The dummy's dotdot name entry references the guard."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	repaired=$($SHOW_NAMESPACE |
		   awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "'ls' should success after namespace LFSCK repairing"
	ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
		error "(8) ls should success."
}
run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b: create foo/dummy on MDT0 while the BAD_PARENT fail_loc is armed,
# check that fid2path does not resolve the dummy's FID to its real path,
# run the namespace LFSCK on all targets, and check that exactly one
# unmatched pair was repaired and that fid2path now returns the real path.
#
# NOTE(review): the "test_22b() {" wrapper is reconstructed; the name is
# taken from the "run_test 22b" line below.
test_22b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	local dummyfid
	local dummyname
	local repaired

	# The inner quotes are escaped so the banner really prints "..";
	# unescaped, they ended the string and a bare .. was printed.
	echo "The parent_A references the child directory via the name entry_B,"
	echo "but the child directory back references another parent_C via its"
	echo "\"..\" name entry. The parent_C exists, but there is no the name"
	echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
	echo "the child directory's \"..\" name entry and its linkEA."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "and bad linkEA. The dummy's dotdot name entry references the"
	echo "guard. The dummy's linkEA references n non-exist name entry."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
	echo "fid2path should NOT work on the dummy's FID $dummyfid"
	# The FID is quoted so that it always reaches lfs as one literal
	# argument, whatever characters it contains.
	dummyname=$($LFS fid2path $DIR "$dummyfid")
	[ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
		error "(4) fid2path works unexpectedly."

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	repaired=$($SHOW_NAMESPACE |
		   awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
	dummyname=$($LFS fid2path $DIR "$dummyfid")
	[ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
		error "(8) fid2path does not work"
}
run_test 22b "LFSCK can repair unmatched pairs (2)"
# test_23a: remove d0/d1 while the DANGLING2 fail_loc is armed on mds2 so
# that listing d1 fails, then run the namespace LFSCK twice. After the first
# run (no -C) one dangling entry is reported as repaired and 'ls' must still
# fail; after the second run (with -C) 'ls' must succeed.
#
# NOTE(review): the "test_23a() {" wrapper is reconstructed; the name is
# taken from the "run_test 23a" line below.
test_23a() {
	if [ $MDSCOUNT -lt 2 ]; then
		skip "needs >= 2 MDTs" && return
	fi

	local parent=$DIR/$tdir/d0
	local victim=$parent/d1
	local repaired

	echo "The name entry is there, but the MDT-object for such name "
	echo "entry does not exist. The namespace LFSCK should find out "
	echo "and repair the inconsistency as required."

	check_mount_and_prep

	if ! $LFS mkdir -i 0 $parent; then
		error "(1) Fail to mkdir d0 on MDT0"
	fi
	if ! $LFS mkdir -i 1 $victim; then
		error "(2) Fail to mkdir d1 on MDT1"
	fi

	echo "Inject failure stub on MDT1 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING2	0x1620
	do_facet mds2 $LCTL set_param fail_loc=0x1620
	if ! rmdir $victim; then
		error "(3) Fail to rmdir d1"
	fi
	do_facet mds2 $LCTL set_param fail_loc=0

	echo "'ls' should fail because of dangling name entry"
	if ls -ail $victim > /dev/null 2>&1; then
		error "(4) ls should fail."
	fi

	echo "Trigger namespace LFSCK to find out dangling name entry"
	if ! $START_NAMESPACE -A -r; then
		error "(5) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 6

	repaired=$($SHOW_NAMESPACE | awk '/^dangling_repaired/ { print $2 }')
	if ! [ $repaired -eq 1 ]; then
		error "(7) Fail to repair dangling name entry: $repaired"
	fi

	echo "'ls' should fail because not re-create MDT-object by default"
	if ls -ail $victim > /dev/null 2>&1; then
		error "(8) ls should fail."
	fi

	echo "Trigger namespace LFSCK again to repair dangling name entry"
	if ! $START_NAMESPACE -A -r -C; then
		error "(9) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 10

	repaired=$($SHOW_NAMESPACE | awk '/^dangling_repaired/ { print $2 }')
	if ! [ $repaired -eq 1 ]; then
		error "(11) Fail to repair dangling name entry: $repaired"
	fi

	echo "'ls' should success after namespace LFSCK repairing"
	if ! ls -ail $victim > /dev/null; then
		error "(12) ls should success."
	fi
}
run_test 23a "LFSCK can repair dangling name entry (1)"
# test_23b: dangling name entry that should have been a hard link.
#
# The hard link "foo" to f0 is created while fail_loc 0x1621
# (OBD_FAIL_LFSCK_DANGLING3) is armed with fail_val set to f1's OID; f1 is
# then unlinked and 'ls' of foo is expected to fail.  After a namespace LFSCK
# run with -r -C the test expects dangling_repaired == 1,
# multiple_linked_repaired == 1 and foo to read back f0's data ("dummy").
test_23b() {
	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. When the LFSCK"
	echo "comes to the second-stage scanning, it will find that the"
	echo "former re-creating object_C is not proper, and will try to"
	echo "replace the object_C with the real object_A."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
	$LFS path2fid $DIR/$tdir/d0

	createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"

	echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
	$LFS path2fid $DIR/$tdir/d0/f0

	echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
	$LFS path2fid $DIR/$tdir/d0/f1

	local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
	local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')

	if [ "$SEQ0" != "$SEQ1" ]; then
		# To guarantee that the f0 and f1 are in the same FID seq
		rm -f $DIR/$tdir/d0/f0 ||
			error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
		echo "dummy" > $DIR/$tdir/d0/f0 ||
			error "(3.2) Fail to touch on MDT0"
		$LFS path2fid $DIR/$tdir/d0/f0
	fi

	# fail_val is handed over in decimal.
	local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
	OID=$(printf %d $OID)

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
	ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	# If there is creation after the dangling injection, it may re-use
	# the just released local object (inode) that is referenced by the
	# dangling name entry. It will fail the dangling injection.
	# So before deleting the target object for the dangling name entry,
	# remove some other objects to avoid the target object being reused
	# by some potential creations. LU-7429
	unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"

	rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"

	echo "'ls' should fail because of dangling name entry"
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(6) ls should fail."

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -r -C ||
		error "(7) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(8) unexpected status"

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^dangling_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(9) Fail to repair dangling name entry: $repaired"

	repaired=$($SHOW_NAMESPACE |
		   awk '/^multiple_linked_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(10) Fail to drop the former created object: $repaired"

	local data=$(cat $DIR/$tdir/d0/foo)
	[ "$data" == "dummy" ] ||
		error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
}
run_test 23b "LFSCK can repair dangling name entry (2)"
# test_23c_cleanup: undo what test_23c leaves armed.
#
# Clears fail_val/fail_loc on $SINGLEMDS, waits for the namespace LFSCK
# status to read "completed", then stops the full debug logging that
# test_23c started.
test_23c_cleanup() {
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(10) unexpected status"

	stop_full_debug_logging
}

# test_23c: dangling name entry whose re-created object is modified before
# the LFSCK could replace it.
#
# Same injection as test_23b (fail_loc 0x1621 with fail_val = f1's OID), but
# LFSCK is started with fail_loc 0x1602 (OBD_FAIL_LFSCK_DELAY3) armed.  Once
# foo shows up with size 0 the test appends to it, lets LFSCK finish via
# test_23c_cleanup, and then expects dangling_repaired == 1 while foo must
# NOT hold f0's content.  If foo already reads "dummy" with size 6, that is
# accepted as a valid early repair and the test returns success.
test_23c() {
	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. And then others"
	echo "modified the re-created object_C. When the LFSCK comes to the"
	echo "second-stage scanning, it will find that the former re-creating"
	echo "object_C maybe wrong and try to replace the object_C with the"
	echo "real object_A. But because object_C has been modified, so the"
	echo "LFSCK cannot replace it."

	start_full_debug_logging

	check_mount_and_prep

	local parent_fid f0_fid f1_fid

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
	parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
	echo "parent_fid=$parent_fid"

	createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"

	echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
	f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
	echo "f0_fid=$f0_fid"

	echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
	f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
	echo "f1_fid=$f1_fid"

	# Compare the sequence part of the two FIDs, i.e. everything before
	# the first ':'.  (The variables are f0_fid/f1_fid; comparing unset
	# names would make this guard a no-op.)
	if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
		# To guarantee that the f0 and f1 are in the same FID seq
		rm -f $DIR/$tdir/d0/f0 ||
			error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
		echo "dummy" > $DIR/$tdir/d0/f0 ||
			error "(3.2) Fail to touch on MDT0"
		f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
		echo "f0_fid=$f0_fid (replaced)"
	fi

	# Second ':'-separated field of f1's FID.
	local oid=$(awk -F':' '{ print $2 }' <<< "$f1_fid")

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
	ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	# If there is creation after the dangling injection, it may re-use
	# the just released local object (inode) that is referenced by the
	# dangling name entry. It will fail the dangling injection.
	# So before deleting the target object for the dangling name entry,
	# remove some other objects to avoid the target object being reused
	# by some potential creations. LU-7429
	unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"

	rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"

	echo "'ls' should fail because of dangling name entry"
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(6) ls should fail."

	#define OBD_FAIL_LFSCK_DELAY3		0x1602
	do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -r -C ||
		error "(7) Fail to start LFSCK for namespace"

	wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
		# While unexpected by the test, it is valid for LFSCK to repair
		# the link to the original object before any data is written.
		local size=$(stat -c %s $DIR/$tdir/d0/foo)

		if [ "$size" = "6" ] &&
		   [ "$(<$DIR/$tdir/d0/foo)" = "dummy" ]; then
			log "LFSCK repaired file prematurely"
			test_23c_cleanup
			return 0
		fi

		stat $DIR/$tdir/d0/foo
		error "(8) unexpected size"
	}

	echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
	cancel_lru_locks osc

	# Drop the delay injection and wait for LFSCK to complete.
	test_23c_cleanup

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^dangling_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(11) Fail to repair dangling name entry: $repaired"

	local data=$(cat $DIR/$tdir/d0/foo)
	[ "$data" != "dummy" ] ||
		error "(12) The $DIR/$tdir/d0/foo should not be recovered"
}
run_test 23c "LFSCK can repair dangling name entry (3)"
# test_24: two MDT-objects claim the same name entry through their linkEA.
#
# With fail_loc 0x1622 (OBD_FAIL_LFSCK_MUL_REF) armed, d0/dummy/foo is
# created on MDT0 and its name entry removed again.  After a namespace LFSCK
# run the test expects multiple_referenced_repaired == 1 and an entry named
# "<cfid>-<pfid>-D-0" under .lustre/lost+found/MDT0000/.
# Skipped with fewer than two MDTs or when FILESET is set.
test_24() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	echo "Two MDT-objects back reference the same name entry via their"
	echo "each own linkEA entry, but the name entry only references one"
	echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
	echo "for the MDT-object that is not recognized. If such MDT-object"
	echo "has no other linkEA entry after the removing, then the LFSCK"
	echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"

	mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
	$LFS path2fid $DIR/$tdir/d0/guard

	mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
	$LFS path2fid $DIR/$tdir/d0/dummy

	# The parent FID expected in the orphan's name depends on the MDT
	# backend type.
	# NOTE(review): the reason for the difference is not visible here.
	local pfid
	if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
		pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
	else
		pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
	fi

	touch $DIR/$tdir/d0/guard/foo ||
		error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
	echo "that references $DIR/$tdir/d0/guard/foo."
	echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
	echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
	echo "there with the same linkEA entry as another MDT-object"
	echo "$DIR/$tdir/d0/guard/foo has"

	#define OBD_FAIL_LFSCK_MUL_REF		0x1622
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
	$LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
		error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
	$LFS path2fid $DIR/$tdir/d0/dummy/foo
	local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
	rmdir $DIR/$tdir/d0/dummy/foo ||
		error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "stat $DIR/$tdir/d0/dummy/foo should fail"
	stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
		error "(6) stat successfully unexpectedly"

	echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
	$START_NAMESPACE -A -r ||
		error "(7) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 8

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^multiple_referenced_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
	error "(9) Fail to repair multiple referenced name entry: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	local cname="$cfid-$pfid-D-0"
	ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
		error "(11) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 24 "LFSCK can repair multiple-referenced name entry"
# test_25: name entry carries the wrong file type.
#
# d0/foo is created while fail_loc 0x1623 (OBD_FAIL_LFSCK_BAD_TYPE) is armed
# on $SINGLEMDS.  After a namespace LFSCK run the test expects
# bad_file_type_repaired == 1 and 'ls' of d0 to succeed.
# Skipped unless the MDT backend is ldiskfs.
test_25() {
	[ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
		skip "ldiskfs only test" && return

	echo "The file type in the name entry does not match the file type"
	echo "claimed by the referenced object. Then the LFSCK will update"
	echo "the file type in the name entry."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "the file type stored in the name entry is wrong."

	#define OBD_FAIL_LFSCK_BAD_TYPE		0x1623
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
	touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to repair bad file type in the name entry"
	$START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(4) unexpected status"

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^bad_file_type_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
	error "(5) Fail to repair bad file type in name entry: $repaired"

	ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
}
run_test 25 "LFSCK can repair bad file type in the name entry"
# test_26a: lost local name entry.
#
# d0/foo gets a second hard link (dummy); foo is then unlinked while
# fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed on $SINGLEMDS.
# After a namespace LFSCK run the test expects lost_dirent_repaired == 1,
# 'ls' of d0/foo to succeed, and foo's FID to be unchanged.
test_26a() {
	echo "The local name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing local name entry back"
	echo "to the normal namespace."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)

	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(5) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing local name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $DIR/$tdir/d0/foo ||
		error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"

	# The restored entry must point at the very same object.
	local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
	[ "$foofid" == "$foofid2" ] ||
		error "(10) foo's FID changed: $foofid, $foofid2"
}
run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# test_26b: lost remote name entry.
#
# d0 is created on MDT1 and d0/foo on MDT0; foo is removed while fail_loc
# 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed on $SINGLEMDS.  After a
# namespace LFSCK run the test expects lost_dirent_repaired == 1, 'ls' of
# d0/foo to succeed, and foo's FID to be unchanged.
# Skipped with fewer than two MDTs.
test_26b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "The remote name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing remote name entry back"
	echo "to the normal namespace."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
	local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(4) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair lost dirent: $repaired"

	ls -ail $DIR/$tdir/d0/foo ||
		error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"

	# The restored entry must point at the very same object.
	local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
	[ "$foofid" == "$foofid2" ] ||
		error "(9) foo's FID changed: $foofid, $foofid2"
}
run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# test_27a: lost local parent directory.
#
# Both names of a file (foo, dummy) are unlinked while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed on $SINGLEMDS, then the parent d0
# is removed.  After a namespace LFSCK run the test expects
# lost_dirent_repaired == 1 and at least one "*-P-*" entry under
# .lustre/lost+found/MDT0000/.  Skipped when FILESET is set.
test_27a() {
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	echo "The local parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "another hard link and the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo ||
		error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	rm -f $DIR/$tdir/d0/dummy ||
		error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the lost parent"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	# Quote the pattern so find, not the shell, does the matching.
	local cname
	cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-P-*")
	[ -n "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# test_27b: lost remote parent directory.
#
# d0 is created on MDT1 and d0/foo on MDT0.  foo is removed while fail_loc
# 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed on $SINGLEMDS, then d0 is
# removed.  After a namespace LFSCK run the test expects
# lost_dirent_repaired == 1 and at least one "*-P-*" entry under
# .lustre/lost+found/MDT0001/.
# Skipped with fewer than two MDTs or when FILESET is set.
test_27b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	echo "The remote parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"

	$LFS path2fid $DIR/$tdir/d0

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "There should be an orphan under .lustre/lost+found/MDT0001/"
	[ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0001/

	# Quote the pattern so find, not the shell, does the matching.
	local cname
	cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-P-*")
	[ -n "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0001/ should not be empty"
}
run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# test_28: orphan handling is skipped for an MDT that failed verification.
#
# d1/a1 and d2/a2 are removed while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed on mds1 and mds2.  The first LFSCK
# run is started with fail_loc 0x161c (OBD_FAIL_LFSCK_BAD_NETWORK) armed on
# mds2; the test then expects mds1 to end "partial" with 0 lost dirents
# repaired and mds2 to end "completed" with 1.  A second, undisturbed run is
# expected to repair 1 on mds1 and 0 on mds2.
# Skipped with fewer than two MDTs.
test_28() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	# The last sentence is cut off after "should not" in the listing this
	# was restored from; "be affected." is the reviewer's completion.
	echo "The target name entry is lost. The LFSCK should insert the"
	echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
	echo "the MDT (on which the orphan MDT-object resides) has ever"
	echo "failed to respond some name entry verification during the"
	echo "first stage-scanning, then the LFSCK should skip to handle"
	echo "orphan MDT-object on this MDT. But other MDTs should not"
	echo "be affected."

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/d1
	$LFS mkdir -i 1 $DIR/$tdir/d1/a1
	$LFS mkdir -i 1 $DIR/$tdir/d1/a2

	$LFS mkdir -i 1 $DIR/$tdir/d2
	$LFS mkdir -i 0 $DIR/$tdir/d2/a1
	$LFS mkdir -i 0 $DIR/$tdir/d2/a2

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "d1/a1's name entry will be removed, but the d1/a1's object"
	echo "and its linkEA are kept in the system. And the case that"
	echo "d2/a2's name entry will be removed, but the d2/a2's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet mds1 $LCTL set_param fail_loc=0x1624
	do_facet mds2 $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
	rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
	do_facet mds1 $LCTL set_param fail_loc=0
	do_facet mds2 $LCTL set_param fail_loc=0

	cancel_lru_locks mdc
	cancel_lru_locks osc

	echo "Inject failure, to simulate the MDT0 fail to handle"
	echo "MDT1 LFSCK request during the first-stage scanning."
	#define OBD_FAIL_LFSCK_BAD_NETWORK	0x161c
	do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0

	echo "Trigger namespace LFSCK on all devices to find out orphan object"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_update_facet mds1 "$LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "partial" 32 ||
		error "(4) mds1 is not the expected 'partial'"

	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(5) mds2 is not the expected 'completed'"

	do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_namespace |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(6) Expect 0 fixed on mds1, but got: $repaired"

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Expect 1 fixed on mds2, but got: $repaired"

	echo "Trigger namespace LFSCK on all devices again to cleanup"
	$START_NAMESPACE -r -A ||
		error "(8) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 9

	repaired=$(do_facet mds1 $LCTL get_param -n \
		   mdd.$(facet_svc mds1).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(10) Expect 1 fixed on mds1, but got: $repaired"

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(11) Expect 0 fixed on mds2, but got: $repaired"
}
run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# test_29a: nlink larger than the number of known name entries.
#
# A hard link to d0/foo is created while fail_loc 0x1625
# (OBD_FAIL_LFSCK_MORE_NLINK) is armed on $SINGLEMDS, after which stat must
# report 3 links.  After a namespace LFSCK run the test expects
# nlinks_repaired == 1 and stat to report 2 links.
# This test is currently not registered; see the note below the function.
test_29a() {
	echo "The object's nlink attribute is larger than the object's known"
	echo "name entries count. The LFSCK will repair the object's nlink"
	echo "attribute to match the known name entries count"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"

	echo "Inject failure stub on MDT0 to simulate the case that foo's"
	echo "nlink attribute is larger than its name entries count."

	#define OBD_FAIL_LFSCK_MORE_NLINK	0x1625
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Drop cached metadata locks before reading the link count.
	cancel_lru_locks mdc
	local count=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ "$count" -eq 3 ] || error "(4) Cannot inject error: $count"

	echo "Trigger namespace LFSCK to repair the nlink count"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^nlinks_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair nlink count: $repaired"

	cancel_lru_locks mdc
	count=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ "$count" -eq 2 ] || error "(8) Fail to repair nlink count: $count"
}
# Disable 29a, we only allow nlink to be updated if the known linkEA
# entries is larger than nlink count.
#
#run_test 29a "LFSCK can repair bad nlink count (1)"
# test_29b: nlink smaller than the number of known name entries.
#
# A hard link to d0/foo is created while fail_loc 0x1626
# (OBD_FAIL_LFSCK_LESS_NLINK) is armed on $SINGLEMDS, after which stat must
# report 1 link.  After a namespace LFSCK run the test expects
# nlinks_repaired == 1 and stat to report 2 links.
test_29b() {
	echo "The object's nlink attribute is smaller than the object's known"
	echo "name entries count. The LFSCK will repair the object's nlink"
	echo "attribute to match the known name entries count"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"

	echo "Inject failure stub on MDT0 to simulate the case that foo's"
	echo "nlink attribute is smaller than its name entries count."

	#define OBD_FAIL_LFSCK_LESS_NLINK	0x1626
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Drop cached metadata locks before reading the link count.
	cancel_lru_locks mdc
	local count=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ "$count" -eq 1 ] || error "(4) Cannot inject error: $count"

	echo "Trigger namespace LFSCK to repair the nlink count"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^nlinks_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair nlink count: $repaired"

	cancel_lru_locks mdc
	count=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ "$count" -eq 2 ] || error "(8) Fail to repair nlink count: $count"
}
run_test 29b "LFSCK can repair bad nlink count (2)"
# test_29c: linkEA overflow mark and its removal by LFSCK.
#
# 150 hard links to guard/f0 are created under foo, then 100 are removed.
# With two or more MDTs, migrating guard to MDT1 is expected to fail (and
# f0's FID to stay the same) both before and after the removal.  After a
# namespace LFSCK run the test expects linkea_overflow_cleared == 1 and
# nlinks_repaired == 0; with two or more MDTs the migration must then
# succeed and change f0's FID.
test_29c() {
	echo "The namespace LFSCK will create many hard links to the target"
	echo "file as to exceed the linkEA size limitation. Under such case"
	echo "the linkEA will be marked as overflow that will prevent the"
	echo "target file to be migrated. Then remove some hard links to"
	echo "make the left hard links to be held within the linkEA size"
	echo "limitation. But before the namespace LFSCK adding all the"
	echo "missed linkEA entries back, the overflow mark (timestamp)"
	echo "will not be cleared."

	check_mount_and_prep

	mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
	$LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
		error "(0.2) Fail to mkdir"
	touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
	local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
	local newfid

	# define MAX_LINKEA_SIZE	4096
	# sizeof(link_ea_header) = 24
	# sizeof(link_ea_entry) = 18
	# nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
	#	      (sizeof(link_ea_entry) + name_length))
	# If the average name length is 12 bytes, then 150 hard links
	# is totally enough to overflow the linkEA
	echo "Create 150 hard links should succeed although the linkEA overflow"
	createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
		error "(2) Fail to hard link"

	cancel_lru_locks mdc
	if [ $MDSCOUNT -ge 2 ]; then
		$LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
			error "(3.1) Migrate should fail"

		echo "The object with linkEA overflow should NOT be migrated"
		newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
		[ "$newfid" == "$oldfid" ] ||
			error "(3.2) Migrate should fail: $newfid != $oldfid"
	fi

	# Remove 100 hard links, then the linkEA should have space
	# to hold the missed linkEA entries.
	echo "Remove 100 hard links to save space for the missed linkEA entries"
	unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
			error "(5.1) Migrate should fail"

		# The overflow timestamp is still there, so migration will fail.
		newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
		[ "$newfid" == "$oldfid" ] ||
			error "(5.2) Migrate should fail: $newfid != $oldfid"
	fi

	# sleep 3 seconds to guarantee that the overflow is recognized
	sleep 3

	echo "Trigger namespace LFSCK to clear the overflow timestamp"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^linkea_overflow_cleared/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to clear linkea overflow: $repaired"

	repaired=$($SHOW_NAMESPACE |
		   awk '/^nlinks_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(9) Unexpected nlink repaired: $repaired"

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
			error "(10.1) Migrate failure"

		# Migration should succeed after clear the overflow timestamp.
		newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
		[ "$newfid" != "$oldfid" ] ||
			error "(10.2) Migrate should succeed"

		ls -l $DIR/$tdir/foo > /dev/null ||
			error "(11) 'ls' failed after migration"
	fi

	rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
	rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
}
run_test 29c "verify linkEA size limitation"
# test_30: recover orphans that e2fsck put into the backend /lost+found.
#
# foo/d0 is created while fail_loc 0x161d (OBD_FAIL_LFSCK_NO_LINKEA) is
# armed; d0/d1, d0/f1, d0 and foo/f0 are then removed while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed.  The client is unmounted, the MDT
# is stopped, e2fsck -y is run on it and it is restarted.  After a
# namespace LFSCK run the test expects local_lost_found_moved >= 4, foo/f0
# to be back in place, and d0 (with d1 and f1 inside) to be found as
# "<cfid>-<pfid>-D-0" under .lustre/lost+found/MDT0000/.
# Skipped unless the MDT backend is ldiskfs, or when FILESET is set.
test_30() {
	[ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
		skip "ldiskfs only test" && return
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	echo "The namespace LFSCK will move the orphans from backend"
	echo "/lost+found directory to normal client visible namespace"
	echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
	touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "directory d0 has no linkEA entry, then the LFSCK will"
	echo "move it into .lustre/lost+found/MDTxxxx/ later."

	#define OBD_FAIL_LFSCK_NO_LINKEA	0x161d
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
	mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	local pfid=$($LFS path2fid $DIR/$tdir/foo)
	local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)

	touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
	mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "object's name entry will be removed, but not destroy the"
	echo "object. Then backend e2fsck will handle it as orphan and"
	echo "add them into the backend /lost+found directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
	rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
	rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
	rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	umount_client $MOUNT || error "(10) Fail to stop client!"

	stop $SINGLEMDS || error "(11) Fail to stop MDT0"

	run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
		error "(12) Fail to run e2fsck"

	start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
		error "(13) Fail to start MDT0"

	echo "Trigger namespace LFSCK to recover backend orphans"
	$START_NAMESPACE -r -A ||
		error "(14) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 15

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^local_lost_found_moved/ { print $2 }')
	[ "$repaired" -ge 4 ] ||
		error "(16) Fail to recover backend orphans: $repaired"

	mount_client $MOUNT || error "(17) Fail to start client!"

	stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	# Check that the orphan directory really exists; a non-empty-string
	# test on a path built from a literal prefix can never fail.
	local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
	[ -d "$cname" ] || error "(20) d0 is not recovered"

	stat ${cname}/d1 || error "(21) d1 is not recovered"
	stat ${cname}/f1 || error "(22) f1 is not recovered"
}
run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test_31a: the LFSCK can find/repair the name entry with bad name hash (1).
#
# Creates a striped directory starting at MDT index 0 with $MDSCOUNT
# stripes, then creates $MDSCOUNT sub-directories while fail_loc 0x1628
# (OBD_FAIL_LFSCK_BAD_NAME_HASH) with fail_val=0 is set locally via $LCTL.
# Afterwards the namespace LFSCK is run and name_hash_repaired, as read
# from the $SHOW_NAMESPACE output, must be >= 1.
# Skipped when fewer than 2 MDTs are configured.
4773 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4776 echo "For the name entry under a striped directory, if the name"
4777 echo "hash does not match the shard, then the LFSCK will repair"
4778 echo "the bad name entry"
4781 check_mount_and_prep
4783 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4784 error "(1) Fail to create striped directory"
4786 echo "Inject failure stub on client to simulate the case that"
4787 echo "some name entry should be inserted into other non-first"
4788 echo "shard, but inserted into the first shard by wrong"
4790 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4791 $LCTL set_param fail_loc=0x1628 fail_val=0
4792 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4793 error "(2) Fail to create file under striped directory"
4794 $LCTL set_param fail_loc=0 fail_val=0
4796 echo "Trigger namespace LFSCK to repair bad name hash"
4797 $START_NAMESPACE -r -A ||
4798 error "(3) Fail to start LFSCK for namespace"
4800 wait_all_targets_blocked namespace completed 4
4802 local repaired=$($SHOW_NAMESPACE |
4803 awk '/^name_hash_repaired/ { print $2 }')
4804 [ $repaired -ge 1 ] ||
4805 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before verifying (presumably to drop cached state --
# TODO confirm), then stat and remove every d$i created above.
4807 umount_client $MOUNT || error "(6) umount failed"
4808 mount_client $MOUNT || error "(7) mount failed"
4810 for ((i = 0; i < $MDSCOUNT; i++)); do
4811 stat $DIR/$tdir/striped_dir/d$i ||
4812 error "(8) Fail to stat d$i after LFSCK"
4813 rmdir $DIR/$tdir/striped_dir/d$i ||
4814 error "(9) Fail to unlink d$i after LFSCK"
# The striped directory must be empty by now for this rmdir to succeed.
4817 rmdir $DIR/$tdir/striped_dir ||
4818 error "(10) Fail to remove the striped directory after LFSCK"
4820 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test_31b: the LFSCK can find/repair the name entry with bad name hash (2).
#
# Same scenario as the fail_val=0 variant, but with fail_val=1 and
# MDSCOUNT * 5 sub-directories. The name_hash_repaired counter is read
# from the lfsck_namespace parameter on mds2 rather than from
# $SHOW_NAMESPACE, and must be >= 1.
# Skipped when fewer than 2 MDTs are configured.
4823 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4826 echo "For the name entry under a striped directory, if the name"
4827 echo "hash does not match the shard, then the LFSCK will repair"
4828 echo "the bad name entry"
4831 check_mount_and_prep
4833 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4834 error "(1) Fail to create striped directory"
4836 echo "Inject failure stub on client to simulate the case that"
4837 echo "some name entry should be inserted into other non-second"
4838 echo "shard, but inserted into the secod shard by wrong"
4840 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4841 $LCTL set_param fail_loc=0x1628 fail_val=1
4842 createmany -d $DIR/$tdir/striped_dir/d $((MDSCOUNT * 5)) ||
4843 error "(2) Fail to create file under striped directory"
4844 $LCTL set_param fail_loc=0 fail_val=0
4846 echo "Trigger namespace LFSCK to repair bad name hash"
4847 $START_NAMESPACE -r -A ||
4848 error "(3) Fail to start LFSCK for namespace"
4850 wait_all_targets_blocked namespace completed 4
# The counter is queried on mds2 here, not on $SINGLEMDS.
4852 local repaired=$(do_facet mds2 $LCTL get_param -n \
4853 mdd.$(facet_svc mds2).lfsck_namespace |
4854 awk '/^name_hash_repaired/ { print $2 }')
4855 echo "repaired $repaired name entries with bad hash"
4856 [ $repaired -ge 1 ] ||
4857 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client, then stat and remove every d$i created above.
4859 umount_client $MOUNT || error "(6) umount failed"
4860 mount_client $MOUNT || error "(7) mount failed"
4862 for ((i = 0; i < $((MDSCOUNT * 5)); i++)); do
4863 stat $DIR/$tdir/striped_dir/d$i ||
4864 error "(8) Fail to stat d$i after LFSCK"
4865 rmdir $DIR/$tdir/striped_dir/d$i ||
4866 error "(9) Fail to unlink d$i after LFSCK"
4869 rmdir $DIR/$tdir/striped_dir ||
4870 error "(10) Fail to remove the striped directory after LFSCK"
4872 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test_31c: re-generate the lost master LMV EA for striped directory.
#
# The striped directory is created while fail_loc 0x1629
# (OBD_FAIL_LFSCK_LOST_MASTER_LMV) is set on $SINGLEMDS. Nothing is
# created under it before the namespace LFSCK runs. The test then
# requires striped_dirs_repaired == 1 and, after a client remount, that
# the directory lists as empty and can be removed.
# Skipped when fewer than 2 MDTs are configured.
4875 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4878 echo "For some reason, the master MDT-object of the striped directory"
4879 echo "may lost its master LMV EA. If nobody created files under the"
4880 echo "master directly after the master LMV EA lost, then the LFSCK"
4881 echo "should re-generate the master LMV EA."
4884 check_mount_and_prep
4886 echo "Inject failure stub on MDT0 to simulate the case that the"
4887 echo "master MDT-object of the striped directory lost the LMV EA."
4889 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4890 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4891 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4892 error "(1) Fail to create striped directory"
4893 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4895 echo "Trigger namespace LFSCK to re-generate master LMV EA"
4896 $START_NAMESPACE -r -A ||
4897 error "(2) Fail to start LFSCK for namespace"
4899 wait_all_targets_blocked namespace completed 3
# Exactly one striped directory was damaged, so exactly one repair is expected.
4901 local repaired=$($SHOW_NAMESPACE |
4902 awk '/^striped_dirs_repaired/ { print $2 }')
4903 [ $repaired -eq 1 ] ||
4904 error "(4) Fail to re-generate master LMV EA: $repaired"
4906 umount_client $MOUNT || error "(5) umount failed"
4907 mount_client $MOUNT || error "(6) mount failed"
# Any output from ls is treated as a failed repair.
4909 local empty=$(ls $DIR/$tdir/striped_dir/)
4910 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
4912 rmdir $DIR/$tdir/striped_dir ||
4913 error "(8) Fail to remove the striped directory after LFSCK"
4915 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# test_31d: set broken striped directory (modified after broken) as read-only.
#
# The striped directory is created under fail_loc 0x1629
# (OBD_FAIL_LFSCK_LOST_MASTER_LMV) on $SINGLEMDS, and a file is then
# created inside it before the namespace LFSCK runs. The test requires
# striped_dirs_repaired == 0, that the existing file is still
# stat-able, and that creating a new file in the directory fails.
# Skipped when fewer than 2 MDTs are configured.
4918 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4921 echo "For some reason, the master MDT-object of the striped directory"
4922 echo "may lost its master LMV EA. If somebody created files under the"
4923 echo "master directly after the master LMV EA lost, then the LFSCK"
4924 echo "should NOT re-generate the master LMV EA, instead, it should"
4925 echo "change the broken striped dirctory as read-only to prevent"
4926 echo "further damage"
4929 check_mount_and_prep
4931 echo "Inject failure stub on MDT0 to simulate the case that the"
4932 echo "master MDT-object of the striped directory lost the LMV EA."
4934 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4935 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4936 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4937 error "(1) Fail to create striped directory"
4938 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# Remount, then modify the damaged directory before LFSCK gets to see it.
4940 umount_client $MOUNT || error "(2) umount failed"
4941 mount_client $MOUNT || error "(3) mount failed"
4943 touch $DIR/$tdir/striped_dir/dummy ||
4944 error "(4) Fail to touch under broken striped directory"
4946 echo "Trigger namespace LFSCK to find out the inconsistency"
4947 $START_NAMESPACE -r -A ||
4948 error "(5) Fail to start LFSCK for namespace"
4950 wait_all_targets_blocked namespace completed 6
# Zero repairs expected: the LMV EA must NOT be re-generated in this case.
4952 local repaired=$($SHOW_NAMESPACE |
4953 awk '/^striped_dirs_repaired/ { print $2 }')
4954 [ $repaired -eq 0 ] ||
4955 error "(7) Re-generate master LMV EA unexpected: $repaired"
4957 stat $DIR/$tdir/striped_dir/dummy ||
4958 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# Inverted check: a successful touch means the directory is still writable.
4960 touch $DIR/$tdir/striped_dir/foo &&
4961 error "(9) The broken striped directory should be read-only"
# Clear the immutable attribute so the directory can be removed below.
4963 chattr -i $DIR/$tdir/striped_dir ||
4964 error "(10) Fail to chattr on the broken striped directory"
# NOTE(review): "dummy" is not removed by any statement shown here before
# this rmdir -- confirm against the full script.
4966 rmdir $DIR/$tdir/striped_dir ||
4967 error "(11) Fail to remove the striped directory after LFSCK"
4969 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test_31e: re-generate the lost slave LMV EA for striped directory (1).
#
# The striped directory is created while fail_loc 0x162a
# (OBD_FAIL_LFSCK_LOST_SLAVE_LMV) with fail_val=0 is set on $SINGLEMDS.
# After the namespace LFSCK the striped_shards_repaired counter from
# $SHOW_NAMESPACE must be exactly 1, and the directory must be removable.
# Skipped when fewer than 2 MDTs are configured.
4972 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4975 echo "For some reason, the slave MDT-object of the striped directory"
4976 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4977 echo "slave LMV EA."
4980 check_mount_and_prep
4982 echo "Inject failure stub on MDT0 to simulate the case that the"
4983 echo "slave MDT-object (that resides on the same MDT as the master"
4984 echo "MDT-object resides on) lost the LMV EA."
4986 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4987 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
4988 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4989 error "(1) Fail to create striped directory"
4990 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4992 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4993 $START_NAMESPACE -r -A ||
4994 error "(2) Fail to start LFSCK for namespace"
4996 wait_all_targets_blocked namespace completed 3
# Exactly one shard was damaged, so exactly one shard repair is expected.
4998 local repaired=$($SHOW_NAMESPACE |
4999 awk '/^striped_shards_repaired/ { print $2 }')
5000 [ $repaired -eq 1 ] ||
5001 error "(4) Fail to re-generate slave LMV EA: $repaired"
5003 rmdir $DIR/$tdir/striped_dir ||
5004 error "(5) Fail to remove the striped directory after LFSCK"
5006 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test_31f: re-generate the lost slave LMV EA for striped directory (2).
#
# Same as the fail_val=0 variant, but with fail_val=1; per the echo text
# the damaged shard then resides on a different MDT than the master.
# The striped_shards_repaired counter is therefore read from the
# lfsck_namespace parameter on mds2 and must be exactly 1.
# Skipped when fewer than 2 MDTs are configured.
5009 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5012 echo "For some reason, the slave MDT-object of the striped directory"
5013 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5014 echo "slave LMV EA."
5017 check_mount_and_prep
5019 echo "Inject failure stub on MDT0 to simulate the case that the"
5020 echo "slave MDT-object (that resides on different MDT as the master"
5021 echo "MDT-object resides on) lost the LMV EA."
5023 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5024 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
5025 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5026 error "(1) Fail to create striped directory"
5027 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5029 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5030 $START_NAMESPACE -r -A ||
5031 error "(2) Fail to start LFSCK for namespace"
5033 wait_all_targets_blocked namespace completed 3
# The counter is queried on mds2 here, not on $SINGLEMDS.
5035 local repaired=$(do_facet mds2 $LCTL get_param -n \
5036 mdd.$(facet_svc mds2).lfsck_namespace |
5037 awk '/^striped_shards_repaired/ { print $2 }')
5038 [ $repaired -eq 1 ] ||
5039 error "(4) Fail to re-generate slave LMV EA: $repaired"
5041 rmdir $DIR/$tdir/striped_dir ||
5042 error "(5) Fail to remove the striped directory after LFSCK"
5044 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test_31g: repair the corrupted slave LMV EA.
#
# The striped directory is created while fail_loc 0x162b
# (OBD_FAIL_LFSCK_BAD_SLAVE_LMV) with fail_val=0 is set on $SINGLEMDS.
# After the namespace LFSCK, striped_shards_repaired must be exactly 1;
# after a client remount a file must be creatable and removable inside
# the directory, and the directory itself must be removable.
# Skipped when fewer than 2 MDTs are configured.
5047 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5050 echo "For some reason, the stripe index in the slave LMV EA is"
5051 echo "corrupted. The LFSCK should repair the slave LMV EA."
5054 check_mount_and_prep
5056 echo "Inject failure stub on MDT0 to simulate the case that the"
5057 echo "slave LMV EA on the first shard of the striped directory"
5058 echo "claims the same index as the second shard claims"
5060 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
5061 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
5062 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5063 error "(1) Fail to create striped directory"
5064 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5066 echo "Trigger namespace LFSCK to repair the slave LMV EA"
5067 $START_NAMESPACE -r -A ||
5068 error "(2) Fail to start LFSCK for namespace"
5070 wait_all_targets_blocked namespace completed 3
5072 local repaired=$($SHOW_NAMESPACE |
5073 awk '/^striped_shards_repaired/ { print $2 }')
5074 [ $repaired -eq 1 ] ||
5075 error "(4) Fail to repair slave LMV EA: $repaired"
# Remount, then confirm the repaired directory accepts normal operations.
5077 umount_client $MOUNT || error "(5) umount failed"
5078 mount_client $MOUNT || error "(6) mount failed"
5080 touch $DIR/$tdir/striped_dir/foo ||
5081 error "(7) Fail to touch file after the LFSCK"
5083 rm -f $DIR/$tdir/striped_dir/foo ||
5084 error "(8) Fail to unlink file after the LFSCK"
5086 rmdir $DIR/$tdir/striped_dir ||
5087 error "(9) Fail to remove the striped directory after LFSCK"
5089 run_test 31g "Repair the corrupted slave LMV EA"
# test_31h: repair the corrupted shard's name entry.
#
# The striped directory is created while fail_loc 0x162c
# (OBD_FAIL_LFSCK_BAD_SLAVE_NAME) with fail_val=0 is set on $SINGLEMDS.
# After the namespace LFSCK the dirent_repaired counter must be exactly
# 1; after a client remount a file must be creatable and removable
# inside the directory, and the directory itself must be removable.
# Skipped when fewer than 2 MDTs are configured.
5092 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5095 echo "For some reason, the shard's name entry in the striped"
5096 echo "directory may be corrupted. The LFSCK should repair the"
5097 echo "bad shard's name entry."
5100 check_mount_and_prep
5102 echo "Inject failure stub on MDT0 to simulate the case that the"
5103 echo "first shard's name entry in the striped directory claims"
5104 echo "the same index as the second shard's name entry claims."
5106 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
5107 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5108 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5109 error "(1) Fail to create striped directory"
5110 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5112 echo "Trigger namespace LFSCK to repair the shard's name entry"
5113 $START_NAMESPACE -r -A ||
5114 error "(2) Fail to start LFSCK for namespace"
5116 wait_all_targets_blocked namespace completed 3
# This variant checks dirent_repaired, not striped_shards_repaired.
5118 local repaired=$($SHOW_NAMESPACE |
5119 awk '/^dirent_repaired/ { print $2 }')
5120 [ $repaired -eq 1 ] ||
5121 error "(4) Fail to repair shard's name entry: $repaired"
# Remount, then confirm the repaired directory accepts normal operations.
5123 umount_client $MOUNT || error "(5) umount failed"
5124 mount_client $MOUNT || error "(6) mount failed"
5126 touch $DIR/$tdir/striped_dir/foo ||
5127 error "(7) Fail to touch file after the LFSCK"
5129 rm -f $DIR/$tdir/striped_dir/foo ||
5130 error "(8) Fail to unlink file after the LFSCK"
5132 rmdir $DIR/$tdir/striped_dir ||
5133 error "(9) Fail to remove the striped directory after LFSCK"
5135 run_test 31h "Repair the corrupted shard's name entry"
# test_32a: stop LFSCK when some OST failed.
#
# With the client unmounted, the layout LFSCK is started while fail_loc
# 0x162d (OBD_FAIL_LFSCK_ENGINE_DELAY, fail_val=3) is set on $SINGLEMDS,
# and its status must read "scanning-phase1". ost1 is then stopped, the
# fail_loc is cleared, and $STOP_LFSCK must still succeed. ost1 is
# restarted at the end.
# NOTE(review): any preparation that precedes the umount is not visible
# here -- confirm against the full script.
5140 umount_client $MOUNT
5142 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5143 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5144 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
# Status is sampled once, right after the start; no waiting is done here.
5146 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5147 [ "$STATUS" == "scanning-phase1" ] ||
5148 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
# Take ost1 down while the LFSCK is still in phase 1.
5151 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
5153 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The point of the test: stopping LFSCK must work with an OST offline.
5157 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
5159 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5160 error "(5) Fail to start ost1"
5162 run_test 32a "stop LFSCK when some OST failed"
# test_32b: stop LFSCK when some MDT failed.
#
# Creates dp on MDT index 1 and two $MDSCOUNT-striped directories (dc1,
# dc2) starting at MDT index 0 under it, unmounts the client, and starts
# the namespace LFSCK while fail_loc 0x162d
# (OBD_FAIL_LFSCK_ENGINE_DELAY, fail_val=3) is set on $SINGLEMDS. Once
# the status on $SINGLEMDS reads "scanning-phase1", mds2 is stopped, the
# fail_loc is cleared, and $STOP_LFSCK must still succeed. mds2 is
# restarted at the end.
# Skipped when fewer than 2 MDTs are configured.
5166 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5169 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5170 error "(1) Fail to create $DIR/$tdir/dp"
5171 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5172 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5173 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5174 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5175 umount_client $MOUNT
5177 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5178 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5179 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
# Poll the status on $SINGLEMDS; the awk "$2" is escaped because the
# command string is evaluated on the remote facet.
5181 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5182 mdd.${MDT_DEV}.lfsck_namespace |
5183 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5185 error "(5) unexpected status"
# Take mds2 down while the LFSCK is still in phase 1.
5189 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
5191 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The point of the test: stopping LFSCK must work with an MDT offline.
5195 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
5197 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5198 error "(8) Fail to start MDT2"
5200 run_test 32b "stop LFSCK when some MDT failed"
# test_33: check LFSCK parameters.
#
# Starts the layout LFSCK with "--dryrun -o -r" and expects the "param"
# line of $SHOW_LAYOUT to read "dryrun,all_targets,orphan". Then starts
# the namespace LFSCK with "-e abort -A -r" and expects the "param" line
# of $SHOW_NAMESPACE to read "failout,all_targets".
5206 $START_LAYOUT --dryrun -o -r ||
5207 error "(1) Fail to start layout LFSCK"
5208 wait_all_targets_blocked layout completed 2
# Only the second whitespace-separated field of the "param" line is compared.
5210 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5211 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5212 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
5214 $START_NAMESPACE -e abort -A -r ||
5215 error "(4) Fail to start namespace LFSCK"
5216 wait_all_targets_blocked namespace completed 5
5218 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5219 [ "$PARAMS" == "failout,all_targets" ] ||
5220 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5222 run_test 33 "check LFSCK paramters"
# test_34: LFSCK can rebuild the lost agent object.
#
# A directory is created on MDT index 1 while fail_loc 0x1630
# (OBD_FAIL_LFSCK_NO_AGENTOBJ) is set on $SINGLEMDS. The namespace LFSCK
# is then run twice on $SINGLEMDS: the first pass must report
# dirent_repaired == 1, the second must report dirent_repaired == 0.
# Skipped with fewer than 2 MDTs or when the MDT backend is not zfs.
5226 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5227 [ $(facet_fstype $SINGLEMDS) != zfs ] &&
5228 skip "Only valid for ZFS backend" && return
5232 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
5233 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5234 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5235 error "(1) Fail to create $DIR/$tdir/dummy"
5237 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# First pass: expected to repair exactly one entry.
5238 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
5239 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5240 mdd.${MDT_DEV}.lfsck_namespace |
5241 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5243 error "(3) unexpected status"
5246 local repaired=$($SHOW_NAMESPACE |
5247 awk '/^dirent_repaired/ { print $2 }')
5248 [ $repaired -eq 1 ] ||
5249 error "(4) Fail to repair the lost agent object: $repaired"
# Second pass: nothing should be left to repair.
5251 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5252 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5253 mdd.${MDT_DEV}.lfsck_namespace |
5254 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5256 error "(6) unexpected status"
5259 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5260 [ $repaired -eq 0 ] ||
5261 error "(7) Unexpected repairing: $repaired"
5263 run_test 34 "LFSCK can rebuild the lost agent object"
# test_35: LFSCK can rebuild the lost agent entry.
#
# A directory is created on MDT index 1 while fail_loc 0x1631
# (OBD_FAIL_LFSCK_NO_AGENTENT) is set on mds2. The namespace LFSCK is
# then run twice, with "setupall" in between: the first pass must report
# agent_entries_repaired == 1 on mds2, the second must report 0.
# Skipped when fewer than 2 MDTs are configured.
5267 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5271 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
5272 do_facet mds2 $LCTL set_param fail_loc=0x1631
5273 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5274 error "(1) Fail to create $DIR/$tdir/dummy"
5277 do_facet mds2 $LCTL set_param fail_loc=0
# First pass: expected to repair exactly one agent entry on mds2.
5278 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
# Fix: the message used to interpolate ${k}, which this test never sets;
# the facet being polled is mds2, so name it directly.
5279 wait_update_facet mds2 "$LCTL get_param -n \
5280 mdd.$(facet_svc mds2).lfsck_namespace |
5281 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5282 error "(3) MDS2 is not the expected 'completed'"
5284 local repaired=$(do_facet mds2 $LCTL get_param -n \
5285 mdd.$(facet_svc mds2).lfsck_namespace |
5286 awk '/^agent_entries_repaired/ { print $2 }')
5287 [ $repaired -eq 1 ] ||
5288 error "(4) Fail to repair the lost agent entry: $repaired"
5290 echo "stopall to cleanup object cache"
5293 setupall > /dev/null
# Second pass: nothing should be left to repair.
5295 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
# Fix: same undefined ${k} as in message (3).
5296 wait_update_facet mds2 "$LCTL get_param -n \
5297 mdd.$(facet_svc mds2).lfsck_namespace |
5298 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5299 error "(6) MDS2 is not the expected 'completed'"
5301 repaired=$(do_facet mds2 $LCTL get_param -n \
5302 mdd.$(facet_svc mds2).lfsck_namespace |
5303 awk '/^agent_entries_repaired/ { print $2 }')
5304 [ $repaired -eq 0 ] ||
5305 error "(7) Unexpected repairing: $repaired"
5307 run_test 35 "LFSCK can rebuild the lost agent entry"
# test_36a: rebuild LOV EA for mirrored file (1).
#
# Creates three files (f0, f1, f2), each with three mirrors of two
# components on explicitly chosen OSTs, writes 4 x 1M to each and
# resyncs them. While fail_loc 0x1616 (OBD_FAIL_LFSCK_LOST_MDTOBJ) is
# set on mds1, a different mirror is split with -d from each file
# (mirror 1 from f0, 2 from f1, 3 from f2), leaving two mirrors per
# file. The layout LFSCK is then started with "-r -o" and the test
# requires: status "completed" on every MDT and OST, repaired_orphan ==
# 9 on mds1, three mirrors on every file again, and each split mirror id
# listed by getstripe once more.
# Skipped when fewer than 3 OSTs are configured.
5310 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5313 echo "The target MDT-object's LOV EA corrupted as to lose one of the "
5314 echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
5315 echo "with the PFID EA of related OST-object(s) belong to the mirror."
5318 check_mount_and_prep
# Dump grant information now, and register a trap that dumps space usage
# and grants again (debugging aid; stack_trap is defined outside this view).
5322 lctl get_param osc.*.*grant*
5323 stack_trap "lfs df $DIR; lfs df -i $DIR; lctl get_param osc.*.*grant*"
# Identical three-mirror layout for all three files.
5325 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5326 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5327 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5328 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5329 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
5330 error "(1) Fail to create mirror file $DIR/$tdir/f1"
5331 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5332 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
5333 error "(2) Fail to create mirror file $DIR/$tdir/f2"
# 4M of data reaches past every first-component extent end (1M/2M/3M).
5335 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5336 error "(3) Fail to write $DIR/$tdir/f0"
5337 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
5338 error "(4) Fail to write $DIR/$tdir/f1"
5339 dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
5340 error "(5) Fail to write $DIR/$tdir/f2"
5342 $LFS mirror resync $DIR/$tdir/f0 ||
5343 error "(6) Fail to resync $DIR/$tdir/f0"
5344 $LFS mirror resync $DIR/$tdir/f1 ||
5345 error "(7) Fail to resync $DIR/$tdir/f1"
5346 $LFS mirror resync $DIR/$tdir/f2 ||
5347 error "(8) Fail to resync $DIR/$tdir/f2"
5349 cancel_lru_locks mdc
5350 cancel_lru_locks osc
5352 $LFS getstripe $DIR/$tdir/f0 ||
5353 error "(9) Fail to getstripe for $DIR/$tdir/f0"
5354 $LFS getstripe $DIR/$tdir/f1 ||
5355 error "(10) Fail to getstripe for $DIR/$tdir/f1"
5356 $LFS getstripe $DIR/$tdir/f2 ||
5357 error "(11) Fail to getstripe for $DIR/$tdir/f2"
5359 echo "Inject failure, to simulate the case of missing one mirror in LOV"
5360 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5361 do_facet mds1 $LCTL set_param fail_loc=0x1616
# A different mirror id is dropped from each file.
5363 $LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
5364 error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
5365 $LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
5366 error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
5367 $LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
5368 error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"
5372 do_facet mds1 $LCTL set_param fail_loc=0
# Inverted checks: a grep match means the split mirror is still listed.
5374 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
5375 error "(15) The 1st of mirror is not destroyed"
5376 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
5377 error "(16) The 2nd of mirror is not destroyed"
5378 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
5379 error "(17) The 3rd of mirror is not destroyed"
# NOTE(review): no "local" declaration for mirrors is visible here --
# confirm it is declared in the full script.
5383 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5384 [ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
5385 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5386 [ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
5387 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5388 [ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"
5390 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5391 $START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"
# Wait for "completed" on every MDT (32 is passed as the wait limit).
5393 for k in $(seq $MDSCOUNT); do
5394 # The LFSCK status query internal is 30 seconds. For the case
5395 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5396 # time to guarantee the status sync up.
5397 wait_update_facet mds${k} "$LCTL get_param -n \
5398 mdd.$(facet_svc mds${k}).lfsck_layout |
5399 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5400 error "(22) MDS${k} is not the expected 'completed'"
# OSTs are sampled once each, without waiting.
5403 for k in $(seq $OSTCOUNT); do
5404 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5405 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5406 awk '/^status/ { print $2 }')
5407 [ "$cur_status" == "completed" ] ||
5408 error "(23) OST${k} Expect 'completed', but got '$cur_status'"
# 9 presumably = 3 files x 3 OST indexes named per split mirror -- TODO confirm.
5411 local repaired=$(do_facet mds1 $LCTL get_param -n \
5412 mdd.$(facet_svc mds1).lfsck_layout |
5413 awk '/^repaired_orphan/ { print $2 }')
5414 [ $repaired -eq 9 ] ||
5415 error "(24) Expect 9 fixed on mds1, but got: $repaired"
# Every file must be back to three mirrors.
5417 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5418 [ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
5419 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5420 [ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
5421 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5422 [ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"
# On failure the full layout is printed before reporting the error.
5424 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
5425 $LFS getstripe $DIR/$tdir/f0
5426 error "(28) The 1st of mirror is not recovered"
5429 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
5430 $LFS getstripe $DIR/$tdir/f1
5431 error "(29) The 2nd of mirror is not recovered"
5434 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
5435 $LFS getstripe $DIR/$tdir/f2
5436 error "(30) The 3rd of mirror is not recovered"
5439 run_test 36a "rebuild LOV EA for mirrored file (1)"
# test_36b: rebuild LOV EA for mirrored file (2).
#
# Creates one file f0 with three mirrors of two components each, writes
# 4 x 1M and resyncs it, then removes it while fail_loc 0x1616
# (OBD_FAIL_LFSCK_LOST_MDTOBJ) is set on mds1. The layout LFSCK is
# started with "-r -o" and the test requires: status "completed" on
# every MDT and OST, repaired_orphan == 9 on mds1, and a file named
# <fid>-R-0 under .lustre/lost+found/MDT0000/ with 3 mirrors, 6
# components and mirror ids 1, 2 and 3 listed by getstripe.
# Skipped when FILESET is set or fewer than 3 OSTs are configured.
5442 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5443 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5446 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5447 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5448 echo "with the PFID EA of related OST-object(s) belong to the file. "
5451 check_mount_and_prep
5453 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5454 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5455 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# The FID is needed later to locate the recovered file in lost+found.
5457 local fid=$($LFS path2fid $DIR/$tdir/f0)
5459 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5460 error "(1) Fail to write $DIR/$tdir/f0"
5461 $LFS mirror resync $DIR/$tdir/f0 ||
5462 error "(2) Fail to resync $DIR/$tdir/f0"
5464 cancel_lru_locks mdc
5465 cancel_lru_locks osc
5467 $LFS getstripe $DIR/$tdir/f0 ||
5468 error "(3) Fail to getstripe for $DIR/$tdir/f0"
5470 echo "Inject failure, to simulate the case of missing the MDT-object"
5471 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5472 do_facet mds1 $LCTL set_param fail_loc=0x1616
5473 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
5477 do_facet mds1 $LCTL set_param fail_loc=0
5479 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5480 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
# Wait for "completed" on every MDT (32 is passed as the wait limit).
5482 for k in $(seq $MDSCOUNT); do
5483 # The LFSCK status query internal is 30 seconds. For the case
5484 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5485 # time to guarantee the status sync up.
5486 wait_update_facet mds${k} "$LCTL get_param -n \
5487 mdd.$(facet_svc mds${k}).lfsck_layout |
5488 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5489 error "(6) MDS${k} is not the expected 'completed'"
# OSTs are sampled once each, without waiting.
5492 for k in $(seq $OSTCOUNT); do
5493 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5494 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5495 awk '/^status/ { print $2 }')
5496 [ "$cur_status" == "completed" ] ||
5497 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
# 9 presumably = 3 mirrors x 3 OST indexes named per mirror -- TODO confirm.
5500 local count=$(do_facet mds1 $LCTL get_param -n \
5501 mdd.$(facet_svc mds1).lfsck_layout |
5502 awk '/^repaired_orphan/ { print $2 }')
5503 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# "count" is reused below for the mirror and component counts.
5505 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5506 count=$($LFS getstripe --mirror-count $name)
5507 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5509 count=$($LFS getstripe --component-count $name)
5510 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
# On failure the full layout is printed before reporting the error.
5512 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5513 $LFS getstripe $name
5514 error "(11) The 1st of mirror is not recovered"
5517 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5518 $LFS getstripe $name
5519 error "(12) The 2nd of mirror is not recovered"
5522 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5523 $LFS getstripe $name
5524 error "(13) The 3rd of mirror is not recovered"
5527 run_test 36b "rebuild LOV EA for mirrored file (2)"
# test_36c: rebuild LOV EA for mirrored file (3).
#
# Creates a mirrored file f0 (two "-N" mirror specifications are visible
# in the setstripe command), writes and resyncs it, then writes again
# without resyncing so that one mirror is stale. The lcme_flags seen in
# the first and last 10 lines of the getstripe output are saved. f0 is
# then removed while fail_loc 0x1616 (OBD_FAIL_LFSCK_LOST_MDTOBJ) is set
# on mds1. After the layout LFSCK ("-r -o") the test requires: status
# "completed" on every MDT and OST, repaired_orphan == 6 on mds1, and a
# file <fid>-R-0 under .lustre/lost+found/MDT0000/ with 2 mirrors, 4
# components and the same saved lcme_flags.
# Skipped when FILESET is set or fewer than 3 OSTs are configured.
5530 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5531 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5534 echo "The mirrored file has been modified, not resynced yet, then "
5535 echo "lost its MDT-object, but relatd OST-objects are still there. "
5536 echo "The layout LFSCK should rebuild the LOV EA and relatd status "
5537 echo "with the PFID EA of related OST-object(s) belong to the file. "
5540 check_mount_and_prep
# NOTE(review): the line that should follow the trailing backslash
# (target path and "||") does not appear in this view -- confirm against
# the full script before relying on this statement.
5542 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5544 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# The FID is needed later to locate the recovered file in lost+found.
5546 local fid=$($LFS path2fid $DIR/$tdir/f0)
5548 # The 1st dd && resync makes all related OST-objects have been written
5549 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5550 error "(1.1) Fail to write $DIR/$tdir/f0"
5551 $LFS mirror resync $DIR/$tdir/f0 ||
5552 error "(1.2) Fail to resync $DIR/$tdir/f0"
5553 # The 2nd dd makes one mirror to be stale
5554 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5555 error "(1.3) Fail to write $DIR/$tdir/f0"
5557 cancel_lru_locks mdc
5558 cancel_lru_locks osc
5560 $LFS getstripe $DIR/$tdir/f0 ||
5561 error "(2) Fail to getstripe for $DIR/$tdir/f0"
# Save the component flags from the head and tail of the layout dump for
# comparison with the rebuilt file.
5563 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5564 awk '/lcme_flags/ { print $2 }')
5565 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5566 awk '/lcme_flags/ { print $2 }')
5568 echo "Inject failure, to simulate the case of missing the MDT-object"
5569 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5570 do_facet mds1 $LCTL set_param fail_loc=0x1616
5571 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
5575 do_facet mds1 $LCTL set_param fail_loc=0
5577 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5578 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
# Wait for "completed" on every MDT (32 is passed as the wait limit).
5580 for k in $(seq $MDSCOUNT); do
5581 # The LFSCK status query internal is 30 seconds. For the case
5582 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5583 # time to guarantee the status sync up.
5584 wait_update_facet mds${k} "$LCTL get_param -n \
5585 mdd.$(facet_svc mds${k}).lfsck_layout |
5586 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5587 error "(5) MDS${k} is not the expected 'completed'"
# OSTs are sampled once each, without waiting.
5590 for k in $(seq $OSTCOUNT); do
5591 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5592 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5593 awk '/^status/ { print $2 }')
5594 [ "$cur_status" == "completed" ] ||
5595 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
5598 local count=$(do_facet mds1 $LCTL get_param -n \
5599 mdd.$(facet_svc mds1).lfsck_layout |
5600 awk '/^repaired_orphan/ { print $2 }')
# Fix: the message used to say "Expect 9" although the check requires 6.
5601 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
# "count" is reused below for the mirror and component counts.
5603 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5604 count=$($LFS getstripe --mirror-count $name)
5605 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5607 count=$($LFS getstripe --component-count $name)
5608 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
# The rebuilt file must carry the same flags as the original, both at the
# head and at the tail of the layout dump.
5610 local flags=$($LFS getstripe $name | head -n 10 |
5611 awk '/lcme_flags/ { print $2 }')
5612 [ "$flags" == "$saved_flags1" ] || {
5613 $LFS getstripe $name
5614 error "(10) expect flags $saved_flags1, got $flags"
5617 flags=$($LFS getstripe $name | tail -n 10 |
5618 awk '/lcme_flags/ { print $2 }')
5619 [ "$flags" == "$saved_flags2" ] || {
5620 $LFS getstripe $name
5621 error "(11) expect flags $saved_flags2, got $flags"
5624 run_test 36c "rebuild LOV EA for mirrored file (3)"
# test_37: LFSCK must skip a ORPHAN.
#
# Creates $DIR/$tdir/d0 on MDT index 0, starts a paused background
# multiop on it (command string "D_c"), then runs the namespace LFSCK on
# all targets and waits for it to reach "completed".
# NOTE(review): the assignment of $PID and whatever happens to d0 between
# multiop and the LFSCK start are not visible here -- confirm against
# the full script.
5630 local t_dir="$DIR/$tdir/d0"
5631 check_mount_and_prep
5633 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
5634 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
# NOTE(review): in the failure branch "kill -USR1 $PID" comes after
# "error"; if error does not return, the multiop is never signalled --
# TODO confirm and consider swapping the two.
5638 $START_NAMESPACE -r -A || {
5639 error "(3) Fail to start LFSCK for namespace!"; kill -USR1 $PID; }
5641 wait_all_targets_blocked namespace completed 4
5646 run_test 37 "LFSCK must skip a ORPHAN"
# Check that foreign file $1 still carries the LOV EA it was created with and
# that restriping it is refused. The first failed check is reported through
# error().
#   $1 - path of the foreign file
#   $2 - expected lfm_value, i.e. the "<uuid1>@<uuid2>" string set at creation
_test_38_verify_foreign_file() {
	local file=$1
	local value=$2

	$LFS getstripe -v "$file" |
		grep "lfm_magic:.*0x0BD70BD0" ||
		error "$file: invalid LOV EA foreign magic"
	# lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
	$LFS getstripe -v "$file" | grep "lfm_length:.*73" ||
		error "$file: invalid LOV EA foreign size"
	$LFS getstripe -v "$file" | grep "lfm_type:.*daos" ||
		error "$file: invalid LOV EA foreign type"
	$LFS getstripe -v "$file" |
		grep "lfm_flags:.*0x0000DA05" ||
		error "$file: invalid LOV EA foreign flags"
	$LFS getstripe "$file" |
		grep "lfm_value:.*$value" ||
		error "$file: invalid LOV EA foreign value"

	# modify striping should fail
	$LFS setstripe -c 2 "$file" &&
		error "$file: setstripe should fail"

	# A refused setstripe is the expected outcome, so do not hand its
	# non-zero status back to the caller.
	return 0
}

# Test 38: create a foreign (daos) file, run namespace and layout LFSCK, and
# check that neither reports a repair and that the foreign LOV EA is unchanged
# afterwards. Finally the file must reject read and write but allow removal.
# Skipped when $MDS1_VERSION is not newer than 2.12.51.
test_38() {
	[[ $MDS1_VERSION -le $(version_code 2.12.51) ]] &&
		skip "Need MDS version newer than 2.12.51"

	test_mkdir $DIR/$tdir

	local file=$DIR/$tdir/$tfile
	local uuid1=$(cat /proc/sys/kernel/random/uuid)
	local uuid2=$(cat /proc/sys/kernel/random/uuid)
	local repaired

	# create foreign file
	$LFS setstripe --foreign=daos --flags 0xda05 \
		-x "${uuid1}@${uuid2}" "$file" ||
		error "$file: create failed"

	_test_38_verify_foreign_file "$file" "${uuid1}@${uuid2}"

	$START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 1

	# check that "global" namespace_repaired == 0 !!!
	repaired=$(do_facet mds1 \
		"$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
		 awk '/^namespace_repaired/ { print \\\$2 }'")
	# Quoted so that an empty query result fails the test instead of
	# producing a malformed "[" expression.
	[ "$repaired" -eq 0 ] ||
		error "(2) Expect no namespace repair, but got: $repaired"

	$START_LAYOUT -A -r || error "Fail to start LFSCK for layout"

	wait_all_targets_blocked layout completed 2

	# check that "global" layout_repaired == 0 !!!
	repaired=$(do_facet mds1 \
		"$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
		 awk '/^layout_repaired/ { print \\\$2 }'")
	[ "$repaired" -eq 0 ] ||
		error "(2) Expect no layout repair, but got: $repaired"

	echo "post-lfsck checks of foreign file"

	_test_38_verify_foreign_file "$file" "${uuid1}@${uuid2}"

	# R/W should fail; a successful read or write must go through error(),
	# otherwise the message string would be run as a command and ignored.
	cat "$file" && error "$file: read should fail"
	cat /etc/passwd > "$file" &&
		error "$file: write should fail"

	#remove foreign file
	rm "$file" ||
		error "$file: remove of foreign file has failed"
}
run_test 38 "LFSCK does not break foreign file and reverse is also true"
# Check that foreign directory $1 still carries the LMV EA it was created with,
# that no file can be created inside it, and that chmod/chown on it still work.
# The first failed check is reported through error().
#   $1 - path of the foreign directory
#   $2 - expected lfm_value, i.e. the "<uuid1>@<uuid2>" string set at creation
_test_39_verify_foreign_dir() {
	local dir=$1
	local value=$2

	$LFS getdirstripe -v "$dir" |
		grep "lfm_magic:.*0x0CD50CD0" ||
		error "$dir: invalid LMV EA magic"
	# lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
	# - sizeof(lfm_type) - sizeof(lfm_flags)
	$LFS getdirstripe -v "$dir" | grep "lfm_length:.*73" ||
		error "$dir: invalid LMV EA size"
	$LFS getdirstripe -v "$dir" | grep "lfm_type:.*daos" ||
		error "$dir: invalid LMV EA type"
	$LFS getdirstripe -v "$dir" |
		grep "lfm_flags:.*0x0000DA05" ||
		error "$dir: invalid LMV EA flags"
	$LFS getdirstripe "$dir" |
		grep "lfm_value.*$value" ||
		error "$dir: invalid LMV EA value"

	# file create in dir should fail; a successful touch must go through
	# error(), otherwise the message string would be run as a command.
	touch "$dir/$tfile" &&
		error "$dir: file create should fail"

	# chmod should work
	chmod 777 "$dir" ||
		error "$dir: chmod failed"

	# chown should work
	chown $RUNAS_ID:$RUNAS_GID "$dir" ||
		error "$dir: chown failed"
}

# Test 39: create a foreign (daos) directory, run namespace and layout LFSCK,
# and check that neither reports a repair and that the foreign LMV EA is
# unchanged afterwards. Finally the directory must be removable with rmdir.
# Skipped when the $SINGLEMDS version is not newer than 2.12.51.
test_39() {
	[[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.12.51) ]] &&
		skip "Need MDS version newer than 2.12.51"

	test_mkdir $DIR/$tdir

	local dir=$DIR/$tdir/${tdir}2
	local uuid1=$(cat /proc/sys/kernel/random/uuid)
	local uuid2=$(cat /proc/sys/kernel/random/uuid)
	local repaired

	# create foreign dir
	$LFS mkdir --foreign=daos --xattr="${uuid1}@${uuid2}" --flags=0xda05 \
		"$dir" ||
		error "$dir: create failed"

	_test_39_verify_foreign_dir "$dir" "${uuid1}@${uuid2}"

	$START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 1

	# check that "global" namespace_repaired == 0 !!!
	repaired=$(do_facet mds1 \
		"$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
		 awk '/^namespace_repaired/ { print \\\$2 }'")
	# Quoted so that an empty query result fails the test instead of
	# producing a malformed "[" expression.
	[ "$repaired" -eq 0 ] ||
		error "(2) Expect nothing to be repaired, but got: $repaired"

	$START_LAYOUT -A -r || error "Fail to start LFSCK for layout"

	wait_all_targets_blocked layout completed 2

	# check that "global" layout_repaired == 0 !!!
	repaired=$(do_facet mds1 \
		"$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
		 awk '/^layout_repaired/ { print \\\$2 }'")
	[ "$repaired" -eq 0 ] ||
		error "(2) Expect no layout repair, but got: $repaired"

	echo "post-lfsck checks of foreign dir"

	_test_39_verify_foreign_dir "$dir" "${uuid1}@${uuid2}"

	# remove foreign dir
	rmdir "$dir" ||
		error "$dir: remove of foreign dir has failed"
}
run_test 39 "LFSCK does not break foreign dir and reverse is also true"
# Visible body of test 40a ("LFSCK correctly fixes lmm_oi in composite
# layout"): skipped unless at least two MDTs are configured.
5836 [[ $MDSCOUNT -ge 2 ]] || skip "needs >= 2 MDTs"
5838 check_mount_and_prep
# dir1 is created with MDT index 1 and given a three-component layout whose
# components end at 1M, 128M and eof.
# NOTE(review): the exit status of mkdir, setstripe, migrate and lfsck_start
# below is not checked.
5839 $LFS mkdir -i 1 $DIR/$tdir/dir1
5840 $LFS setstripe -E 1M -c1 -S 1M -E 128M -c2 -S 4M -E eof $DIR/$tdir/dir1
# Record the layout of a fresh file so it can be compared after LFSCK.
5842 touch $DIR/$tdir/dir1/f1
5843 local layout1=$(get_layout_param $DIR/$tdir/dir1/f1)
5845 echo "Migrate $DIR/$tdir/dir1 from MDT1 to MDT0"
5846 $LFS migrate -m 0 $DIR/$tdir/dir1
5848 echo "trigger LFSCK for layout"
5849 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -t layout -r
# Wait for the lfsck_layout "status" field to read "completed"; the 32 is
# presumably the wait limit in seconds -- TODO confirm.
# NOTE(review): the "|| {" group opened below has no closing line in this
# listing and a line appears to be missing before error -- check the full file.
5851 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5852 mdd.${MDT_DEV}.lfsck_layout |
5853 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5855 error "(2) unexpected status"
# The layout parameters read after LFSCK must equal those recorded before it.
5858 local layout2=$(get_layout_param $DIR/$tdir/dir1/f1)
5860 [[ "$layout1" == "$layout2" ]] || error "layout lost after lfsck"
5862 run_test 40a "LFSCK correctly fixes lmm_oi in composite layout"
# End-of-script teardown: put back the size/count settings, then rebuild and
# clean up the test filesystem.
# NOTE(review): the SAVED_* variables are not assigned anywhere in the visible
# lines; presumably they are captured earlier in the script -- verify.
5864 # restore MDS/OST size
5865 MDSSIZE=${SAVED_MDSSIZE}
5866 OSTSIZE=${SAVED_OSTSIZE}
5867 OSTCOUNT=${SAVED_OSTCOUNT}
5869 # cleanup the system at last
# REFORMAT="yes" is given as a per-command assignment, so it applies to this
# cleanup_and_setup_lustre invocation only.
5870 REFORMAT="yes" cleanup_and_setup_lustre
5873 check_and_cleanup_lustre