3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
# Default the Lustre tree to the parent of this script's directory, then
# source the shared test framework from it.
11 LUSTRE=${LUSTRE:-$(dirname $0)/..}
12 . $LUSTRE/tests/test-framework.sh
# Seed the exclusion list from the caller-supplied SANITY_LFSCK_EXCEPT.
16 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT "
18 # DNE does not support striped directories on a ZFS-based backend yet.
# NOTE(review): the command guarded by the trailing && on the next line is
# not visible in this excerpt.
19 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
20 #Bug number for excepting test
22 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# Extend the exclusion list according to the reported MDS/OST version codes.
24 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
25 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
27 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
28 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
30 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
31 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
# With SLOW=no the slow-test exclusion list is set to empty here.
33 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
# Exit quietly (status 0) when require_dsh_mds fails.
36 require_dsh_mds || exit 0
# NOTE(review): the rest of this if-block (including its fi) is not visible
# in this excerpt.
40 if ! check_versions; then
41 skip "It is NOT necessary to test lfsck under interoperation mode"
# Skip the whole script when the MDS is older than 2.3.60.
45 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
46 skip "Need MDS version at least 2.3.60" && exit 0
# Remember the caller's sizing before it is overridden below.
# NOTE(review): the code that restores these values is not visible here.
50 SAVED_MDSSIZE=${MDSSIZE}
51 SAVED_OSTSIZE=${OSTSIZE}
52 SAVED_OSTCOUNT=${OSTCOUNT}
53 # Use small MDS and OST sizes to speed up formatting.
54 # Do not make MDSSIZE/OSTSIZE too small, since they affect the default journal size.
56 [ $(facet_fstype $SINGLEMDS) == zfs ] && MDSSIZE=300000
58 [ $(facet_fstype ost1) == zfs ] && OSTSIZE=300000
60 # Many OSTs are not needed; cap the count to reduce the format/start/stop overhead.
62 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
64 # Build a clean test environment.
65 REFORMAT="yes" check_and_setup_lustre
# Target names and MDT0 device used by every test below.
67 MDT_DEV="${FSNAME}-MDT0000"
68 OST_DEV="${FSNAME}-OST0000"
69 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Command strings, expanded unquoted by the tests (e.g. "$START_NAMESPACE -r"),
# for starting, stopping and inspecting LFSCK on the MDT and on ost1.
70 START_NAMESPACE="do_facet $SINGLEMDS \
71 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
72 START_LAYOUT="do_facet $SINGLEMDS \
73 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
74 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
75 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
76 SHOW_NAMESPACE="do_facet $SINGLEMDS \
77 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
78 SHOW_LAYOUT="do_facet $SINGLEMDS \
79 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
80 SHOW_LAYOUT_ON_OST="do_facet ost1 \
81 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# MDS mount-option variants: default OI scrub behaviour, scrub disabled
# (noscrub), and skip_lfsck.
82 MOUNT_OPTS_SCRUB="$MDS_MOUNT_OPTS -o user_xattr"
83 MOUNT_OPTS_NOSCRUB="$MDS_MOUNT_OPTS -o user_xattr,noscrub"
84 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
# Populates $DIR/$tdir with test data: copies the test scripts in, then (when
# ndirs > 0) creates $ndirs "d*" directories and $ndirs "f*" files, and (when
# nfiles > 0) $nfiles files under each d${i}.
# NOTE(review): the enclosing function header, the assignments of nfiles,
# ndirs and igif, and the closing fi/done lines are not visible in this
# excerpt - presumably nfiles/ndirs/igif are positional arguments; confirm.
93 echo "preparing... $nfiles * $ndirs files will be created $(date)."
# When igif is non-empty, arm fail_loc 0x1504 on the MDS before creating
# anything; it is cleared again near the end of this fragment.
94 if [ ! -z $igif ]; then
95 #define OBD_FAIL_FID_IGIF 0x1504
96 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
99 cp $LUSTRE/tests/*.sh $DIR/$tdir/
100 if [ $ndirs -gt 0 ]; then
101 createmany -d $DIR/$tdir/d $ndirs
102 createmany -m $DIR/$tdir/f $ndirs
103 if [ $nfiles -gt 0 ]; then
104 for ((i = 0; i < $ndirs; i++)); do
105 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
106 /dev/null || error "createmany $nfiles"
109 createmany -d $DIR/$tdir/e $ndirs
# In igif mode, create one more file while the fail_loc is still armed, then
# reset fail_loc to 0.
112 if [ ! -z $igif ]; then
113 touch $DIR/$tdir/dummy
114 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
117 echo "prepared $(date)."
# Stops MDT0, runs run_e2fsck with "-n" against its device, and restarts the
# MDT with the noscrub mount options. Returns immediately when the MDS
# backend is not ldiskfs.
# NOTE(review): the tail of the first run_e2fsck pipeline and the closing
# braces are not visible in this excerpt; from the visible lines, error (2)
# is raised after a second run_e2fsck pass - confirm the exact condition
# against the full file.
120 run_e2fsck_on_mdt0() {
121 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
123 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
124 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
126 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
127 error "(2) Detected inconsistency on MDT0"
129 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
130 error "(3) Fail to start MDT0"
# Runs "lfsck_query -t $com -M ${FSNAME}-MDT0000 -w" on mds1, takes the second
# field of the "${com}_mdts_${status}" line as the MDT count, and raises
# error ($err) after dumping the full query output when that count differs
# from $MDSCOUNT.
# NOTE(review): com, status and err are not assigned in the visible lines -
# presumably positional arguments 1-3 (test 2e calls this with
# "namespace completed 4"); confirm against the full file.
133 wait_all_targets_blocked() {
138 local count=$(do_facet mds1 \
139 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
140 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
141 [[ $count -eq $MDSCOUNT ]] || {
142 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
143 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# NOTE(review): the statements below poll the same query (without -w) through
# wait_update_facet until it reports $MDSCOUNT, using $LTIME as the limit.
# They look like the body of a separate helper whose header is not visible
# in this excerpt - confirm.
152 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
153 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
154 "$MDSCOUNT" $LTIME || {
155 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
156 error "($err) some MDTs are not in ${status}"
# Test 0 ("Control LFSCK manually"): drives namespace LFSCK through
# start -> stop -> resume -> completed, then resets it and checks that
# success_count grows by exactly one.
# NOTE(review): the function header, preparation steps and closing lines are
# not visible in this excerpt.
163 #define OBD_FAIL_LFSCK_DELAY1 0x1600
164 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
165 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
167 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
# With the fail_loc armed, the scan is expected to still be in phase 1.
169 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
170 [ "$STATUS" == "scanning-phase1" ] ||
171 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
173 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
175 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
176 [ "$STATUS" == "stopped" ] ||
177 error "(6) Expect 'stopped', but got '$STATUS'"
# Start again without -r (no reset) and expect phase 1 once more.
179 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
181 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
182 [ "$STATUS" == "scanning-phase1" ] ||
183 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail_loc and wait for the status to become "completed".
185 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
186 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
187 mdd.${MDT_DEV}.lfsck_namespace |
188 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
190 error "(9) unexpected status"
# No inconsistency was injected here, so updated_phase1 must be 0.
193 local repaired=$($SHOW_NAMESPACE |
194 awk '/^updated_phase1/ { print $2 }')
195 [ $repaired -eq 0 ] ||
196 error "(10) Expect nothing to be repaired, but got: $repaired"
# Reset and rerun; success_count must be exactly one higher afterwards.
198 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
199 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
200 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
201 mdd.${MDT_DEV}.lfsck_namespace |
202 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
204 error "(12) unexpected status"
207 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
208 [ $((scanned1 + 1)) -eq $scanned2 ] ||
209 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
211 echo "stopall, should NOT crash LU-3649"
212 stopall || error "(14) Fail to stopall"
214 run_test 0 "Control LFSCK manually"
# Test 1a: creates "dummy" while fail_loc 0x1501 is armed, runs a reset
# namespace LFSCK, and expects exactly one repaired entry. Afterwards the
# directory is listed from a client with fail_loc 0x1505 armed.
# NOTE(review): the function header, preparation steps and closing lines are
# not visible in this excerpt.
219 #define OBD_FAIL_FID_INDIR 0x1501
220 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
221 touch $DIR/$tdir/dummy
223 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
225 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
226 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
227 mdd.${MDT_DEV}.lfsck_namespace |
228 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
230 error "(4) unexpected status"
# Read the dirent_repaired counter; when that field is absent (empty result),
# fall back to updated_phase1.
233 local repaired=$($SHOW_NAMESPACE |
234 awk '/^dirent_repaired/ { print $2 }')
235 # for interop with old server
236 [ -z "$repaired" ] &&
237 repaired=$($SHOW_NAMESPACE |
238 awk '/^updated_phase1/ { print $2 }')
240 [ $repaired -eq 1 ] ||
241 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
245 mount_client $MOUNT || error "(6) Fail to start client!"
# The listing must succeed while fail_loc 0x1505 is armed.
247 #define OBD_FAIL_FID_LOOKUP 0x1505
248 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
249 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
251 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
253 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Test 1b: ldiskfs only. Creates "dummy" with fail_loc 0x1502 armed, runs a
# reset namespace LFSCK with fail_loc 0x1506 armed, and expects exactly one
# repaired entry; then stats the file from a client under fail_loc 0x1505.
# NOTE(review): the function header, preparation steps and closing lines are
# not visible in this excerpt.
257 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
258 skip "OI Scrub not implemented for ZFS" && return
262 #define OBD_FAIL_FID_INLMA 0x1502
263 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
264 touch $DIR/$tdir/dummy
266 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Unlike tests 1a/1c, a fail_loc stays armed for the duration of the scan.
268 #define OBD_FAIL_FID_NOLMA 0x1506
269 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
270 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
271 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
272 mdd.${MDT_DEV}.lfsck_namespace |
273 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
275 error "(4) unexpected status"
# Read the dirent_repaired counter; when that field is absent (empty result),
# fall back to updated_phase1.
278 local repaired=$($SHOW_NAMESPACE |
279 awk '/^dirent_repaired/ { print $2 }')
280 # for interop with old server
281 [ -z "$repaired" ] &&
282 repaired=$($SHOW_NAMESPACE |
283 awk '/^updated_phase1/ { print $2 }')
285 [ $repaired -eq 1 ] ||
286 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
288 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
291 mount_client $MOUNT || error "(6) Fail to start client!"
# The stat must succeed while fail_loc 0x1505 is armed.
293 #define OBD_FAIL_FID_LOOKUP 0x1505
294 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
295 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
297 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
299 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Test 1c: creates "dummy" while fail_loc 0x1504 is armed, runs a reset
# namespace LFSCK, and expects exactly one repaired entry. Afterwards the
# directory is listed from a client with fail_loc 0x1505 armed.
# NOTE(review): the function header, preparation steps and closing lines are
# not visible in this excerpt.
304 #define OBD_FAIL_FID_IGIF 0x1504
305 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
306 touch $DIR/$tdir/dummy
308 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
310 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
311 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
312 mdd.${MDT_DEV}.lfsck_namespace |
313 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
315 error "(4) unexpected status"
# Read the dirent_repaired counter; when that field is absent (empty result),
# fall back to updated_phase1.
318 local repaired=$($SHOW_NAMESPACE |
319 awk '/^dirent_repaired/ { print $2 }')
320 # for interop with old server
321 [ -z "$repaired" ] &&
322 repaired=$($SHOW_NAMESPACE |
323 awk '/^updated_phase1/ { print $2 }')
325 [ $repaired -eq 1 ] ||
326 error "(5) Fail to repair lost FID-in-dirent: $repaired"
330 mount_client $MOUNT || error "(6) Fail to start client!"
# The listing must succeed while fail_loc 0x1505 is armed.
332 #define OBD_FAIL_FID_LOOKUP 0x1505
333 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
334 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
336 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
338 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
# Test 2a: creates "dummy" while fail_loc 0x1603 is armed, runs a reset
# namespace LFSCK, and expects exactly one linkEA repair. It then verifies
# from a client that the file has one link and that its FID maps back to the
# same path.
# NOTE(review): the function header, preparation steps and closing lines are
# not visible in this excerpt.
343 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
344 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
345 touch $DIR/$tdir/dummy
347 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
349 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
350 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
351 mdd.${MDT_DEV}.lfsck_namespace |
352 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
354 error "(4) unexpected status"
# Read the linkea_repaired counter; when that field is absent (empty result),
# fall back to updated_phase2.
357 local repaired=$($SHOW_NAMESPACE |
358 awk '/^linkea_repaired/ { print $2 }')
359 # for interop with old server
360 [ -z "$repaired" ] &&
361 repaired=$($SHOW_NAMESPACE |
362 awk '/^updated_phase2/ { print $2 }')
364 [ $repaired -eq 1 ] ||
365 error "(5) Fail to repair crashed linkEA: $repaired"
369 mount_client $MOUNT || error "(6) Fail to start client!"
371 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
372 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round-trip check: path -> FID -> path must give back the original path.
374 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
375 local dummyname=$($LFS fid2path $DIR $dummyfid)
376 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
377 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
379 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Test 2b: creates "dummy" while fail_loc 0x1604 is armed, runs a reset
# namespace LFSCK, and expects updated_phase2 to be exactly 1. It then
# verifies from a client that the file has one link and that its FID maps
# back to the same path.
# NOTE(review): the function header, preparation steps and closing lines are
# not visible in this excerpt.
385 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
386 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
387 touch $DIR/$tdir/dummy
389 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
391 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
392 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
393 mdd.${MDT_DEV}.lfsck_namespace |
394 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
396 error "(4) unexpected status"
399 local repaired=$($SHOW_NAMESPACE |
400 awk '/^updated_phase2/ { print $2 }')
401 [ $repaired -eq 1 ] ||
402 error "(5) Fail to repair crashed linkEA: $repaired"
406 mount_client $MOUNT || error "(6) Fail to start client!"
408 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
409 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round-trip check: path -> FID -> path must give back the original path.
411 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
412 local dummyname=$($LFS fid2path $DIR $dummyfid)
413 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
414 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
416 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Test 2c: creates "dummy" while fail_loc 0x1605 is armed, runs a reset
# namespace LFSCK, and expects updated_phase2 to be exactly 1. It then
# verifies from a client that the file has one link and that its FID maps
# back to the same path.
# NOTE(review): the function header, preparation steps and closing lines are
# not visible in this excerpt.
422 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
423 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
424 touch $DIR/$tdir/dummy
426 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
428 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
429 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
430 mdd.${MDT_DEV}.lfsck_namespace |
431 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
433 error "(4) unexpected status"
436 local repaired=$($SHOW_NAMESPACE |
437 awk '/^updated_phase2/ { print $2 }')
438 [ $repaired -eq 1 ] ||
439 error "(5) Fail to repair crashed linkEA: $repaired"
443 mount_client $MOUNT || error "(6) Fail to start client!"
445 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
446 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round-trip check: path -> FID -> path must give back the original path.
448 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
449 local dummyname=$($LFS fid2path $DIR $dummyfid)
450 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
451 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
453 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Test 2d: creates "dummy" while fail_loc 0x161d is armed, runs a reset
# namespace LFSCK, and expects linkea_repaired to be exactly 1. It then
# verifies from a client that the file has one link and that its FID maps
# back to the same path.
# NOTE(review): the function header, preparation steps and closing lines are
# not visible in this excerpt.
459 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
460 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
461 touch $DIR/$tdir/dummy
463 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
465 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
466 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
467 mdd.${MDT_DEV}.lfsck_namespace |
468 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
470 error "(4) unexpected status"
473 local repaired=$($SHOW_NAMESPACE |
474 awk '/^linkea_repaired/ { print $2 }')
475 [ $repaired -eq 1 ] ||
476 error "(5) Fail to repair crashed linkEA: $repaired"
480 mount_client $MOUNT || error "(6) Fail to start client!"
482 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
483 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round-trip check: path -> FID -> path must give back the original path.
485 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
486 local dummyname=$($LFS fid2path $DIR $dummyfid)
487 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
488 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
490 run_test 2d "LFSCK can recover the missing linkEA entry"
# Test 2e: needs at least two MDTs. Creates d0 with MDT index 1 and, with
# fail_loc 0x1603 armed, d0/d1 with MDT index 0; then runs namespace LFSCK
# with "-r -A" and expects exactly one linkEA repair plus a working
# path -> FID -> path round trip for d0/d1.
# NOTE(review): the function header, preparation steps and closing lines are
# not visible in this excerpt.
494 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
498 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
500 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
501 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
502 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
503 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
505 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
# Arguments: component "namespace", expected status "completed", error tag 4.
507 wait_all_targets_blocked namespace completed 4
509 local repaired=$($SHOW_NAMESPACE |
510 awk '/^linkea_repaired/ { print $2 }')
511 [ $repaired -eq 1 ] ||
512 error "(5) Fail to repair crashed linkEA: $repaired"
514 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
515 local name=$($LFS fid2path $DIR $fid)
516 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
517 error "(6) Fail to repair linkEA: $fid $name"
519 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Test 3: builds hard-linked files - two plain links under "dummy" and two
# links created under "edir" with fail_loc 0x1603 and 0x1604 armed
# respectively - then runs a reset namespace LFSCK and checks the
# checked_phase2 and multiple_linked_repaired counters.
# NOTE(review): the function header, the step that creates d0/f0 and d0/f1,
# and the closing lines are not visible in this excerpt.
525 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
526 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
527 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
529 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
530 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
531 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
533 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
534 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
535 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
537 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
538 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
539 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
541 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
543 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
544 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
545 mdd.${MDT_DEV}.lfsck_namespace |
546 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
548 error "(10) unexpected status"
# Four multiple-linked files were created above, hence the ">= 4" bound.
551 local checked=$($SHOW_NAMESPACE |
552 awk '/^checked_phase2/ { print $2 }')
553 [ $checked -ge 4 ] ||
554 error "(11) Fail to check multiple-linked object: $checked"
# Two of the links were created with a fail_loc armed, hence ">= 2".
556 local repaired=$($SHOW_NAMESPACE |
557 awk '/^multiple_linked_repaired/ { print $2 }')
558 [ $repaired -ge 2 ] ||
559 error "(12) Fail to repair multiple-linked object: $repaired"
561 run_test 3 "LFSCK can verify multiple-linked objects"
# Test 4: ldiskfs only. Unmounts the client, stops the MDS, runs
# mds_backup_restore, restarts the MDS with noscrub, and then checks that a
# reset namespace LFSCK reports the "inconsistent" flag during phase 1,
# completes with empty flags, and repairs at least 9 entries.
# NOTE(review): the function header, preparation steps and closing lines are
# not visible in this excerpt.
565 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
566 skip "OI Scrub not implemented for ZFS" && return
569 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
570 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
572 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
573 echo "start $SINGLEMDS with disabling OI scrub"
574 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
575 error "(2) Fail to start MDS!"
577 #define OBD_FAIL_LFSCK_DELAY2 0x1601
578 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
579 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
# Wait on the "flags" field (not "status") to become "inconsistent".
580 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
581 mdd.${MDT_DEV}.lfsck_namespace |
582 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
584 error "(5) unexpected status"
587 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
588 [ "$STATUS" == "scanning-phase1" ] ||
589 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail_loc and wait for the status to become "completed".
591 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
592 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
593 mdd.${MDT_DEV}.lfsck_namespace |
594 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
596 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without "local" in the visible lines.
599 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
600 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Read the dirent_repaired counter; when that field is absent (empty result),
# fall back to updated_phase1.
602 local repaired=$($SHOW_NAMESPACE |
603 awk '/^dirent_repaired/ { print $2 }')
604 # for interop with old server
605 [ -z "$repaired" ] &&
606 repaired=$($SHOW_NAMESPACE |
607 awk '/^updated_phase1/ { print $2 }')
609 [ $repaired -ge 9 ] ||
610 error "(9) Fail to re-generate FID-in-dirent: $repaired"
614 mount_client $MOUNT || error "(10) Fail to start client!"
# The listing must succeed while fail_loc 0x1505 is armed.
616 #define OBD_FAIL_FID_LOOKUP 0x1505
617 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
618 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
619 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
621 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Test 5: ldiskfs only. Same flow as test 4, but mds_backup_restore is called
# with an extra argument "1", the flags expected during phase 1 are
# "inconsistent,upgrade", and at least 2 repairs are required. Afterwards
# "dummy" is checked from a client (stat, ls, FID round trip).
# NOTE(review): the function header, preparation steps and closing lines are
# not visible in this excerpt; the meaning of the extra "1" argument is not
# shown here.
625 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
626 skip "OI Scrub not implemented for ZFS" && return
629 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
630 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
632 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
633 echo "start $SINGLEMDS with disabling OI scrub"
634 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
635 error "(2) Fail to start MDS!"
637 #define OBD_FAIL_LFSCK_DELAY2 0x1601
638 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
639 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
# Wait on the "flags" field (not "status") to become "inconsistent,upgrade".
640 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
641 mdd.${MDT_DEV}.lfsck_namespace |
642 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
644 error "(5) unexpected status"
647 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
648 [ "$STATUS" == "scanning-phase1" ] ||
649 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail_loc and wait for the status to become "completed".
651 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
652 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
653 mdd.${MDT_DEV}.lfsck_namespace |
654 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
656 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without "local" in the visible lines.
659 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
660 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Read the dirent_repaired counter; when that field is absent (empty result),
# fall back to updated_phase1.
662 local repaired=$($SHOW_NAMESPACE |
663 awk '/^dirent_repaired/ { print $2 }')
664 # for interop with old server
665 [ -z "$repaired" ] &&
666 repaired=$($SHOW_NAMESPACE |
667 awk '/^updated_phase1/ { print $2 }')
669 [ $repaired -ge 2 ] ||
670 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
674 mount_client $MOUNT || error "(10) Fail to start client!"
# Both the stat and the listing must succeed while fail_loc 0x1505 is armed.
676 #define OBD_FAIL_FID_LOOKUP 0x1505
677 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
678 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
680 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
682 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Round-trip check: path -> FID -> path must give back the original path.
683 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
684 local dummyname=$($LFS fid2path $DIR $dummyfid)
685 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
686 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
688 run_test 5 "LFSCK can handle IGIF object upgrading"
# Test 6a: starts a reset namespace LFSCK with fail_loc 0x1600 armed, forces
# it to fail via fail_loc 0x80001608, then restarts it without -r and checks
# that latest_start_position (POS1) is greater than the recorded
# last_checkpoint_position (POS0).
# NOTE(review): the function header, the sleep announced by the comment
# below, the tails of the POS0/POS1 pipelines and the closing lines are not
# visible in this excerpt.
693 #define OBD_FAIL_LFSCK_DELAY1 0x1600
694 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
695 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
697 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
698 [ "$STATUS" == "scanning-phase1" ] ||
699 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
701 # Sleep 3 sec to guarantee at least one object processed by LFSCK
703 # Fail the LFSCK to guarantee there is at least one checkpoint
704 #define OBD_FAIL_LFSCK_FATAL1 0x1608
705 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
706 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
707 mdd.${MDT_DEV}.lfsck_namespace |
708 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
710 error "(4) unexpected status"
713 local POS0=$($SHOW_NAMESPACE |
714 awk '/^last_checkpoint_position/ { print $2 }' |
717 #define OBD_FAIL_LFSCK_DELAY1 0x1600
718 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
719 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
721 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
722 [ "$STATUS" == "scanning-phase1" ] ||
723 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
725 local POS1=$($SHOW_NAMESPACE |
726 awk '/^latest_start_position/ { print $2 }' |
728 [[ $POS0 -lt $POS1 ]] ||
729 error "(7) Expect larger than: $POS0, but got $POS1"
731 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
732 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
733 mdd.${MDT_DEV}.lfsck_namespace |
734 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
736 error "(8) unexpected status"
739 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Test 6b: same idea as test 6a, using fail_loc 0x1601 and 0x80001609. Both
# the second field (O_POS*) and the fourth field (D_POS*) of the checkpoint
# and start-position lines are captured; when either D_POS value is "N/A"
# or "0x0" the O_POS values are compared, and the D_POS values are compared
# in the lines that follow.
# NOTE(review): the function header, the sleep announced by the comment
# below, the tails of the O_POS0/O_POS1 pipelines, the else/fi of the
# comparison and the closing lines are not visible in this excerpt.
744 #define OBD_FAIL_LFSCK_DELAY2 0x1601
745 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
746 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
748 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
749 [ "$STATUS" == "scanning-phase1" ] ||
750 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
752 # Sleep 5 sec to guarantee that we are in the directory scanning
754 # Fail the LFSCK to guarantee there is at least one checkpoint
755 #define OBD_FAIL_LFSCK_FATAL2 0x1609
756 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
757 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
758 mdd.${MDT_DEV}.lfsck_namespace |
759 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
761 error "(4) unexpected status"
764 local O_POS0=$($SHOW_NAMESPACE |
765 awk '/^last_checkpoint_position/ { print $2 }' |
768 local D_POS0=$($SHOW_NAMESPACE |
769 awk '/^last_checkpoint_position/ { print $4 }')
771 #define OBD_FAIL_LFSCK_DELAY2 0x1601
772 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
773 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
775 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
776 [ "$STATUS" == "scanning-phase1" ] ||
777 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
779 local O_POS1=$($SHOW_NAMESPACE |
780 awk '/^latest_start_position/ { print $2 }' |
782 local D_POS1=$($SHOW_NAMESPACE |
783 awk '/^latest_start_position/ { print $4 }')
785 echo "Additional debug for 6b"
787 if [ "$D_POS0" == "N/A" -o "$D_POS0" == "0x0" \
788 -o "$D_POS1" == "0x0" -o "$D_POS1" == "N/A" ]; then
789 [[ $O_POS0 -lt $O_POS1 ]] ||
790 error "(7.1) $O_POS1 is not larger than $O_POS0"
792 [[ $D_POS0 -lt $D_POS1 ]] ||
793 error "(7.2) $D_POS1 is not larger than $D_POS0"
796 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
797 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
798 mdd.${MDT_DEV}.lfsck_namespace |
799 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
801 error "(8) unexpected status"
804 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Test 7a: starts a reset namespace LFSCK with fail_loc 0x1601 armed, stops
# the MDS while the scan is in phase 1, restarts it with the scrub-enabled
# mount options, and expects the LFSCK status to reach "completed" without
# an explicit lfsck_start.
# NOTE(review): the function header, the sleep announced by the comment
# below and the closing lines are not visible in this excerpt.
811 #define OBD_FAIL_LFSCK_DELAY2 0x1601
812 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
813 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
815 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
816 [ "$STATUS" == "scanning-phase1" ] ||
817 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
819 # Sleep 3 sec to guarantee at least one object processed by LFSCK
821 echo "stop $SINGLEMDS"
822 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
# The fail_loc is cleared before the MDS is started again.
824 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
825 echo "start $SINGLEMDS"
826 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
827 error "(5) Fail to start MDS!"
829 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
830 mdd.${MDT_DEV}.lfsck_namespace |
831 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
833 error "(6) unexpected status"
836 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Test 7b: creates dummy0..dummy19 with fail_loc 0x1604 armed, starts a reset
# namespace LFSCK with fail_loc 0x1602 armed, waits for "scanning-phase2",
# stops and restarts the MDS, and expects the status to reach "completed"
# without an explicit lfsck_start.
# NOTE(review): the function header, the "done" of the loop and the closing
# lines are not visible in this excerpt.
842 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
843 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
844 for ((i = 0; i < 20; i++)); do
845 touch $DIR/$tdir/dummy${i}
848 #define OBD_FAIL_LFSCK_DELAY3 0x1602
849 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
850 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
851 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
852 mdd.${MDT_DEV}.lfsck_namespace |
853 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
855 error "(4) unexpected status"
859 echo "stop $SINGLEMDS"
860 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
# The fail_loc is cleared before the MDS is started again.
862 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
863 echo "start $SINGLEMDS"
864 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
865 error "(6) Fail to start MDS!"
867 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
868 mdd.${MDT_DEV}.lfsck_namespace |
869 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
871 error "(7) unexpected status"
874 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Test 8 ("LFSCK state machine"): walks namespace LFSCK through the statuses
# init -> scanning-phase1 -> stopped -> scanning-phase1 -> failed ->
# scanning-phase1 -> crashed -> scanning-phase1 -> paused (twice, the second
# time after a skip_lfsck mount) -> scanning-phase2 -> completed, checking
# the reported status after each transition.
# NOTE(review): the function header, the setup between formatall and the
# first status check, several loop bodies/closers (sleep, done, timer
# initialisation) and the closing lines are not visible in this excerpt.
879 formatall > /dev/null
885 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
886 [ "$STATUS" == "init" ] ||
887 error "(2) Expect 'init', but got '$STATUS'"
# Create one directory and five files with fail_locs armed before the scan.
889 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
890 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
891 mkdir $DIR/$tdir/crashed
893 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
894 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
895 for ((i = 0; i < 5; i++)); do
896 touch $DIR/$tdir/dummy${i}
899 umount_client $MOUNT || error "(3) Fail to stop client!"
901 #define OBD_FAIL_LFSCK_DELAY2 0x1601
902 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
903 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
905 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
906 [ "$STATUS" == "scanning-phase1" ] ||
907 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
909 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
911 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
912 [ "$STATUS" == "stopped" ] ||
913 error "(7) Expect 'stopped', but got '$STATUS'"
915 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
917 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
918 [ "$STATUS" == "scanning-phase1" ] ||
919 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
# Force the running scan into the "failed" status.
921 #define OBD_FAIL_LFSCK_FATAL2 0x1609
922 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
923 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
924 mdd.${MDT_DEV}.lfsck_namespace |
925 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
927 error "(10) unexpected status"
930 #define OBD_FAIL_LFSCK_DELAY1 0x1600
931 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
932 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
934 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
935 [ "$STATUS" == "scanning-phase1" ] ||
936 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
# Stop the MDS with fail_loc 0x160a armed, then restart it with fail_loc
# 0x160b armed; the status expected after recovery is "crashed".
938 #define OBD_FAIL_LFSCK_CRASH 0x160a
939 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
942 echo "stop $SINGLEMDS"
943 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
945 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
946 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
948 echo "start $SINGLEMDS"
949 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
950 error "(14) Fail to start MDS!"
# Poll recovery_status until it is no longer "RECOVERING" or the timer
# reaches $timeout.
952 local timeout=$(max_recovery_time)
955 while [ $timer -lt $timeout ]; do
956 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
957 mdt.${MDT_DEV}.recovery_status |
958 awk '/^status/ { print \\\$2 }'")
959 [ "$STATUS" != "RECOVERING" ] && break;
964 [ $timer != $timeout ] ||
965 error "(14.1) recovery timeout"
967 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
968 [ "$STATUS" == "crashed" ] ||
969 error "(15) Expect 'crashed', but got '$STATUS'"
971 #define OBD_FAIL_LFSCK_DELAY2 0x1601
972 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
973 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
975 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
976 [ "$STATUS" == "scanning-phase1" ] ||
977 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
# A plain MDS stop during the scan, followed by a restart with fail_loc
# 0x160b armed; the status expected after recovery is "paused".
979 echo "stop $SINGLEMDS"
980 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
982 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
983 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
985 echo "start $SINGLEMDS"
986 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
987 error "(19) Fail to start MDS!"
990 while [ $timer -lt $timeout ]; do
991 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
992 mdt.${MDT_DEV}.recovery_status |
993 awk '/^status/ { print \\\$2 }'")
994 [ "$STATUS" != "RECOVERING" ] && break;
999 [ $timer != $timeout ] ||
1000 error "(19.1) recovery timeout"
1002 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1003 [ "$STATUS" == "paused" ] ||
1004 error "(20) Expect 'paused', but got '$STATUS'"
# Restart once more, this time mounting with skip_lfsck; the status is
# expected to remain "paused".
1006 echo "stop $SINGLEMDS"
1007 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1009 echo "start $SINGLEMDS without resume LFSCK"
1010 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
1011 error "(20.2) Fail to start MDS!"
1014 while [ $timer -lt $timeout ]; do
1015 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1016 mdt.${MDT_DEV}.recovery_status |
1017 awk '/^status/ { print \\\$2 }'")
1018 [ "$STATUS" != "RECOVERING" ] && break;
1020 timer=$((timer + 1))
1023 [ $timer != $timeout ] ||
1024 error "(20.3) recovery timeout"
1026 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1027 [ "$STATUS" == "paused" ] ||
1028 error "(20.4) Expect 'paused', but got '$STATUS'"
# Resume with fail_loc 0x1602 armed and wait for phase 2, where the flags
# must read "scanned-once,inconsistent".
1030 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1031 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1033 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
1034 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1035 mdd.${MDT_DEV}.lfsck_namespace |
1036 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1038 error "(22) unexpected status"
1041 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1042 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1043 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
# Clear the fail_loc, wait for "completed", and expect the flags to be empty.
1045 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1046 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1047 mdd.${MDT_DEV}.lfsck_namespace |
1048 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1050 error "(24) unexpected status"
1053 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1054 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1056 run_test 8 "LFSCK state machine"
# Body of the test registered below as 9a, "LFSCK speed control (1)":
# starts layout LFSCK with a speed limit and compares the reported
# average_speed_phase1 against bounds computed from that limit.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are used below but are
# not assigned in the lines visible here -- confirm where they are set.
#
# Skip when no line of /proc/cpuinfo matches "processor.*: 1".
1059 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1060 skip "Testing on UP system, the speed may be inaccurate."
# Create $DIR/$tdir/lfsck, set its striping, and create 5000 files in it.
1064 check_mount_and_prep
1065 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1066 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1067 createmany -o $DIR/$tdir/lfsck/f 5000
# First speed limit, passed to lfsck_start through -s.
1069 local BASE_SPEED1=100
1071 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# The scan must still report 'scanning-phase1' when it is sampled here.
1074 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1075 [ "$STATUS" == "scanning-phase1" ] ||
1076 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# Sample the reported average speed and check the upper bound.
1078 local SPEED=$($SHOW_LAYOUT |
1079 awk '/^average_speed_phase1/ { print $2 }')
1081 # There may be time error, normally it should be less than 2 seconds.
1082 # We allow another 20% schedule error.
1084 # MAX_MARGIN = 1.3 = 13 / 10
1085 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1086 RUN_TIME1 * 13 / 10))
1087 [ $SPEED -lt $MAX_SPEED ] || {
1089 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1090 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1093 # adjust speed limit
1094 local BASE_SPEED2=300
1096 do_facet $SINGLEMDS \
1097 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-sample the average speed; the bounds below weight both limits by
# their run times, with integer arithmetic throughout.
1100 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1101 # MIN_MARGIN = 0.7 = 7 / 10
1102 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1103 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1104 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# A too-low speed is reported through error_ignore (LU-5624) when the MDS
# backend is not ldiskfs.
1105 [ $SPEED -gt $MIN_SPEED ] || {
1106 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1107 error_ignore LU-5624 \
1108 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1111 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1115 # MAX_MARGIN = 1.3 = 13 / 10
1116 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1117 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1118 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1119 [ $SPEED -lt $MAX_SPEED ] || {
1121 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1122 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1123 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT and OST node (presumably 0 means
# "no limit" -- TODO confirm), then wait for the layout scan to complete.
1126 do_nodes $(comma_list $(mdts_nodes)) \
1127 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1128 do_nodes $(comma_list $(osts_nodes)) \
1129 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1131 wait_update_facet $SINGLEMDS \
1132 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1133 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1134 error "(7) Failed to get expected 'completed'"
1136 run_test 9a "LFSCK speed control (1)"
# Body of the test registered below as 9b, "LFSCK speed control (2)":
# same bounds check as the layout variant, but on namespace LFSCK and on
# the average_speed_phase2 value.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are used below but are
# not assigned in the lines visible here -- confirm where they are set.
#
# Skip when no line of /proc/cpuinfo matches "processor.*: 1".
1139 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1140 skip "Testing on UP system, the speed may be inaccurate."
# Create 50 directories and 50 files, then 50 files in each directory,
# all while fail_loc 0x1604 is set on the MDS.
1146 echo "Preparing another 50 * 50 files (with error) at $(date)."
1147 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1148 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1149 createmany -d $DIR/$tdir/d 50
1150 createmany -m $DIR/$tdir/f 50
1151 for ((i = 0; i < 50; i++)); do
1152 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# With fail_loc 0x160c set, run namespace LFSCK once and expect it to end
# in 'stopped' status.
1155 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1156 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1157 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1158 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1159 mdd.${MDT_DEV}.lfsck_namespace |
1160 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1162 error "(5) unexpected status"
1165 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1166 echo "Prepared at $(date)."
# Restart (without -r) under the first speed limit; the scan is expected
# to be in 'scanning-phase2' when sampled.
1168 local BASE_SPEED1=50
1170 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1173 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1174 [ "$STATUS" == "scanning-phase2" ] ||
1175 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
# Sample the reported phase-2 average speed and check the upper bound.
1177 local SPEED=$($SHOW_NAMESPACE |
1178 awk '/^average_speed_phase2/ { print $2 }')
1179 # There may be time error, normally it should be less than 2 seconds.
1180 # We allow another 20% schedule error.
1182 # MAX_MARGIN = 1.3 = 13 / 10
1183 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1184 RUN_TIME1 * 13 / 10))
1185 [ $SPEED -lt $MAX_SPEED ] || {
1187 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1188 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1191 # adjust speed limit
1192 local BASE_SPEED2=150
1194 do_facet $SINGLEMDS \
1195 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-sample; the bounds below weight both limits by their run times.
1198 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1199 # MIN_MARGIN = 0.7 = 7 / 10
1200 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1201 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1202 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# A too-low speed is reported through error_ignore (LU-5624) when the MDS
# backend is not ldiskfs.
1203 [ $SPEED -gt $MIN_SPEED ] || {
1204 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1205 error_ignore LU-5624 \
1206 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1209 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1213 # MAX_MARGIN = 1.3 = 13 / 10
1214 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1215 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1216 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1217 [ $SPEED -lt $MAX_SPEED ] || {
1219 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1220 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1221 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT and OST node (presumably 0 means
# "no limit" -- TODO confirm), then wait for the namespace scan to complete.
1224 do_nodes $(comma_list $(mdts_nodes)) \
1225 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1226 do_nodes $(comma_list $(osts_nodes)) \
1227 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1229 mdd.${MDT_DEV}.lfsck_namespace |
1230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1232 error "(11) unexpected status"
1235 run_test 9b "LFSCK speed control (2)"
# Body of the test registered below as 10, "System is available during
# LFSCK scanning": runs ordinary file operations from the client while a
# speed-limited namespace LFSCK is in 'scanning-phase1'.
#
# Only runs when the MDS backend is ldiskfs.
1239 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1240 skip "lookup(..)/linkea on ZFS issue" && return
# Even-numbered d<i>/f<i> entries are created with fail_loc 0x1603 set,
# odd-numbered ones with fail_loc 0x1604 set.
1244 echo "Preparing more files with error at $(date)."
1245 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1246 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1248 for ((i = 0; i < 1000; i = $((i+2)))); do
1249 mkdir -p $DIR/$tdir/d${i}
1250 touch $DIR/$tdir/f${i}
1251 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1254 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1255 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1257 for ((i = 1; i < 1000; i = $((i+2)))); do
1258 mkdir -p $DIR/$tdir/d${i}
1259 touch $DIR/$tdir/f${i}
1260 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1263 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1264 echo "Prepared at $(date)."
# Add a second hard link to f200, then remount the client.
1266 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
1268 umount_client $MOUNT
1269 mount_client $MOUNT || error "(3) Fail to start client!"
# Start namespace LFSCK with a speed limit of 100 and confirm it is in
# phase 1 before exercising the filesystem.
1271 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1274 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1275 [ "$STATUS" == "scanning-phase1" ] ||
1276 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Each of the following operations must succeed while LFSCK is running:
# recursive ls, create, mkdir, unlink, recursive remove, rename, hard link
# and symlink.
1278 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1280 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1282 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1284 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1286 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1288 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1290 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1292 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1293 error "(14) Fail to softlink!"
# LFSCK must still be in phase 1 after the operations above.
1295 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1296 [ "$STATUS" == "scanning-phase1" ] ||
1297 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 on every MDT and OST node and wait for the
# namespace scan to report 'completed'.
1299 do_nodes $(comma_list $(mdts_nodes)) \
1300 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1301 do_nodes $(comma_list $(osts_nodes)) \
1302 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1303 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1304 mdd.${MDT_DEV}.lfsck_namespace |
1305 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1307 error "(16) unexpected status"
1310 run_test 10 "System is available during LFSCK scanning"
# Remove O/<idx>/LAST_ID and O/<idx>/d0/0 from the locally mounted backing
# filesystem of ost<ost>.
# NOTE(review): 'ost' and 'idx' are not assigned in the lines visible here;
# presumably they come from the positional arguments (a caller in this file
# passes "1 0") -- confirm.
# Returns 1 if the local mount fails and 2 if the unmount fails.
1313 ost_remove_lastid() {
# Prefix used to run the rm command on the OST's node.
1316 local rcmd="do_facet ost${ost}"
1318 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1320 # step 1: local mount
1321 mount_fstype ost${ost} || return 1
1322 # step 2: remove the specified LAST_ID
1323 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1325 unmount_fstype ost${ost} || return 2
# Body of the test registered below as 11a, "LFSCK can rebuild lost
# last_id": deletes LAST_ID on ost1, then runs layout LFSCK on the OST and
# checks that the 'crashed_lastid' flag appears and is cleared again.
1329 check_mount_and_prep
1330 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1331 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# NOTE(review): ost1 is presumably stopped before this point; that step is
# not among the lines visible here.
1336 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1338 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1339 error "(2) Fail to start ost1"
# fail_loc 0x160e with fail_val=3 is set on ost1 before LFSCK starts so
# that the intermediate flag can be observed (presumably a delay, going by
# the define name -- TODO confirm).
1341 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1342 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1344 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1345 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# Wait for the flags field to show 'crashed_lastid'.
1347 wait_update_facet ost1 "$LCTL get_param -n \
1348 obdfilter.${OST_DEV}.lfsck_layout |
1349 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1351 error "(5) unexpected status"
# Clear the fail_loc and wait for the scan to complete.
1354 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1356 wait_update_facet ost1 "$LCTL get_param -n \
1357 obdfilter.${OST_DEV}.lfsck_layout |
1358 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1360 error "(6) unexpected status"
# After completion the flags field must be empty.
1363 echo "the LAST_ID(s) should have been rebuilt"
1364 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1365 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1367 run_test 11a "LFSCK can rebuild lost last_id"
# Body of the test registered below as 11b, "LFSCK can rebuild crashed
# last_id": creates objects while the on-disk LAST_ID update is skipped,
# verifies the on-disk value lags behind, then runs layout LFSCK on ost1
# and checks that last_id has caught up.
1370 check_mount_and_prep
1371 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1373 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1374 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1375 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than the count reported by precreated_ost_obj_count.
1377 local count=$(precreated_ost_obj_count 0 0)
1379 createmany -o $DIR/$tdir/f $((count + 32))
# Record the sequence and last id that mds1 reports for this OST via the
# osp prealloc_last_seq / prealloc_last_id parameters.
1381 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1382 local seq=$(do_facet mds1 $LCTL get_param -n \
1383 osp.${proc_path}.prealloc_last_seq)
1384 local id_used=$(do_facet mds1 $LCTL get_param -n \
1385 osp.${proc_path}.prealloc_last_id)
# Restart ost1 with fail_loc 0x215 set.
1387 umount_client $MOUNT
1388 stop ost1 || error "(1) Fail to stop ost1"
1390 #define OBD_FAIL_OST_ENOSPC 0x215
1391 do_facet ost1 $LCTL set_param fail_loc=0x215
1393 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1394 error "(2) Fail to start ost1"
# Poll (up to 60 iterations) until ost1 reports a last_id entry whose line
# matches $seq; the awk program takes the second ':'-separated field.
1396 for ((i = 0; i < 60; i++)); do
1397 id_ost1=$(do_facet ost1 \
1398 "$LCTL get_param -n obdfilter.$ost1_svc.last_id" |
1399 awk -F: "/$seq/ { print \$2 }")
1400 [ -n "$id_ost1" ] && break
1404 echo "the on-disk LAST_ID should be smaller than the expected one"
1405 [ $id_used -gt $id_ost1 ] ||
1406 error "(4) expect id_used '$id_used' > id_ost1 '$id_ost1'"
1408 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1409 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1411 wait_update_facet ost1 \
1412 "$LCTL get_param -n obdfilter.$ost1_svc.lfsck_layout |
1413 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1415 error "(6) unexpected status"
# Restart ost1 once more and re-read last_id.
1418 stop ost1 || error "(7) Fail to stop ost1"
1420 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1421 error "(8) Fail to start ost1"
1423 echo "the on-disk LAST_ID should have been rebuilt"
1424 # last_id may be larger than $id_used if objects were created/skipped
1425 wait_update_facet_cond ost1 \
1426 "$LCTL get_param -n obdfilter.$ost1_svc.last_id |
1427 awk -F: '/$seq/ { print \\\$2 }'" "-ge" "$id_used" 60 || {
1428 do_facet ost1 $LCTL get_param obdfilter.$ost1_svc.last_id
1429 error "(9) expect last_id >= id_used $seq:$id_used"
# Clear the fail_loc and stop all services.
1432 do_facet ost1 $LCTL set_param fail_loc=0
1433 stopall || error "(10) Fail to stopall"
1435 run_test 11b "LFSCK can rebuild crashed last_id"
# Body of the test registered below as 12a, "single command to trigger
# LFSCK on all devices": drives namespace and then layout LFSCK on all
# targets with one lctl command each (-A), through start / stop / restart.
# The trailing number passed to the wait_all_targets* helpers appears to be
# the step id used in their error messages -- TODO confirm.
1438 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# One directory per MDT (lfs mkdir -i k-1), each with 100 files.
1440 check_mount_and_prep
1441 for k in $(seq $MDSCOUNT); do
1442 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1443 createmany -o $DIR/$tdir/${k}/f 100 ||
1444 error "(0) Fail to create 100 files."
# Namespace LFSCK: start with -s 1, stop, restart with -s 0.
1447 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1448 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1449 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1451 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1452 wait_all_targets namespace scanning-phase1 3
1454 echo "Stop namespace LFSCK on all targets by single lctl command."
1455 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1456 error "(4) Fail to stop LFSCK on all devices!"
1458 echo "All the LFSCK targets should be in 'stopped' status."
1459 wait_all_targets_blocked namespace stopped 5
1461 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1462 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1463 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1465 echo "All the LFSCK targets should be in 'completed' status."
1466 wait_all_targets_blocked namespace completed 7
# Layout LFSCK: same sequence, run with full debug logging enabled.
1468 start_full_debug_logging
1470 echo "Start layout LFSCK on all targets by single command (-s 1)."
1471 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1472 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1474 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1475 wait_all_targets layout scanning-phase1 9
1477 echo "Stop layout LFSCK on all targets by single lctl command."
1478 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1479 error "(10) Fail to stop LFSCK on all devices!"
1481 echo "All the LFSCK targets should be in 'stopped' status."
1482 wait_all_targets_blocked layout stopped 11
# Additionally check every OST's own lfsck_layout status.
1484 for k in $(seq $OSTCOUNT); do
1485 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1486 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1487 awk '/^status/ { print $2 }')
1488 [ "$STATUS" == "stopped" ] ||
1489 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1492 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1493 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1494 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1496 echo "All the LFSCK targets should be in 'completed' status."
1497 wait_all_targets_blocked layout completed 14
1499 stop_full_debug_logging
1501 run_test 12a "single command to trigger LFSCK on all devices"
# Body of the test registered below as 12b, "auto detect Lustre device":
# lfsck_start must work without '-M' when '-A' is given, and must fail
# without '-M'/'-A' on a node that hosts more than one MDT device.
1504 check_mount_and_prep
1506 echo "Start LFSCK without '-M' specified."
1507 do_facet mds1 $LCTL lfsck_start -A -r ||
1508 error "(0) Fail to start LFSCK without '-M'"
1510 wait_all_targets_blocked namespace completed 1
1511 wait_all_targets_blocked layout completed 2
# Count the lines of 'lctl dl' on mds1 whose third column contains "mdt".
1513 local count=$(do_facet mds1 $LCTL dl |
1514 awk '{ print $3 }' | grep mdt | wc -l)
1515 if [ $count -gt 1 ]; then
1517 echo "Start layout LFSCK on the node with multipe targets,"
1518 echo "but not specify '-M'/'-A' option. Should get failure."
# Success of lfsck_start is the error case here; the trailing '|| true'
# keeps the expected failure from propagating a non-zero status.
1520 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1521 error "(3) Start layout LFSCK should fail" || true
1524 run_test 12b "auto detect Lustre device"
# Body of the test registered below as 13, "LFSCK can repair crashed
# lmm_oi": creates two files while fail_loc 0x160f is set, runs layout
# LFSCK, and expects repaired_others to be exactly 2.
1528 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1529 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1530 echo "MDT-object FID."
1533 check_mount_and_prep
# One file from createmany and one PFL file (f1) from 'lfs setstripe -E',
# both created under the failure injection.
1535 echo "Inject failure stub to simulate bad lmm_oi"
1536 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1537 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1538 createmany -o $DIR/$tdir/f 1
1539 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1540 error "(0) Fail to create PFL $DIR/$tdir/f1"
1541 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1543 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1544 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1546 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1547 mdd.${MDT_DEV}.lfsck_layout |
1548 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1550 error "(2) unexpected status"
# Both files created above are expected to be counted as repaired.
1553 local repaired=$($SHOW_LAYOUT |
1554 awk '/^repaired_others/ { print $2 }')
1555 [ $repaired -eq 2 ] ||
1556 error "(3) Fail to repair crashed lmm_oi: $repaired"
1558 run_test 13 "LFSCK can repair crashed lmm_oi"
# Body of the test registered below as 14a, "LFSCK can repair MDT-object
# with dangling LOV EA reference (1)": creates files while fail_loc 0x1610
# is set on ost1, runs layout LFSCK once without and once with '-c', and
# checks that the guard files only become stat-able after the '-c' run.
1562 echo "The OST-object referenced by the MDT-object should be there;"
1563 echo "otherwise, the LFSCK should re-create the missing OST-object."
1564 echo "without '--delay-create-ostobj' option."
1567 check_mount_and_prep
1568 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1570 echo "Inject failure stub to simulate dangling referenced MDT-object"
1571 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1572 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 16 files beyond the count reported by precreated_ost_obj_count,
# then a guard file, then 16 composite (-E) files and a second guard.
1573 local count=$(precreated_ost_obj_count 0 0)
1575 createmany -o $DIR/$tdir/f $((count + 16)) ||
1576 error "(0.1) Fail to create $DIR/$tdir/fx"
1577 touch $DIR/$tdir/guard0
1579 for ((i = 0; i < 16; i++)); do
1580 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1581 $DIR/$tdir/f_comp${i} ||
1582 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1584 touch $DIR/$tdir/guard1
1586 do_facet ost1 $LCTL set_param fail_loc=0
1588 start_full_debug_logging
1590 # exhaust other pre-created dangling cases
1591 count=$(precreated_ost_obj_count 0 0)
1592 createmany -o $DIR/$tdir/a $count ||
1593 error "(0.5) Fail to create $count files."
1595 echo "'ls' should fail because of dangling referenced MDT-object"
1596 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: LFSCK without '-c'.
1598 echo "Trigger layout LFSCK to find out dangling reference"
1599 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1601 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1602 mdd.${MDT_DEV}.lfsck_layout |
1603 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1605 error "(3) unexpected status"
# repaired_dangling is expected to be at least 32 already after this pass.
1608 local repaired=$($SHOW_LAYOUT |
1609 awk '/^repaired_dangling/ { print $2 }')
1610 [ $repaired -ge 32 ] ||
1611 error "(4) Fail to repair dangling reference: $repaired"
1613 echo "'stat' should fail because of not repair dangling by default"
1614 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1615 error "(5.1) stat should fail"
1616 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1617 error "(5.2) stat should fail"
# Second pass: LFSCK with '-c'.
1619 echo "Trigger layout LFSCK to repair dangling reference"
1620 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1622 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1623 mdd.${MDT_DEV}.lfsck_layout |
1624 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1626 error "(7) unexpected status"
1629 # There may be some async LFSCK updates in processing, wait for
1630 # a while until the target reparation has been done. LU-4970.
# Both guard files must eventually stat with Size 0.
1632 echo "'stat' should success after layout LFSCK repairing"
1633 wait_update_facet client "stat $DIR/$tdir/guard0 |
1634 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1635 stat $DIR/$tdir/guard0
1637 error "(8.1) unexpected size"
1640 wait_update_facet client "stat $DIR/$tdir/guard1 |
1641 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1642 stat $DIR/$tdir/guard1
1644 error "(8.2) unexpected size"
1647 repaired=$($SHOW_LAYOUT |
1648 awk '/^repaired_dangling/ { print $2 }')
1649 [ $repaired -ge 32 ] ||
1650 error "(9) Fail to repair dangling reference: $repaired"
1652 stop_full_debug_logging
# NOTE(review): the stop step announced by this message is not among the
# lines visible here; only the setupall that follows it is.
1654 echo "stopall to cleanup object cache"
1657 setupall > /dev/null
1659 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Body of the test registered below as 14b, "LFSCK can repair MDT-object
# with dangling LOV EA reference (2)": same scenario as the first dangling
# reference test, but LFSCK is started with '-o -d' and completion is
# awaited on all targets.
1663 echo "The OST-object referenced by the MDT-object should be there;"
1664 echo "otherwise, the LFSCK should re-create the missing OST-object."
1665 echo "with '--delay-create-ostobj' option."
1668 check_mount_and_prep
1669 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1671 echo "Inject failure stub to simulate dangling referenced MDT-object"
1672 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1673 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 31 files beyond the count reported by precreated_ost_obj_count,
# plus one guard file, while the failure injection is active.
1674 local count=$(precreated_ost_obj_count 0 0)
1676 createmany -o $DIR/$tdir/f $((count + 31))
1677 touch $DIR/$tdir/guard
1678 do_facet ost1 $LCTL set_param fail_loc=0
1680 start_full_debug_logging
1682 # exhaust other pre-created dangling cases
1683 count=$(precreated_ost_obj_count 0 0)
1684 createmany -o $DIR/$tdir/a $count ||
1685 error "(0) Fail to create $count files."
1687 echo "'ls' should fail because of dangling referenced MDT-object"
1688 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: without '-c'.
1690 echo "Trigger layout LFSCK to find out dangling reference"
1691 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1693 wait_all_targets_blocked layout completed 3
1695 local repaired=$($SHOW_LAYOUT |
1696 awk '/^repaired_dangling/ { print $2 }')
1697 [ $repaired -ge 32 ] ||
1698 error "(4) Fail to repair dangling reference: $repaired"
1700 echo "'stat' should fail because of not repair dangling by default"
1701 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second pass: with '-c' added.
1703 echo "Trigger layout LFSCK to repair dangling reference"
1704 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1706 wait_all_targets_blocked layout completed 7
1708 # There may be some async LFSCK updates in processing, wait for
1709 # a while until the target reparation has been done. LU-4970.
# The guard file must eventually stat with Size 0.
1711 echo "'stat' should success after layout LFSCK repairing"
1712 wait_update_facet client "stat $DIR/$tdir/guard |
1713 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1714 stat $DIR/$tdir/guard
1716 error "(8) unexpected size"
1719 repaired=$($SHOW_LAYOUT |
1720 awk '/^repaired_dangling/ { print $2 }')
1721 [ $repaired -ge 32 ] ||
1722 error "(9) Fail to repair dangling reference: $repaired"
1724 stop_full_debug_logging
# NOTE(review): the stop step announced by this message is not among the
# lines visible here; only the setupall that follows it is.
1726 echo "stopall to cleanup object cache"
1729 setupall > /dev/null
1731 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Body of the test registered below as 15a, "LFSCK can repair unmatched
# MDT-object/OST-object pairs (1)": writes files while fail_loc 0x1611 is
# set on all OST nodes, runs layout LFSCK, and expects at least 3 entries
# in repaired_unmatched_pair.
1735 echo "If the OST-object referenced by the MDT-object back points"
1736 echo "to some non-exist MDT-object, then the LFSCK should repair"
1737 echo "the OST-object to back point to the right MDT-object."
1740 check_mount_and_prep
1741 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1743 echo "Inject failure stub to make the OST-object to back point to"
1744 echo "non-exist MDT-object."
1745 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1747 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1748 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
# NOTE(review): the continuation line naming the setstripe target is not
# among the lines visible here; the error text indicates $DIR/$tdir/f1.
1749 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1751 error "(0) Fail to create PFL $DIR/$tdir/f1"
1752 # 'dd' will trigger punch RPC firstly on every OST-objects.
1753 # So even though some OST-object will not be write by 'dd',
1754 # as long as it is allocated (may be NOT allocated in pfl_3b)
1755 # its layout information will be set also.
1756 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1757 cancel_lru_locks osc
1758 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1760 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1761 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1763 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1764 mdd.${MDT_DEV}.lfsck_layout |
1765 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1767 error "(2) unexpected status"
# Check the repair counter reported by LFSCK.
1770 local repaired=$($SHOW_LAYOUT |
1771 awk '/^repaired_unmatched_pair/ { print $2 }')
1772 [ $repaired -ge 3 ] ||
1773 error "(3) Fail to repair unmatched pair: $repaired"
1775 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Body of the test registered below as 15b, "LFSCK can repair unmatched
# MDT-object/OST-object pairs (2)": writes files while fail_loc 0x1612 is
# set on all OST nodes, runs layout LFSCK, and expects exactly 4 entries in
# repaired_unmatched_pair.
1779 echo "If the OST-object referenced by the MDT-object back points"
1780 echo "to other MDT-object that doesn't recognize the OST-object,"
1781 echo "then the LFSCK should repair it to back point to the right"
1782 echo "MDT-object (the first one)."
# A guard file is written before the failure injection is enabled.
1785 check_mount_and_prep
1786 mkdir -p $DIR/$tdir/0
1787 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1788 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1789 cancel_lru_locks osc
1791 echo "Inject failure stub to make the OST-object to back point to"
1792 echo "other MDT-object"
# NOTE(review): the initial value of 'stripes' is not among the lines
# visible here; it is raised to 2 when at least two OSTs exist.
1795 [ $OSTCOUNT -ge 2 ] && stripes=2
1797 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1798 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1799 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
# NOTE(review): the continuation line naming the setstripe target is not
# among the lines visible here; the error text indicates $DIR/$tdir/f1.
1800 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1802 error "(0) Fail to create PFL $DIR/$tdir/f1"
1803 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1804 cancel_lru_locks osc
1805 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1807 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1808 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1810 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1811 mdd.${MDT_DEV}.lfsck_layout |
1812 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1814 error "(2) unexpected status"
# Check the repair counter reported by LFSCK.
1817 local repaired=$($SHOW_LAYOUT |
1818 awk '/^repaired_unmatched_pair/ { print $2 }')
1819 [ $repaired -eq 4 ] ||
1820 error "(3) Fail to repair unmatched pair: $repaired"
1822 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Body of the test registered below as 15c, "LFSCK can repair unmatched
# MDT-object/OST-object pairs (3)": races layout LFSCK against a delayed
# directory migration and checks the resulting repair counters.
# Needs at least two MDTs and an MDS older than 2.7.55 (see LU-6437).
1825 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1827 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1828 skip "Skip the test after 2.7.55 see LU-6437" && return
1831 echo "According to current metadata migration implementation,"
1832 echo "before the old MDT-object is removed, both the new MDT-object"
1833 echo "and old MDT-object will reference the same LOV layout. Then if"
1834 echo "the layout LFSCK finds the new MDT-object by race, it will"
1835 echo "regard related OST-object(s) as multiple referenced case, and"
1836 echo "will try to create new OST-object(s) for the new MDT-object."
1837 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1838 echo "MDT-object before confirm the multiple referenced case."
# Directory a1 is created with 'lfs mkdir -i 1' and holds one 1M file.
1841 check_mount_and_prep
1842 $LFS mkdir -i 1 $DIR/$tdir/a1
1843 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1844 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1845 cancel_lru_locks osc
1847 echo "Inject failure stub on MDT1 to delay the migration"
1849 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1850 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# The migration is started in the background so LFSCK can run against it.
1851 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1852 $LFS migrate -m 0 $DIR/$tdir/a1 &
1855 echo "Trigger layout LFSCK to race with the migration"
1856 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1858 wait_all_targets_blocked layout completed 2
1860 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Expect exactly one unmatched-pair repair and no multiple-reference repair.
1861 local repaired=$($SHOW_LAYOUT |
1862 awk '/^repaired_unmatched_pair/ { print $2 }')
1863 [ $repaired -eq 1 ] ||
1864 error "(3) Fail to repair unmatched pair: $repaired"
1866 repaired=$($SHOW_LAYOUT |
1867 awk '/^repaired_multiple_referenced/ { print $2 }')
1868 [ $repaired -eq 0 ] ||
1869 error "(4) Unexpectedly repaird multiple references: $repaired"
1871 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Body of the test registered below as 16, "LFSCK can repair inconsistent
# MDT-object/OST-object owner": changes a file's owner while fail_loc
# 0x1613 is set on the MDS, runs layout LFSCK, and expects exactly one
# repaired_inconsistent_owner.
1875 echo "If the OST-object's owner information does not match the owner"
1876 echo "information stored in the MDT-object, then the LFSCK trust the"
1877 echo "MDT-object and update the OST-object's owner information."
1880 check_mount_and_prep
1881 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1882 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1883 cancel_lru_locks osc
# 100 files are created as $RUNAS under d1 and never written to.
# NOTE(review): the creation of $DIR/$tdir/d1 is not among the lines
# visible here.
1885 # created but no setattr or write to the file.
1887 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
1888 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
1890 echo "Inject failure stub to skip OST-object owner changing"
1891 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1892 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1893 chown 1.1 $DIR/$tdir/f0
1894 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1896 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1899 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1901 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1902 mdd.${MDT_DEV}.lfsck_layout |
1903 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1905 error "(2) unexpected status"
# Only f0 is expected to be counted as repaired.
1908 local repaired=$($SHOW_LAYOUT |
1909 awk '/^repaired_inconsistent_owner/ { print $2 }')
1910 [ $repaired -eq 1 ] ||
1911 error "(3) Fail to repair inconsistent owner: $repaired"
1913 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Body of the test registered below as 17, "LFSCK can repair multiple
# references": creates f0 and f1 while fail_loc 0x1614 is set on the MDS
# so that they share OST-objects with 'guard', runs layout LFSCK, and then
# verifies that writing f0/f1 no longer changes the size of 'guard'.
1917 echo "If more than one MDT-objects reference the same OST-object,"
1918 echo "and the OST-object only recognizes one MDT-object, then the"
1919 echo "LFSCK should create new OST-objects for such non-recognized"
1923 check_mount_and_prep
1924 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1926 echo "Inject failure stub to make two MDT-objects to refernce"
1927 echo "the OST-object"
1929 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1930 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# 'guard' gets 1M of data; mdc and osc locks are dropped after each step.
1931 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1932 cancel_lru_locks mdc
1933 cancel_lru_locks osc
1935 createmany -o $DIR/$tdir/f 1
1936 cancel_lru_locks mdc
1937 cancel_lru_locks osc
# NOTE(review): the continuation line naming the setstripe target is not
# among the lines visible here; the error text indicates $DIR/$tdir/f1.
1939 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
1941 error "(0) Fail to create PFL $DIR/$tdir/f1"
1942 cancel_lru_locks mdc
1943 cancel_lru_locks osc
1944 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Before repair, f0 and f1 both report the 1M size written to 'guard'.
1946 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1947 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
1948 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1949 [ $size -eq 1048576 ] ||
1950 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
1952 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
1953 [ $size -eq 1048576 ] ||
1954 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
1956 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1959 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1961 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1962 mdd.${MDT_DEV}.lfsck_layout |
1963 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1965 error "(3) unexpected status"
# Two repairs are expected, one per file created under the injection.
1968 local repaired=$($SHOW_LAYOUT |
1969 awk '/^repaired_multiple_referenced/ { print $2 }')
1970 [ $repaired -eq 2 ] ||
1971 error "(4) Fail to repair multiple references: $repaired"
# After repair, writing 2M to f0 and to f1 must leave 'guard' at 1M.
1973 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1974 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1975 error "(5) Fail to write f0."
1976 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1977 [ $size -eq 1048576 ] ||
1978 error "(6) guard size should be 1048576, but got $size"
1980 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
1981 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
1982 error "(7) Fail to write f1."
1983 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1984 [ $size -eq 1048576 ] ||
1985 error "(8) guard size should be 1048576, but got $size"
1987 run_test 17 "LFSCK can repair multiple references"
# Top-level statement: add 'cache' to the local lctl debug setting,
# discarding the command's standard output.
1989 $LCTL set_param debug=+cache > /dev/null
# Body of the test registered below as 18a.
# Scenario: files are written, then chown'ed while fail_loc 0x1615
# (OBD_FAIL_LFSCK_LOST_STRIPE) is set on the MDS so that, per the echo below,
# the MDT-objects lose their layout EA. A layout LFSCK started with "-r -o"
# must then finish as 'completed' on every MDT/OST, report the expected
# repaired_orphan counts, and the file sizes must match the saved values.
1993 echo "The target MDT-object is there, but related stripe information"
1994 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1995 echo "layout EA entries."
1998 check_mount_and_prep
# File 1: directory a1 created with "mkdir -i 0", stripe count 1 at index 0,
# then 2 x 1M of zeros written into a1/f1.
1999 $LFS mkdir -i 0 $DIR/$tdir/a1
2000 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2001 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# Field 6 of "ls -il" is the size column; keep it for the later comparisons.
2003 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2005 $LFS path2fid $DIR/$tdir/a1/f1
2006 $LFS getstripe $DIR/$tdir/a1/f1
# File 2 (only with at least two MDTs): directory a2 created with
# "mkdir -i 1", stripe count 2. It is written with the same dd size as f1,
# which is why the checks below reuse saved_size1 for it.
2008 if [ $MDSCOUNT -ge 2 ]; then
2009 $LFS mkdir -i 1 $DIR/$tdir/a2
2010 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2011 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2012 $LFS path2fid $DIR/$tdir/a2/f2
2013 $LFS getstripe $DIR/$tdir/a2/f2
# File 3: composite (PFL) file with two components (-E 1M and -E -1).
2016 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2017 error "(0) Fail to create PFL $DIR/$tdir/f3"
2019 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2021 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2023 $LFS path2fid $DIR/$tdir/f3
2024 $LFS getstripe $DIR/$tdir/f3
2026 cancel_lru_locks osc
# Arm the failure on each MDS, then chown the files while it is armed.
2028 echo "Inject failure, to make the MDT-object lost its layout EA"
2029 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2030 do_facet mds1 $LCTL set_param fail_loc=0x1615
2031 chown 1.1 $DIR/$tdir/a1/f1
2033 if [ $MDSCOUNT -ge 2 ]; then
2034 do_facet mds2 $LCTL set_param fail_loc=0x1615
2035 chown 1.1 $DIR/$tdir/a2/f2
2038 chown 1.1 $DIR/$tdir/f3
# Disarm the failure injection again.
2043 do_facet mds1 $LCTL set_param fail_loc=0
2044 if [ $MDSCOUNT -ge 2 ]; then
2045 do_facet mds2 $LCTL set_param fail_loc=0
# Cancel mdc/osc LRU locks before re-reading sizes (presumably so the client
# does not answer from cached attributes).
2048 cancel_lru_locks mdc
2049 cancel_lru_locks osc
# Before LFSCK: every affected file must report a size different from the
# one saved above.
2051 echo "The file size should be incorrect since layout EA is lost"
2052 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2053 [ "$cur_size" != "$saved_size1" ] ||
2054 error "(1) Expect incorrect file1 size"
2056 if [ $MDSCOUNT -ge 2 ]; then
2057 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2058 [ "$cur_size" != "$saved_size1" ] ||
2059 error "(2) Expect incorrect file2 size"
2062 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2063 [ "$cur_size" != "$saved_size2" ] ||
2064 error "(1.2) Expect incorrect file3 size"
2066 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2067 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
# Wait, bounded by $LTIME, until every MDT reports status 'completed'.
2069 for k in $(seq $MDSCOUNT); do
2070 # The LFSCK status query internal is 30 seconds. For the case
2071 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2072 # time to guarantee the status sync up.
2073 wait_update_facet mds${k} "$LCTL get_param -n \
2074 mdd.$(facet_svc mds${k}).lfsck_layout |
2075 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2076 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are checked once (no waiting) for status 'completed'.
2079 for k in $(seq $OSTCOUNT); do
2080 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2081 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2082 awk '/^status/ { print $2 }')
2083 [ "$cur_status" == "completed" ] ||
2084 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# repaired_orphan on mds1 must be 3 (presumably one for f1 plus two for the
# two components of f3 - confirm); on mds2 it must be 2.
2087 local repaired=$(do_facet mds1 $LCTL get_param -n \
2088 mdd.$(facet_svc mds1).lfsck_layout |
2089 awk '/^repaired_orphan/ { print $2 }')
2090 [ $repaired -eq 3 ] ||
2091 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2093 if [ $MDSCOUNT -ge 2 ]; then
2094 repaired=$(do_facet mds2 $LCTL get_param -n \
2095 mdd.$(facet_svc mds2).lfsck_layout |
2096 awk '/^repaired_orphan/ { print $2 }')
2097 [ $repaired -eq 2 ] ||
2098 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print FID and layout of each file after the repair (diagnostic output).
2101 $LFS path2fid $DIR/$tdir/a1/f1
2102 $LFS getstripe $DIR/$tdir/a1/f1
2104 if [ $MDSCOUNT -ge 2 ]; then
2105 $LFS path2fid $DIR/$tdir/a2/f2
2106 $LFS getstripe $DIR/$tdir/a2/f2
2109 $LFS path2fid $DIR/$tdir/f3
2110 $LFS getstripe $DIR/$tdir/f3
# After LFSCK: sizes must equal the values saved before the injection.
2112 echo "The file size should be correct after layout LFSCK scanning"
2113 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2114 [ "$cur_size" == "$saved_size1" ] ||
2115 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2117 if [ $MDSCOUNT -ge 2 ]; then
2118 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2119 [ "$cur_size" == "$saved_size1" ] ||
2120 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is about f3, so the message names file3.
2123 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2124 [ "$cur_size" == "$saved_size2" ] ||
2125 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2127 run_test 18a "Find out orphan OST-object and repair it (1)"
# Body of the test registered below as 18b.
# Scenario: files are removed while fail_loc 0x1616
# (OBD_FAIL_LFSCK_LOST_MDTOBJ) is set on the MDS, simulating lost
# MDT-objects. A --dryrun layout LFSCK must only count the orphans and leave
# .lustre/lost+found/* empty; a real run must then repair them, after which
# the recreated "<fid>-R-0" entries are moved back and their sizes checked.
# Skipped when FILESET is set.
2130 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2133 echo "The target MDT-object is lost. The LFSCK should re-create the"
2134 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2135 echo "can move it back to normal namespace manually."
2138 check_mount_and_prep
# File 1: a1/f1, stripe count 1 at index 0; size and FID are remembered.
2139 $LFS mkdir -i 0 $DIR/$tdir/a1
2140 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2141 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2142 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2143 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2145 $LFS getstripe $DIR/$tdir/a1/f1
# File 2 (only with at least two MDTs): a2/f2, written with the same dd size
# as f1. fid2 is declared local like fid1/fid3 so it does not leak into the
# caller's scope.
2147 if [ $MDSCOUNT -ge 2 ]; then
2148 $LFS mkdir -i 1 $DIR/$tdir/a2
2149 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2150 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2151 local fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2153 $LFS getstripe $DIR/$tdir/a2/f2
# File 3: composite (PFL) file with two components (-E 1M and -E -1).
2156 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2157 error "(0) Fail to create PFL $DIR/$tdir/f3"
2159 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2161 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2162 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2164 $LFS getstripe $DIR/$tdir/f3
2166 cancel_lru_locks osc
# Arm the failure on each MDS and remove the files while it is armed.
2168 echo "Inject failure, to simulate the case of missing the MDT-object"
2169 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2170 do_facet mds1 $LCTL set_param fail_loc=0x1616
2171 rm -f $DIR/$tdir/a1/f1
2173 if [ $MDSCOUNT -ge 2 ]; then
2174 do_facet mds2 $LCTL set_param fail_loc=0x1616
2175 rm -f $DIR/$tdir/a2/f2
# Disarm the failure injection again.
2183 do_facet mds1 $LCTL set_param fail_loc=0
2184 if [ $MDSCOUNT -ge 2 ]; then
2185 do_facet mds2 $LCTL set_param fail_loc=0
2188 cancel_lru_locks mdc
2189 cancel_lru_locks osc
2191 # dryrun mode only check orphans, not repair
2192 echo "Trigger layout LFSCK --dryrun to find out orphan OST-object"
2193 $START_LAYOUT --dryrun -o -r ||
2194 error "Fail to start layout LFSCK in dryrun mode"
2195 wait_all_targets_blocked layout completed 2
# The "param" line of the layout status must list exactly these flags.
2197 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
2198 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
2199 error "Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# In dryrun mode the orphans are reported via inconsistent_orphan.
2201 local orphans=$(do_facet mds1 $LCTL get_param -n \
2202 mdd.$(facet_svc mds1).lfsck_layout |
2203 awk '/^inconsistent_orphan/ { print $2 }')
2204 [ $orphans -eq 3 ] ||
2205 error "Expect 3 found on mds1, but got: $orphans"
2207 # orphan parents should not be created
2209 for subdir in $MOUNT/.lustre/lost+found/*; do
2210 [ ! "$(ls -A $subdir)" ] || error "$subdir not empty"
# Real (non-dryrun) run.
2213 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2214 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait, bounded by $LTIME, until every MDT reports status 'completed'.
2216 for k in $(seq $MDSCOUNT); do
2217 # The LFSCK status query internal is 30 seconds. For the case
2218 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2219 # time to guarantee the status sync up.
2220 wait_update_facet mds${k} "$LCTL get_param -n \
2221 mdd.$(facet_svc mds${k}).lfsck_layout |
2222 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2223 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are checked once (no waiting) for status 'completed'.
2226 for k in $(seq $OSTCOUNT); do
2227 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2228 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2229 awk '/^status/ { print $2 }')
2230 [ "$cur_status" == "completed" ] ||
2231 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# repaired_orphan must be 3 on mds1 and, with two or more MDTs, 2 on mds2.
2234 local repaired=$(do_facet mds1 $LCTL get_param -n \
2235 mdd.$(facet_svc mds1).lfsck_layout |
2236 awk '/^repaired_orphan/ { print $2 }')
2237 [ $repaired -eq 3 ] ||
2238 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2240 if [ $MDSCOUNT -ge 2 ]; then
2241 repaired=$(do_facet mds2 $LCTL get_param -n \
2242 mdd.$(facet_svc mds2).lfsck_layout |
2243 awk '/^repaired_orphan/ { print $2 }')
2244 [ $repaired -eq 2 ] ||
2245 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# The recreated objects are expected under lost+found/MDTxxxx as
# "<original fid>-R-0"; move each back to its original path.
2248 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2249 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2250 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2252 if [ $MDSCOUNT -ge 2 ]; then
2253 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2254 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Distinct label so a failure here is not confused with check (5) above.
2257 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2258 error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
# Print FID and layout of each restored file (diagnostic output).
2260 $LFS path2fid $DIR/$tdir/a1/f1
2261 $LFS getstripe $DIR/$tdir/a1/f1
2263 if [ $MDSCOUNT -ge 2 ]; then
2264 $LFS path2fid $DIR/$tdir/a2/f2
2265 $LFS getstripe $DIR/$tdir/a2/f2
2268 $LFS path2fid $DIR/$tdir/f3
2269 $LFS getstripe $DIR/$tdir/f3
# Sizes of the restored files must equal the values saved at creation.
2271 echo "The file size should be correct after layout LFSCK scanning"
2272 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2273 [ "$cur_size" == "$saved_size1" ] ||
2274 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2276 if [ $MDSCOUNT -ge 2 ]; then
2277 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2278 [ "$cur_size" == "$saved_size1" ] ||
2279 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is about f3, so the message names file3.
2282 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2283 [ "$cur_size" == "$saved_size2" ] ||
2284 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2286 run_test 18b "Find out orphan OST-object and repair it (2)"
# Body of the test registered below as 18c.
# Scenario: files are written while fail_loc 0x1617 (OBD_FAIL_LFSCK_NOPFID)
# is set on all OST nodes, then removed while fail_loc 0x1616
# (OBD_FAIL_LFSCK_LOST_MDTOBJ) is set on the MDS. After a layout LFSCK the
# repaired orphans must be accounted on mds1 only, with "*-N-*" stubs under
# .lustre/lost+found/MDT0000 and none under MDT0001.
# Skipped when FILESET is set.
2289 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2292 echo "The target MDT-object is lost, and the OST-object FID is missing."
2293 echo "The LFSCK should re-create the MDT-object with new FID under the "
2294 echo "directory .lustre/lost+found/MDTxxxx."
2297 check_mount_and_prep
2298 $LFS mkdir -i 0 $DIR/$tdir/a1
2299 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
# The OST-side failure is armed before any data is written.
2301 echo "Inject failure, to simulate the case of missing parent FID"
2302 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2303 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2305 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2306 $LFS getstripe $DIR/$tdir/a1/f1
# Second file only with at least two MDTs: directory on MDT index 1, but the
# stripe is still placed at OST index 0.
2308 if [ $MDSCOUNT -ge 2 ]; then
2309 $LFS mkdir -i 1 $DIR/$tdir/a2
2310 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2311 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2312 $LFS getstripe $DIR/$tdir/a2/f2
# Third file: composite (PFL) file with two components.
2315 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2316 error "(0) Fail to create PFL $DIR/$tdir/f3"
2318 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2319 $LFS getstripe $DIR/$tdir/f3
# Disarm the OST-side failure once the data has been written.
2321 cancel_lru_locks osc
2322 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Arm the MDS-side failure and remove the files while it is armed.
2324 echo "Inject failure, to simulate the case of missing the MDT-object"
2325 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2326 do_facet mds1 $LCTL set_param fail_loc=0x1616
2327 rm -f $DIR/$tdir/a1/f1
2329 if [ $MDSCOUNT -ge 2 ]; then
2330 do_facet mds2 $LCTL set_param fail_loc=0x1616
2331 rm -f $DIR/$tdir/a2/f2
2339 do_facet mds1 $LCTL set_param fail_loc=0
2340 if [ $MDSCOUNT -ge 2 ]; then
2341 do_facet mds2 $LCTL set_param fail_loc=0
2344 cancel_lru_locks mdc
2345 cancel_lru_locks osc
2347 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2348 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait, bounded by $LTIME, until every MDT reports status 'completed'.
2350 for k in $(seq $MDSCOUNT); do
2351 # The LFSCK status query internal is 30 seconds. For the case
2352 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2353 # time to guarantee the status sync up.
2354 wait_update_facet mds${k} "$LCTL get_param -n \
2355 mdd.$(facet_svc mds${k}).lfsck_layout |
2356 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2357 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are checked once (no waiting) for status 'completed'.
2360 for k in $(seq $OSTCOUNT); do
2361 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2362 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2363 awk '/^status/ { print $2 }')
2364 [ "$cur_status" == "completed" ] ||
2365 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# $expected is compared against mds1's repaired_orphan counter below.
2368 if [ $MDSCOUNT -ge 2 ]; then
2374 local repaired=$(do_facet mds1 $LCTL get_param -n \
2375 mdd.$(facet_svc mds1).lfsck_layout |
2376 awk '/^repaired_orphan/ { print $2 }')
2377 [ $repaired -eq $expected ] ||
2378 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# mds2 must not have repaired any orphan.
2380 if [ $MDSCOUNT -ge 2 ]; then
2381 repaired=$(do_facet mds2 $LCTL get_param -n \
2382 mdd.$(facet_svc mds2).lfsck_layout |
2383 awk '/^repaired_orphan/ { print $2 }')
2384 [ $repaired -eq 0 ] ||
2385 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2388 ls -ail $MOUNT/.lustre/lost+found/
# The -name pattern is quoted so the shell hands it to find unexpanded; left
# bare, a matching file name in the current directory would replace it.
2390 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2391 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2392 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
2394 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2397 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2398 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2399 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Declared local here as well, since the assignment above sits inside the if.
2401 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
2402 [ ! -z "$cname" ] ||
2403 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2405 run_test 18c "Find out orphan OST-object and repair it (3)"
# Body of the test registered below as 18d.
# Scenario: with fail_loc 0x1618 (OBD_FAIL_LFSCK_CHANGE_STRIPE) set on
# $SINGLEMDS, f2 and f4 are chown'ed and f1 and f3 removed; per the echoes
# below this leaves f2/f4 with dangling references while their old
# OST-objects survive as orphans. A layout LFSCK started with "-r -o -c -d"
# must report 2 repaired orphans and 0 repaired dangling, and f2/f4 must
# have their original sizes again.
2409 echo "The target MDT-object layout EA is corrupted, but the right"
2410 echo "OST-object is still alive as orphan. The layout LFSCK will"
2411 echo "not create new OST-object to occupy such slot."
2414 check_mount_and_prep
# f1/f2/f3 are created under a1 (stripe count 1 at index 0); f4 is created
# as a composite (PFL) file before being written.
2416 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2417 echo "guard" > $DIR/$tdir/a1/f1
2418 echo "foo" > $DIR/$tdir/a1/f2
2420 echo "guard" > $DIR/$tdir/a1/f3
2421 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2422 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2423 echo "foo" > $DIR/$tdir/a1/f4
# Field 6 of "ls -il" is the size column; remember the sizes of f2 and f4.
2425 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2426 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Print FID and layout of all four files (diagnostic output).
2427 $LFS path2fid $DIR/$tdir/a1/f1
2428 $LFS getstripe $DIR/$tdir/a1/f1
2429 $LFS path2fid $DIR/$tdir/a1/f2
2430 $LFS getstripe $DIR/$tdir/a1/f2
2431 $LFS path2fid $DIR/$tdir/a1/f3
2432 $LFS getstripe $DIR/$tdir/a1/f3
2433 $LFS path2fid $DIR/$tdir/a1/f4
2434 $LFS getstripe $DIR/$tdir/a1/f4
2435 cancel_lru_locks osc
2437 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2438 echo "to reference the same OST-object (which is f1's OST-obejct)."
2439 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2440 echo "dangling reference case, but f2's old OST-object is there."
2442 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2443 echo "to reference the same OST-object (which is f3's OST-obejct)."
2444 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2445 echo "dangling reference case, but f4's old OST-object is there."
# chown f2/f4 and remove f1/f3 while the failure is armed, then disarm it.
2448 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2449 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2450 chown 1.1 $DIR/$tdir/a1/f2
2451 chown 1.1 $DIR/$tdir/a1/f4
2452 rm -f $DIR/$tdir/a1/f1
2453 rm -f $DIR/$tdir/a1/f3
2456 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2458 echo "stopall to cleanup object cache"
2461 setupall > /dev/null
2463 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2464 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
# Wait, bounded by $LTIME, until every MDT reports status 'completed'.
2466 for k in $(seq $MDSCOUNT); do
2467 # The LFSCK status query internal is 30 seconds. For the case
2468 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2469 # time to guarantee the status sync up.
2470 wait_update_facet mds${k} "$LCTL get_param -n \
2471 mdd.$(facet_svc mds${k}).lfsck_layout |
2472 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2473 error "(3) MDS${k} is not the expected 'completed'"
# The OSTs are checked once (no waiting) for status 'completed'.
2476 for k in $(seq $OSTCOUNT); do
2477 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2478 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2479 awk '/^status/ { print $2 }')
2480 [ "$cur_status" == "completed" ] ||
2481 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Two orphans repaired ...
2484 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2485 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2486 awk '/^repaired_orphan/ { print $2 }')
2487 [ $repaired -eq 2 ] ||
2488 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
# ... and no dangling reference counted as repaired.
2490 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2491 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2492 awk '/^repaired_dangling/ { print $2 }')
2493 [ $repaired -eq 0 ] ||
2494 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
# f2 and f4 must report the sizes saved before the injection.
2496 echo "The file size should be correct after layout LFSCK scanning"
2497 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2498 [ "$cur_size" == "$saved_size1" ] ||
2499 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2501 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2502 [ "$cur_size" == "$saved_size2" ] ||
2503 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# Dump content, FID and layout of f2/f4 (diagnostic output, not asserted).
2505 echo "The LFSCK should find back the original data."
2506 cat $DIR/$tdir/a1/f2
2507 $LFS path2fid $DIR/$tdir/a1/f2
2508 $LFS getstripe $DIR/$tdir/a1/f2
2509 cat $DIR/$tdir/a1/f4
2510 $LFS path2fid $DIR/$tdir/a1/f4
2511 $LFS getstripe $DIR/$tdir/a1/f4
2513 run_test 18d "Find out orphan OST-object and repair it (4)"
# Body of the test registered below as 18e.
# Scenario: same corruption as 18d (fail_loc 0x1618), but the layout LFSCK is
# started with "-r -o -c" while fail_loc 0x1602 (OBD_FAIL_LFSCK_DELAY3,
# fail_val=10) is set, and f2/f4 are appended to once mds1 reports
# 'scanning-phase2'. The LFSCK must then report 2 repaired orphans and leave
# exactly two "*-C-*" stub files under .lustre/lost+found/MDT0000 whose
# sizes match the saved f2/f4 sizes.
# Skipped when FILESET is set.
2516 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2519 echo "The target MDT-object layout EA slot is occpuied by some new"
2520 echo "created OST-object when repair dangling reference case. Such"
2521 echo "conflict OST-object has been modified by others. To keep the"
2522 echo "new data, the LFSCK will create a new file to refernece this"
2523 echo "old orphan OST-object."
2526 check_mount_and_prep
# f1/f2/f3 are created under a1 (stripe count 1 at index 0); f4 is created
# as a composite (PFL) file before being written.
2528 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2529 echo "guard" > $DIR/$tdir/a1/f1
2530 echo "foo" > $DIR/$tdir/a1/f2
2532 echo "guard" > $DIR/$tdir/a1/f3
2533 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2534 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2535 echo "foo" > $DIR/$tdir/a1/f4
# Field 6 of "ls -il" is the size column; remember the sizes of f2 and f4.
2537 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2538 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Print FID and layout of all four files (diagnostic output).
2540 $LFS path2fid $DIR/$tdir/a1/f1
2541 $LFS getstripe $DIR/$tdir/a1/f1
2542 $LFS path2fid $DIR/$tdir/a1/f2
2543 $LFS getstripe $DIR/$tdir/a1/f2
2544 $LFS path2fid $DIR/$tdir/a1/f3
2545 $LFS getstripe $DIR/$tdir/a1/f3
2546 $LFS path2fid $DIR/$tdir/a1/f4
2547 $LFS getstripe $DIR/$tdir/a1/f4
2548 cancel_lru_locks osc
2550 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2551 echo "to reference the same OST-object (which is f1's OST-obejct)."
2552 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2553 echo "dangling reference case, but f2's old OST-object is there."
2555 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2556 echo "to reference the same OST-object (which is f3's OST-obejct)."
2557 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2558 echo "dangling reference case, but f4's old OST-object is there."
# chown f2/f4 and remove f1/f3 while the failure is armed, then disarm it.
2561 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2562 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2563 chown 1.1 $DIR/$tdir/a1/f2
2564 chown 1.1 $DIR/$tdir/a1/f4
2565 rm -f $DIR/$tdir/a1/f1
2566 rm -f $DIR/$tdir/a1/f3
2569 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2571 echo "stopall to cleanup object cache"
2574 setupall > /dev/null
# Arm the delay failure before starting LFSCK (presumably to hold the scan
# long enough for the writes below - confirm).
2576 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2577 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2579 start_full_debug_logging
2581 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2582 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait, bounded by $LTIME, until mds1 reports status 'scanning-phase2'.
2584 wait_update_facet mds1 "$LCTL get_param -n \
2585 mdd.$(facet_svc mds1).lfsck_layout |
2586 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2587 error "(3) MDS1 is not the expected 'scanning-phase2'"
2589 # to guarantee all updates are synced.
2593 echo "Write new data to f2/f4 to modify the new created OST-object."
2594 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2595 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
# Release the delay so the scan can finish.
2597 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Wait, bounded by $LTIME, until every MDT reports status 'completed'.
2599 for k in $(seq $MDSCOUNT); do
2600 # The LFSCK status query internal is 30 seconds. For the case
2601 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2602 # time to guarantee the status sync up.
2603 wait_update_facet mds${k} "$LCTL get_param -n \
2604 mdd.$(facet_svc mds${k}).lfsck_layout |
2605 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2606 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are checked once (no waiting) for status 'completed'.
2609 for k in $(seq $OSTCOUNT); do
2610 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2611 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2612 awk '/^status/ { print $2 }')
2613 [ "$cur_status" == "completed" ] ||
2614 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2617 stop_full_debug_logging
2619 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2620 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2621 awk '/^repaired_orphan/ { print $2 }')
2622 [ $repaired -eq 2 ] ||
2623 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2625 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2626 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2627 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Exactly two "*-C-*" entries are expected; list them when the count is off.
2629 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2630 if [ $count -ne 2 ]; then
2631 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2632 error "(8) Expect 2 stubs under lost+found, but got $count"
# First stub: its size must equal one of the two saved sizes.
# The -name pattern is quoted so the shell hands it to find unexpanded; left
# bare, a matching file name in the current directory would replace it.
2635 echo "The stub file should keep the original f2 or f4 data"
2636 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | head -n 1)
2637 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2638 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2639 error "(9) Got unexpected $cur_size"
2642 $LFS path2fid $cname
2643 $LFS getstripe $cname
# Second stub: same size check on the last entry find returns.
2645 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | tail -n 1)
2646 cur_size=$(ls -il $cname | awk '{ print $6 }')
2647 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2648 error "(10) Got unexpected $cur_size"
2651 $LFS path2fid $cname
2652 $LFS getstripe $cname
# Dump content, FID and layout of f2/f4 (diagnostic output, not asserted).
2654 echo "The f2/f4 should contains new data."
2655 cat $DIR/$tdir/a1/f2
2656 $LFS path2fid $DIR/$tdir/a1/f2
2657 $LFS getstripe $DIR/$tdir/a1/f2
2658 cat $DIR/$tdir/a1/f4
2659 $LFS path2fid $DIR/$tdir/a1/f4
2660 $LFS getstripe $DIR/$tdir/a1/f4
2662 run_test 18e "Find out orphan OST-object and repair it (5)"
# Body of the test registered below as 18f.
# Scenario: files are removed while fail_loc 0x1616
# (OBD_FAIL_LFSCK_LOST_MDTOBJ) is set, then the layout LFSCK is run while
# fail_loc 0x161c (OBD_FAIL_LFSCK_BAD_NETWORK, fail_val=0) is set on mds1.
# The first run must end 'partial' on the MDTs and ost1 but 'completed' on
# ost2, with 1 repaired orphan per MDT; a second run without the failure must
# end 'completed' everywhere with 2 repaired orphans per MDT.
# Needs at least two OSTs.
2665 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2668 echo "The target MDT-object is lost. The LFSCK should re-create the"
2669 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2670 echo "to verify some OST-object(s) during the first stage-scanning,"
2671 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2672 echo "should not be affected."
2675 check_mount_and_prep
# On MDT index 0: a1 (stripe count 1) holds guard and f1, a2 (stripe count 2)
# holds f2; every stripe setting starts at OST index 0.
2676 $LFS mkdir -i 0 $DIR/$tdir/a1
2677 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2678 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2679 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2680 $LFS mkdir -i 0 $DIR/$tdir/a2
2681 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2682 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2683 $LFS getstripe $DIR/$tdir/a1/f1
2684 $LFS getstripe $DIR/$tdir/a2/f2
# The same arrangement on MDT index 1 (a3/a4) when a second MDT exists.
2686 if [ $MDSCOUNT -ge 2 ]; then
2687 $LFS mkdir -i 1 $DIR/$tdir/a3
2688 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2689 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2690 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2691 $LFS mkdir -i 1 $DIR/$tdir/a4
2692 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2693 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2694 $LFS getstripe $DIR/$tdir/a3/f3
2695 $LFS getstripe $DIR/$tdir/a4/f4
2698 cancel_lru_locks osc
# Remove the files while the failure is armed; the guard files are kept.
2700 echo "Inject failure, to simulate the case of missing the MDT-object"
2701 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2702 do_facet mds1 $LCTL set_param fail_loc=0x1616
2703 rm -f $DIR/$tdir/a1/f1
2704 rm -f $DIR/$tdir/a2/f2
2706 if [ $MDSCOUNT -ge 2 ]; then
2707 do_facet mds2 $LCTL set_param fail_loc=0x1616
2708 rm -f $DIR/$tdir/a3/f3
2709 rm -f $DIR/$tdir/a4/f4
2715 do_facet mds1 $LCTL set_param fail_loc=0
2716 if [ $MDSCOUNT -ge 2 ]; then
2717 do_facet mds2 $LCTL set_param fail_loc=0
2720 cancel_lru_locks mdc
2721 cancel_lru_locks osc
2723 echo "Inject failure, to simulate the OST0 fail to handle"
2724 echo "MDT0 LFSCK request during the first-stage scanning."
2725 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2726 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
# First run, with the failure armed.
2728 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2729 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Every MDT is expected to end as 'partial'.
2731 for k in $(seq $MDSCOUNT); do
2732 # The LFSCK status query internal is 30 seconds. For the case
2733 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2734 # time to guarantee the status sync up.
2735 wait_update_facet mds${k} "$LCTL get_param -n \
2736 mdd.$(facet_svc mds${k}).lfsck_layout |
2737 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2738 error "(2) MDS${k} is not the expected 'partial'"
# ost1 must end 'partial', ost2 must be unaffected ('completed').
2741 wait_update_facet ost1 "$LCTL get_param -n \
2742 obdfilter.$(facet_svc ost1).lfsck_layout |
2743 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2744 error "(3) OST1 is not the expected 'partial'"
2747 wait_update_facet ost2 "$LCTL get_param -n \
2748 obdfilter.$(facet_svc ost2).lfsck_layout |
2749 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2750 error "(4) OST2 is not the expected 'completed'"
2753 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the first run each MDT must report exactly one repaired orphan.
2755 local repaired=$(do_facet mds1 $LCTL get_param -n \
2756 mdd.$(facet_svc mds1).lfsck_layout |
2757 awk '/^repaired_orphan/ { print $2 }')
2758 [ $repaired -eq 1 ] ||
2759 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2761 if [ $MDSCOUNT -ge 2 ]; then
2762 repaired=$(do_facet mds2 $LCTL get_param -n \
2763 mdd.$(facet_svc mds2).lfsck_layout |
2764 awk '/^repaired_orphan/ { print $2 }')
2765 [ $repaired -eq 1 ] ||
2766 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second run, without the failure.
2769 echo "Trigger layout LFSCK on all devices again to cleanup"
2770 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2772 for k in $(seq $MDSCOUNT); do
2773 # The LFSCK status query internal is 30 seconds. For the case
2774 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2775 # time to guarantee the status sync up.
2776 wait_update_facet mds${k} "$LCTL get_param -n \
2777 mdd.$(facet_svc mds${k}).lfsck_layout |
2778 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2779 error "(8) MDS${k} is not the expected 'completed'"
# cur_status is declared local, as in the sibling tests, so it does not leak
# into the caller's scope.
2782 for k in $(seq $OSTCOUNT); do
2783 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2784 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2785 awk '/^status/ { print $2 }')
2786 [ "$cur_status" == "completed" ] ||
2787 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# After the second run each MDT must report two repaired orphans.
2791 local repaired=$(do_facet mds1 $LCTL get_param -n \
2792 mdd.$(facet_svc mds1).lfsck_layout |
2793 awk '/^repaired_orphan/ { print $2 }')
2794 [ $repaired -eq 2 ] ||
2795 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2797 if [ $MDSCOUNT -ge 2 ]; then
2798 repaired=$(do_facet mds2 $LCTL get_param -n \
2799 mdd.$(facet_svc mds2).lfsck_layout |
2800 awk '/^repaired_orphan/ { print $2 }')
2801 [ $repaired -eq 2 ] ||
2802 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2805 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Body of the test registered below as 18g.
# Scenario: a file striped with "-c -1" is removed while fail_loc 0x162e
# (OBD_FAIL_LFSCK_LOST_MDTOBJ2) is set on mds1; per the echo below the OI
# mapping is kept. A layout LFSCK must then report $OSTCOUNT repaired
# orphans, and the recreated "<fid>-R-0" entry is moved back to a1/f1.
# Skipped when FILESET is set.
2808 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2811 echo "The target MDT-object is lost, but related OI mapping is there"
2812 echo "The LFSCK should recreate the lost MDT-object without affected"
2813 echo "by the stale OI mapping."
2816 check_mount_and_prep
# Write $OSTCOUNT MiB with 1M stripes and remember the file's FID.
2817 $LFS mkdir -i 0 $DIR/$tdir/a1
2818 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2819 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
2820 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2822 $LFS getstripe $DIR/$tdir/a1/f1
2823 cancel_lru_locks osc
# Remove the file while the failure is armed, then disarm it.
2825 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2826 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2827 do_facet mds1 $LCTL set_param fail_loc=0x162e
2828 rm -f $DIR/$tdir/a1/f1
2830 do_facet mds1 $LCTL set_param fail_loc=0
2831 cancel_lru_locks mdc
2832 cancel_lru_locks osc
2834 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2835 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait, bounded by $LTIME, until every MDT reports status 'completed'.
2837 for k in $(seq $MDSCOUNT); do
2838 # The LFSCK status query internal is 30 seconds. For the case
2839 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2840 # time to guarantee the status sync up.
2841 wait_update_facet mds${k} "$LCTL get_param -n \
2842 mdd.$(facet_svc mds${k}).lfsck_layout |
2843 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2844 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are checked once (no waiting) for status 'completed'.
2847 for k in $(seq $OSTCOUNT); do
2848 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2849 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2850 awk '/^status/ { print $2 }')
2851 [ "$cur_status" == "completed" ] ||
2852 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# The expected repaired_orphan count equals $OSTCOUNT.
2855 local repaired=$(do_facet mds1 $LCTL get_param -n \
2856 mdd.$(facet_svc mds1).lfsck_layout |
2857 awk '/^repaired_orphan/ { print $2 }')
2858 [ $repaired -eq $OSTCOUNT ] ||
2859 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
# The recreated object is expected as "<original fid>-R-0" under MDT0000.
2861 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2862 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2863 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
# Print FID and layout of the restored file (diagnostic output).
2865 $LFS path2fid $DIR/$tdir/a1/f1
2866 $LFS getstripe $DIR/$tdir/a1/f1
2868 run_test 18g "Find out orphan OST-object and repair it (7)"
# Body of the test registered below as 18h.
# Scenario: a two-component PFL file f0 is filled and copied to "guard",
# then chown'ed while fail_loc 0x162f (OBD_FAIL_LFSCK_BAD_PFL_RANGE) is set
# on $SINGLEMDS. A write to f0 must then fail. After a layout LFSCK with
# "-r -o", 2 repaired orphans are expected, f0 must still equal guard, and
# writing to f0 must succeed again.
2872 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
2873 echo "the layout LFSCK will keep the bad PFL file(s) there without"
2874 echo "scanning its OST-object(s). Then in the second stage scanning,"
2875 echo "the OST will return related OST-object(s) to the MDT as orphan."
2876 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
2877 echo "the 'orphan(s)' stripe information."
2880 check_mount_and_prep
# Components: -E 2M (1M stripe size, 1 stripe) and -E -1.
2882 $LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
2883 error "(0) Fail to create PFL $DIR/$tdir/f0"
# Write test-framework.sh at offset 0 and again at a 2 MiB offset (seek=2
# with bs=1M), presumably so that both components hold data.
2885 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
2886 error "(1.1) Fail to write $DIR/$tdir/f0"
2888 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
2889 error "(1.2) Fail to write $DIR/$tdir/f0"
# Reference copy for the diff after the repair.
2891 cp $DIR/$tdir/f0 $DIR/$tdir/guard
2893 echo "Inject failure stub to simulate bad PFL extent range"
2894 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
2895 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
2897 chown 1.1 $DIR/$tdir/f0
2899 cancel_lru_locks mdc
2900 cancel_lru_locks osc
2901 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Negative check: dd succeeding here is treated as the test failure.
2903 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
2904 error "(2) Write to bad PFL file should fail"
2906 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
2907 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
# Wait, bounded by $LTIME, until every MDT reports status 'completed'.
2909 for k in $(seq $MDSCOUNT); do
2910 # The LFSCK status query internal is 30 seconds. For the case
2911 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2912 # time to guarantee the status sync up.
2913 wait_update_facet mds${k} "$LCTL get_param -n \
2914 mdd.$(facet_svc mds${k}).lfsck_layout |
2915 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2916 error "(4.1) MDS${k} is not the expected 'completed'"
# cur_status is declared local, as in the sibling tests, so it does not leak
# into the caller's scope.
2919 for k in $(seq $OSTCOUNT); do
2920 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2921 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2922 awk '/^status/ { print $2 }')
2923 [ "$cur_status" == "completed" ] ||
2924 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
# The repaired_orphan counter shown by $SHOW_LAYOUT must be 2.
2928 local repaired=$($SHOW_LAYOUT |
2929 awk '/^repaired_orphan/ { print $2 }')
2930 [ $repaired -eq 2 ] ||
2931 error "(5) Fail to repair crashed PFL range: $repaired"
2933 echo "Data in $DIR/$tdir/f0 should not be broken"
2934 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
2935 error "(6) Data in $DIR/$tdir/f0 is broken"
2937 echo "Write should succeed after LFSCK repairing the bad PFL range"
2938 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
2939 error "(7) Write should succeed after LFSCK"
2941 run_test 18h "LFSCK can repair crashed PFL extent range"
# Set the lctl "debug" parameter to "-cache" (presumably removes the cache
# flag that "debug=+cache" set before test 18a); output is discarded.
2943 $LCTL set_param debug=-cache > /dev/null
# Body of the test registered below as 19a.
# Scenario: files are written while lfsck_verify_pfid is 0 on OST0000; the
# parameter is then set to 1 and fail_loc 0x1619
# (OBD_FAIL_LFSCK_INVALID_PFID) is set with a local $LCTL call. Reading a0
# and a1 must then fail.
2946 check_mount_and_prep
2947 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# Parent-FID verification off while the test files are created.
2949 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2950 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
# a0: plain file; a1: composite (PFL) file; a2: guard file.
2952 echo "foo1" > $DIR/$tdir/a0
2953 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
2954 error "(0) Fail to create PFL $DIR/$tdir/a1"
2955 echo "foo2" > $DIR/$tdir/a1
2956 echo "guard" > $DIR/$tdir/a2
2957 cancel_lru_locks osc
# Parent-FID verification on for the reads below.
2959 echo "Inject failure, then client will offer wrong parent FID when read"
2960 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2961 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
# This fail_loc is set through a plain $LCTL call, not do_facet/do_nodes.
2963 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2964 $LCTL set_param fail_loc=0x1619
# Negative checks: a successful cat is treated as the test failure.
2966 echo "Read RPC with wrong parent FID should be denied"
2967 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
2968 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
2969 $LCTL set_param fail_loc=0
2971 run_test 19a "OST-object inconsistency self detect"
# Test 19b ("OST-object inconsistency self repair"): files are created while
# OBD_FAIL_LFSCK_UNMATCHED_PAIR1 (0x1611) is armed on the OST nodes and
# lfsck_verify_pfid is off. The "repaired" counter read from
# lfsck_verify_pfid must stay 0 while verification is off and must reach 2
# (one per file read) once verification is on and both files have been read.
2974 check_mount_and_prep
# Default layout for the test directory: one stripe, starting at OST index 0.
2975 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2977 echo "Inject failure stub to make the OST-object to back point to"
2978 echo "non-exist MDT-object"
# Verification stays off for OST0000 while the inconsistent objects are made.
2980 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2981 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2983 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2984 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
# f0 is a plain file; f1 is a two-component PFL file (first component up to
# 1M, pinned to OST index 0; second component up to 4M with 256K stripes).
2985 echo "foo1" > $DIR/$tdir/f0
2986 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
2987 error "(0) Fail to create PFL $DIR/$tdir/f1"
2988 echo "foo2" > $DIR/$tdir/f1
2989 cancel_lru_locks osc
# Disarm the fault injection on the OST nodes.
2990 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2992 do_facet ost1 $LCTL set_param -n \
2993 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2994 echo "Nothing should be fixed since self detect and repair is disabled"
# The counter is the second field of the line starting with "repaired".
2995 local repaired=$(do_facet ost1 $LCTL get_param -n \
2996 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2997 awk '/^repaired/ { print $2 }')
2998 [ $repaired -eq 0 ] ||
2999 error "(1) Expected 0 repaired, but got $repaired"
3001 echo "Read RPC with right parent FID should be accepted,"
3002 echo "and cause parent FID on OST to be fixed"
# Enable verification, then read both files to trigger the self repair.
3004 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3005 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
3007 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
3008 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
3010 repaired=$(do_facet ost1 $LCTL get_param -n \
3011 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
3012 awk '/^repaired/ { print $2 }')
# Two files were read, so two repairs are expected; the message states the
# same number the check enforces.
3013 [ $repaired -eq 2 ] ||
3014 error "(3) Expected 2 repaired, but got $repaired"
3016 run_test 19b "OST-object inconsistency self repair"
# Strings compared against the output of "$LFS getstripe -L" in tests 20a/20b.
# NOTE(review): "40000001" looks like the hole flag 0x40000000 combined with
# the raid0 pattern value - confirm against the lfs getstripe output format.
3018 PATTERN_WITH_HOLE="40000001"
3019 PATTERN_WITHOUT_HOLE="raid0"
# Test 20a ("Handle the orphan with dummy LOV EA slot properly").
# Files under a1 are unlinked while a fail_loc is armed on mds1, a layout
# LFSCK is started with "-r -o", and the ${fid}-R-0 entries expected under
# .lustre/lost+found/MDT0000/ are examined: layout pattern, stripe count,
# size, and which reads and writes succeed or fail.
# NOTE(review): no else/fi/done lines and no assignment of bcount appear in
# this listing; the comments describe only the statements that are shown.
3022 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3023 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
3026 echo "The target MDT-object and some of its OST-object are lost."
3027 echo "The LFSCK should find out the left OST-objects and re-create"
3028 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3029 echo "with the partial OST-objects (LOV EA hole)."
3031 echo "New client can access the file with LOV EA hole via normal"
3032 echo "system tools or commands without crash the system."
3034 echo "For old client, even though it cannot access the file with"
3035 echo "LOV EA hole, it should not cause the system crash."
3038 check_mount_and_prep
# a1 is created on MDT index 0. Its default layout uses 1M stripes starting
# at OST index 0: three stripes when OSTCOUNT > 2, two stripes otherwise.
3039 $LFS mkdir -i 0 $DIR/$tdir/a1
3040 if [ $OSTCOUNT -gt 2 ]; then
3041 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
3044 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
3048 # 256 blocks on the stripe0.
3049 # 1 block on the stripe1 for 2 OSTs case.
3050 # 256 blocks on the stripe1 for other cases.
3051 # 1 block on the stripe2 if OSTs > 2
3052 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
3053 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
3054 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# Keep each file's FID: the lost+found names checked later are ${fid}-R-0.
3056 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
3057 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
3058 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
3061 $LFS getstripe $DIR/$tdir/a1/f0
3063 $LFS getstripe $DIR/$tdir/a1/f1
3065 $LFS getstripe $DIR/$tdir/a1/f2
# A fourth file, f3, is created only when there are more than two OSTs.
3067 if [ $OSTCOUNT -gt 2 ]; then
3068 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
3069 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
3071 $LFS getstripe $DIR/$tdir/a1/f3
3074 cancel_lru_locks osc
# Each rm below runs with a different fail_loc/fail_val armed on mds1; the
# echo lines state which objects each removal is meant to lose.
3076 echo "Inject failure..."
3077 echo "To simulate f0 lost MDT-object"
3078 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3079 do_facet mds1 $LCTL set_param fail_loc=0x1616
3080 rm -f $DIR/$tdir/a1/f0
3082 echo "To simulate f1 lost MDT-object and OST-object0"
3083 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3084 do_facet mds1 $LCTL set_param fail_loc=0x161a
3085 rm -f $DIR/$tdir/a1/f1
3087 echo "To simulate f2 lost MDT-object and OST-object1"
3088 do_facet mds1 $LCTL set_param fail_val=1
3089 rm -f $DIR/$tdir/a1/f2
3091 if [ $OSTCOUNT -gt 2 ]; then
3092 echo "To simulate f3 lost MDT-object and OST-object2"
3093 do_facet mds1 $LCTL set_param fail_val=2
3094 rm -f $DIR/$tdir/a1/f3
# The client is unmounted and the fault injection cleared before the scan.
3097 umount_client $MOUNT
3100 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
3102 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3103 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3105 for k in $(seq $MDSCOUNT); do
3106 # The LFSCK status query interval is 30 seconds. For the case
3107 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3108 # time to guarantee the status sync up.
3109 wait_update_facet mds${k} "$LCTL get_param -n \
3110 mdd.$(facet_svc mds${k}).lfsck_layout |
3111 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3112 error "(2) MDS${k} is not the expected 'completed'"
# OSTs are queried once each (no waiting) and must already be "completed".
3115 for k in $(seq $OSTCOUNT); do
3116 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3117 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3118 awk '/^status/ { print $2 }')
3119 [ "$cur_status" == "completed" ] ||
3120 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# repaired_orphan on mds1: 9 expected with more than two OSTs, otherwise 4.
3123 local repaired=$(do_facet mds1 $LCTL get_param -n \
3124 mdd.$(facet_svc mds1).lfsck_layout |
3125 awk '/^repaired_orphan/ { print $2 }')
3126 if [ $OSTCOUNT -gt 2 ]; then
3127 [ $repaired -eq 9 ] ||
3128 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3130 [ $repaired -eq 4 ] ||
3131 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
3134 mount_client $MOUNT || error "(5.0) Fail to start client!"
# NOTE(review): LOV_PATTERN_F_HOLE is assigned here but is not referenced by
# any statement shown in this test.
3136 LOV_PATTERN_F_HOLE=0x40000000
3139 # ${fid0}-R-0 is the old f0
# f0 lost only its MDT-object, so the recovered file must have no hole and
# must support read, append, chown, touch and unlink.
3141 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3142 echo "Check $name, which is the old f0"
3144 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
3146 local pattern=$($LFS getstripe -L $name)
3147 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3148 error "(5.2) NOT expect pattern flag hole, but got $pattern"
3150 local stripes=$($LFS getstripe -c $name)
3151 if [ $OSTCOUNT -gt 2 ]; then
3152 [ $stripes -eq 3 ] ||
3153 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3155 [ $stripes -eq 2 ] ||
3156 error "(5.3.2) expect the stripe count is 2, but got $stripes"
3159 local size=$(stat $name | awk '/Size:/ { print $2 }')
3160 [ $size -eq $((4096 * $bcount)) ] ||
3161 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
3163 cat $name > /dev/null || error "(5.5) cannot read $name"
3165 echo "dummy" >> $name || error "(5.6) cannot write $name"
3167 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3169 touch $name || error "(5.8) cannot touch $name"
3171 rm -f $name || error "(5.9) cannot unlink $name"
3174 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3176 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3177 if [ $OSTCOUNT -gt 2 ]; then
3178 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3180 echo "Check $name, it contains the old f1's stripe1"
3183 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3185 pattern=$($LFS getstripe -L $name)
3186 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3187 error "(6.2) expect pattern flag hole, but got $pattern"
3189 stripes=$($LFS getstripe -c $name)
3190 if [ $OSTCOUNT -gt 2 ]; then
3191 [ $stripes -eq 3 ] ||
3192 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3194 [ $stripes -eq 2 ] ||
3195 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3198 size=$(stat $name | awk '/Size:/ { print $2 }')
3199 [ $size -eq $((4096 * $bcount)) ] ||
3200 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
3202 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# conv=noerror lets dd continue past read errors; the pipeline counts how
# many "Input/output error" messages dd printed.
3204 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3205 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3208 [ $failures -eq 256 ] ||
3209 error "(6.6) expect 256 IO failures, but get $failures"
3211 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3212 [ $size -eq $((4096 * $bcount)) ] ||
3213 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Offset 0 falls in the lost stripe0; block 300 (bs=4096) lies beyond the
# first 1M stripe, so it lands in a stripe that still exists.
3215 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3216 error "(6.8) write to the LOV EA hole should fail"
3218 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3219 error "(6.9) write to normal stripe should NOT fail"
3221 echo "foo" >> $name && error "(6.10) append write $name should fail"
3223 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3225 touch $name || error "(6.12) cannot touch $name"
3227 rm -f $name || error "(6.13) cannot unlink $name"
3230 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3232 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3233 if [ $OSTCOUNT -gt 2 ]; then
3234 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3236 echo "Check $name, it contains the old f2's stripe0"
3239 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3241 pattern=$($LFS getstripe -L $name)
3242 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3243 error "(7.2) expect pattern flag hole, but got $pattern"
3245 stripes=$($LFS getstripe -c $name)
3246 size=$(stat $name | awk '/Size:/ { print $2 }')
# Checks labelled 7.x.1 apply with more than two OSTs; the 7.x.2 checks
# further down are the two-OST variant.
3247 if [ $OSTCOUNT -gt 2 ]; then
3248 [ $stripes -eq 3 ] ||
3249 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3251 [ $size -eq $((4096 * $bcount)) ] ||
3252 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3254 cat $name > /dev/null &&
3255 error "(7.5.1) normal read $name should fail"
3257 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3258 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3260 [ $failures -eq 256 ] ||
3261 error "(7.6) expect 256 IO failures, but get $failures"
3263 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3264 [ $size -eq $((4096 * $bcount)) ] ||
3265 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
# Here the hole is stripe1, so block 300 must fail and offset 0 must succeed.
3267 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3268 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3270 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3271 error "(7.8.1) write to normal stripe should NOT fail"
3273 echo "foo" >> $name &&
3274 error "(7.8.3) append write $name should fail"
3276 chown $RUNAS_ID:$RUNAS_GID $name ||
3277 error "(7.9.1) cannot chown on $name"
3279 touch $name || error "(7.10.1) cannot touch $name"
3281 [ $stripes -eq 2 ] ||
3282 error "(7.3.2) expect the stripe count is 2, but got $stripes"
# With two OSTs the expected size is the 256 blocks of stripe0 only.
3285 [ $size -eq $((4096 * (256 + 0))) ] ||
3286 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3288 cat $name > /dev/null &&
3289 error "(7.5.2) normal read $name should fail"
3291 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3292 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3293 [ $failures -eq 256 ] ||
3294 error "(7.6.2) expect 256 IO failures, but get $failures"
3297 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3298 [ $size -eq $((4096 * $bcount)) ] ||
3299 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
# Block 256 is the first block past the 1M stripe0, i.e. inside the hole.
3301 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3302 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3304 chown $RUNAS_ID:$RUNAS_GID $name ||
3305 error "(7.9.2) cannot chown on $name"
3307 touch $name || error "(7.10.2) cannot touch $name"
3310 rm -f $name || error "(7.11) cannot unlink $name"
# f3 exists only with more than two OSTs; otherwise the test ends here.
3312 [ $OSTCOUNT -le 2 ] && return
3315 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
3317 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3318 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3320 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3322 pattern=$($LFS getstripe -L $name)
3323 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3324 error "(8.2) expect pattern flag hole, but got $pattern"
3326 stripes=$($LFS getstripe -c $name)
3327 [ $stripes -eq 3 ] ||
3328 error "(8.3) expect the stripe count is 3, but got $stripes"
3330 size=$(stat $name | awk '/Size:/ { print $2 }')
# Expected size covers the 256 + 256 blocks of stripe0 and stripe1 only.
3332 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3333 error "(8.4) expect the size $((4096 * 512)), but got $size"
3335 cat $name > /dev/null &&
3336 error "(8.5) normal read $name should fail"
3338 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3339 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3341 [ $failures -eq 256 ] ||
3342 error "(8.6) expect 256 IO failures, but get $failures"
3345 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3346 [ $size -eq $((4096 * $bcount)) ] ||
3347 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
# Block 512 is the first block past the two 1M stripes, i.e. inside the hole.
3349 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3350 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3352 chown $RUNAS_ID:$RUNAS_GID $name ||
3353 error "(8.9) cannot chown on $name"
3355 touch $name || error "(8.10) cannot touch $name"
3357 rm -f $name || error "(8.11) cannot unlink $name"
3359 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# Test 20b ("Handle the orphan with dummy LOV EA slot properly - PFL case").
# Three PFL files (component 1: [0, 2M), component 2: [2M, EOF), two 1M
# stripes each) are written, fail_loc/fail_val values are armed on the MDS,
# a layout LFSCK is started with "-r -o", and the ${fid}-R-0 entries expected
# under .lustre/lost+found/MDT0000/ are examined per component: pattern,
# stripe count, extent, size, and which reads and writes succeed or fail.
# NOTE(review): no done lines appear in this listing, and no statement is
# shown between arming each fail_loc and the next step; the comments
# describe only the statements that are shown.
3362 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3363 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
3366 echo "The target MDT-object and some of its OST-object are lost."
3367 echo "The LFSCK should find out the left OST-objects and re-create"
3368 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3369 echo "with the partial OST-objects (LOV EA hole)."
3371 echo "New client can access the file with LOV EA hole via normal"
3372 echo "system tools or commands without crash the system - PFL case."
3375 check_mount_and_prep
3377 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3378 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3379 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3380 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3381 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3382 error "(2) Fail to create PFL file $DIR/$tdir/f2"
# 769 blocks of 4096 bytes: 3M plus one block, so both components hold data.
3384 local bcount=$((256 * 3 + 1))
3386 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3387 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3388 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
# Keep each file's FID: the lost+found names checked later are ${fid}-R-0.
3390 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3391 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3392 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3395 $LFS getstripe $DIR/$tdir/f0
3397 $LFS getstripe $DIR/$tdir/f1
3399 $LFS getstripe $DIR/$tdir/f2
3401 cancel_lru_locks mdc
3402 cancel_lru_locks osc
3404 echo "Inject failure..."
3405 echo "To simulate f0 lost MDT-object"
3406 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3407 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3410 echo "To simulate the case of f1 lost MDT-object and "
3411 echo "the first OST-object in each PFL component"
3412 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3413 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3416 echo "To simulate the case of f2 lost MDT-object and "
3417 echo "the second OST-object in each PFL component"
3418 do_facet $SINGLEMDS $LCTL set_param fail_val=1
# Clear the fault injection before the scan.
3423 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
3425 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3426 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3428 for k in $(seq $MDSCOUNT); do
3429 # The LFSCK status query interval is 30 seconds. For the case
3430 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3431 # time to guarantee the status sync up.
3432 wait_update_facet mds${k} "$LCTL get_param -n \
3433 mdd.$(facet_svc mds${k}).lfsck_layout |
3434 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3435 error "(4) MDS${k} is not the expected 'completed'"
# OSTs are queried once each (no waiting) and must already be "completed".
3438 for k in $(seq $OSTCOUNT); do
3439 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3440 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3441 awk '/^status/ { print $2 }')
3442 [ "$cur_status" == "completed" ] ||
3443 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
3446 local repaired=$(do_facet mds1 $LCTL get_param -n \
3447 mdd.$(facet_svc mds1).lfsck_layout |
3448 awk '/^repaired_orphan/ { print $2 }')
3449 [ $repaired -eq 8 ] ||
3450 error "(6) Expect 8 fixed on mds1, but got: $repaired"
3453 # ${fid0}-R-0 is the old f0
# f0 lost only its MDT-object: neither component may carry the hole flag and
# the file must support read, append, chown, touch and unlink.
3455 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3456 echo "Check $name, which is the old f0"
3458 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
# -I1 / -I2 restrict getstripe to component 1 / component 2.
3460 local pattern=$($LFS getstripe -L -I1 $name)
3461 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3462 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3464 pattern=$($LFS getstripe -L -I2 $name)
3465 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3466 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3468 local stripes=$($LFS getstripe -c -I1 $name)
3469 [ $stripes -eq 2 ] ||
3470 error "(7.3.1) expect 2 stripes, but got $stripes"
3472 stripes=$($LFS getstripe -c -I2 $name)
3473 [ $stripes -eq 2 ] ||
3474 error "(7.3.2) expect 2 stripes, but got $stripes"
# Component extents must match the layout given to setstripe above.
3476 local e_start=$($LFS getstripe -I1 $name |
3477 awk '/lcme_extent.e_start:/ { print $2 }')
3478 [ $e_start -eq 0 ] ||
3479 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3481 local e_end=$($LFS getstripe -I1 $name |
3482 awk '/lcme_extent.e_end:/ { print $2 }')
3483 [ $e_end -eq 2097152 ] ||
3484 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3486 e_start=$($LFS getstripe -I2 $name |
3487 awk '/lcme_extent.e_start:/ { print $2 }')
3488 [ $e_start -eq 2097152 ] ||
3489 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
3491 e_end=$($LFS getstripe -I2 $name |
3492 awk '/lcme_extent.e_end:/ { print $2 }')
3493 [ "$e_end" = "EOF" ] ||
3494 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3496 local size=$(stat $name | awk '/Size:/ { print $2 }')
3497 [ $size -eq $((4096 * $bcount)) ] ||
3498 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
3500 cat $name > /dev/null || error "(7.7) cannot read $name"
3502 echo "dummy" >> $name || error "(7.8) cannot write $name"
3504 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3506 touch $name || error "(7.10) cannot touch $name"
3508 rm -f $name || error "(7.11) cannot unlink $name"
3511 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3513 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3514 echo "Check $name, it contains f1's second OST-object in each COMP"
3516 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3518 pattern=$($LFS getstripe -L -I1 $name)
3519 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3520 error "(8.2.1) expect pattern flag hole, but got $pattern"
3522 pattern=$($LFS getstripe -L -I2 $name)
3523 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3524 error "(8.2.2) expect pattern flag hole, but got $pattern"
# Component 1 check carries its own label (8.3.1) so a failure can be told
# apart from the component 2 check (8.3.2).
3526 stripes=$($LFS getstripe -c -I1 $name)
3527 [ $stripes -eq 2 ] ||
3528 error "(8.3.1) expect 2 stripes, but got $stripes"
3530 stripes=$($LFS getstripe -c -I2 $name)
3531 [ $stripes -eq 2 ] ||
3532 error "(8.3.2) expect 2 stripes, but got $stripes"
3534 e_start=$($LFS getstripe -I1 $name |
3535 awk '/lcme_extent.e_start:/ { print $2 }')
3536 [ $e_start -eq 0 ] ||
3537 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3539 e_end=$($LFS getstripe -I1 $name |
3540 awk '/lcme_extent.e_end:/ { print $2 }')
3541 [ $e_end -eq 2097152 ] ||
3542 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3544 e_start=$($LFS getstripe -I2 $name |
3545 awk '/lcme_extent.e_start:/ { print $2 }')
3546 [ $e_start -eq 2097152 ] ||
3547 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3549 e_end=$($LFS getstripe -I2 $name |
3550 awk '/lcme_extent.e_end:/ { print $2 }')
3551 [ "$e_end" = "EOF" ] ||
3552 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3554 size=$(stat $name | awk '/Size:/ { print $2 }')
3555 [ $size -eq $((4096 * $bcount)) ] ||
3556 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
3558 cat $name > /dev/null && error "(8.7) normal read $name should fail"
# conv=noerror lets dd continue past read errors; the pipeline counts how
# many "Input/output error" messages dd printed.
3560 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3561 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3563 # The first stripe in each COMP was lost
3564 [ $failures -eq 512 ] ||
3565 error "(8.8) expect 512 IO failures, but get $failures"
3567 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3568 [ $size -eq $((4096 * $bcount)) ] ||
3569 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
# Offset 0 falls in the lost first stripe; block 300 (bs=4096) lies beyond
# the first 1M stripe of component 1.
3571 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3572 error "(8.10) write to the LOV EA hole should fail"
3574 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3575 error "(8.11) write to normal stripe should NOT fail"
3577 echo "foo" >> $name && error "(8.12) append write $name should fail"
3579 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3581 touch $name || error "(8.14) cannot touch $name"
3583 rm -f $name || error "(8.15) cannot unlink $name"
3586 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3588 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3589 echo "Check $name, it contains f2's first stripe in each COMP"
3591 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3593 pattern=$($LFS getstripe -L -I1 $name)
3594 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3595 error "(9.2.1) expect pattern flag hole, but got $pattern"
3597 pattern=$($LFS getstripe -L -I2 $name)
3598 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3599 error "(9.2.2) expect pattern flag hole, but got $pattern"
# Component 1 check carries its own label (9.3.1), distinct from (9.3.2).
3601 stripes=$($LFS getstripe -c -I1 $name)
3602 [ $stripes -eq 2 ] ||
3603 error "(9.3.1) expect 2 stripes, but got $stripes"
3605 stripes=$($LFS getstripe -c -I2 $name)
3606 [ $stripes -eq 2 ] ||
3607 error "(9.3.2) expect 2 stripes, but got $stripes"
3609 e_start=$($LFS getstripe -I1 $name |
3610 awk '/lcme_extent.e_start:/ { print $2 }')
3611 [ $e_start -eq 0 ] ||
3612 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3614 e_end=$($LFS getstripe -I1 $name |
3615 awk '/lcme_extent.e_end:/ { print $2 }')
3616 [ $e_end -eq 2097152 ] ||
3617 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3619 e_start=$($LFS getstripe -I2 $name |
3620 awk '/lcme_extent.e_start:/ { print $2 }')
3621 [ $e_start -eq 2097152 ] ||
3622 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3624 e_end=$($LFS getstripe -I2 $name |
3625 awk '/lcme_extent.e_end:/ { print $2 }')
3626 [ "$e_end" = "EOF" ] ||
3627 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3629 size=$(stat $name | awk '/Size:/ { print $2 }')
3630 # The second stripe in COMP was lost, so we do not know there
3631 # have ever been some data before. 'stat' will regard it as
3632 # no data on the lost stripe.
3634 [ $size -eq $((4096 * $bcount)) ] ||
3635 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3637 cat $name > /dev/null &&
3638 error "(9.7) normal read $name should fail"
3640 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3641 bs=4096 2>&1 | grep "Input/output error" | wc -l)
# The message reports the same count (512) that the check enforces.
3642 [ $failures -eq 512 ] ||
3643 error "(9.8) expect 512 IO failures, but get $failures"
3645 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3646 # The second stripe in COMP was lost, so we do not know there
3647 # have ever been some data before. Since 'dd' skip failure,
3648 # it will regard the lost stripe contains data.
3650 [ $size -eq $((4096 * $bcount)) ] ||
3651 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
# Here the hole is the second stripe, so block 300 must fail and offset 0
# must succeed.
3653 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3654 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3656 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3657 error "(9.11) write to normal stripe should NOT fail"
3659 echo "foo" >> $name &&
3660 error "(9.12) append write $name should fail"
3662 chown $RUNAS_ID:$RUNAS_GID $name ||
3663 error "(9.13) cannot chown on $name"
3665 touch $name || error "(9.14) cannot touch $name"
3667 rm -f $name || error "(9.15) cannot unlink $name"
3669 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# Test 21 ("run all LFSCK components by default"): lfsck_start is issued on
# mds1 without a -t type argument, after which both the namespace and the
# layout component must report "scanning-phase1"; LFSCK is then stopped,
# again without naming a component.
3672 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
3673 skip "ignore the test if MDS is older than 2.5.59" && return
3675 check_mount_and_prep
3676 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
# NOTE(review): "-s 1" is presumably a speed limit that keeps the scan slow
# enough for the status checks below to still see phase 1 - confirm.
3678 echo "Start all LFSCK components by default (-s 1)"
3679 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3680 error "Fail to start LFSCK"
# The status is the second field of the line starting with "status".
3682 echo "namespace LFSCK should be in 'scanning-phase1' status"
3683 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3684 [ "$STATUS" == "scanning-phase1" ] ||
3685 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3687 echo "layout LFSCK should be in 'scanning-phase1' status"
3688 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3689 [ "$STATUS" == "scanning-phase1" ] ||
3690 error "Expect layout 'scanning-phase1', but got '$STATUS'"
3692 echo "Stop all LFSCK components by default"
3693 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3694 error "Fail to stop LFSCK"
3696 run_test 21 "run all LFSCK components by default"
# Test 22a ("LFSCK can repair unmatched pairs (1)"): needs at least two MDTs.
# foo/dummy is created on MDT0 while OBD_FAIL_LFSCK_BAD_PARENT (0x161e) is
# armed, the guard directory is then removed, and a namespace LFSCK started
# with "-A -r" must report exactly one unmatched pair repaired, after which
# listing foo/dummy has to succeed.
3699 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# The inner quotes around .. are escaped so they actually reach the log;
# unescaped they would end the string and be dropped from the output.
3702 echo "The parent_A references the child directory via some name entry,"
3703 echo "but the child directory back references another parent_B via its"
3704 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
3705 echo "LFSCK will repair the child directory's \"..\" name entry."
3708 check_mount_and_prep
# guard and foo are both created on MDT index 1.
3710 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3711 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3713 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3714 echo "The dummy's dotdot name entry references the guard."
3715 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
# The fail_loc is armed only around the creation of foo/dummy on MDT index 0.
3716 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3717 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3718 error "(3) Fail to mkdir on MDT0"
3719 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing guard leaves the injected dotdot reference without a target.
3721 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3723 echo "Trigger namespace LFSCK to repair unmatched pairs"
3724 $START_NAMESPACE -A -r ||
3725 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 is presumably the step number the helper
# reports on failure (the labels here jump from 5 to 7) - confirm.
3727 wait_all_targets_blocked namespace completed 6
3729 local repaired=$($SHOW_NAMESPACE |
3730 awk '/^unmatched_pairs_repaired/ { print $2 }')
3731 [ $repaired -eq 1 ] ||
3732 error "(7) Fail to repair unmatched pairs: $repaired"
3734 echo "'ls' should success after namespace LFSCK repairing"
3735 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3736 error "(8) ls should success."
3738 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Test 22b ("LFSCK can repair unmatched pairs (2)"): needs at least two MDTs.
# foo/dummy is created on MDT0 while OBD_FAIL_LFSCK_BAD_PARENT (0x161e) is
# armed; fid2path on its FID must not resolve to foo/dummy before the
# namespace LFSCK and must resolve to it afterwards, with exactly one
# unmatched pair reported as repaired.
3741 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# The inner quotes around .. are escaped so they actually reach the log;
# unescaped they would end the string and be dropped from the output.
3744 echo "The parent_A references the child directory via the name entry_B,"
3745 echo "but the child directory back references another parent_C via its"
3746 echo "\"..\" name entry. The parent_C exists, but there is no the name"
3747 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
3748 echo "the child directory's \"..\" name entry and its linkEA."
3751 check_mount_and_prep
# guard and foo are both created on MDT index 1; unlike test 22a, guard is
# not removed afterwards.
3753 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3754 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3756 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3757 echo "and bad linkEA. The dummy's dotdot name entry references the"
3758 echo "guard. The dummy's linkEA references n non-exist name entry."
3759 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
# The fail_loc is armed only around the creation of foo/dummy on MDT index 0.
3760 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3761 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3762 error "(3) Fail to mkdir on MDT0"
3763 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Before the repair, resolving the FID must not yield the real path.
3765 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3766 echo "fid2path should NOT work on the dummy's FID $dummyfid"
3767 local dummyname=$($LFS fid2path $DIR $dummyfid)
3768 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3769 error "(4) fid2path works unexpectedly."
3771 echo "Trigger namespace LFSCK to repair unmatched pairs"
3772 $START_NAMESPACE -A -r ||
3773 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 is presumably the step number the helper
# reports on failure (the labels here jump from 5 to 7) - confirm.
3775 wait_all_targets_blocked namespace completed 6
3777 local repaired=$($SHOW_NAMESPACE |
3778 awk '/^unmatched_pairs_repaired/ { print $2 }')
3779 [ $repaired -eq 1 ] ||
3780 error "(7) Fail to repair unmatched pairs: $repaired"
# dummyname was already declared local above, so it is only reassigned here.
3782 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
3783 dummyname=$($LFS fid2path $DIR $dummyfid)
3784 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3785 error "(8) fid2path does not work"
3787 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a ("LFSCK can repair dangling name entry (1)"): needs at least two
# MDTs. d0/d1 is removed while OBD_FAIL_LFSCK_DANGLING2 (0x1620) is armed on
# mds2. A first namespace LFSCK ("-A -r") must count one dangling entry but
# leave 'ls' failing; a second run with the extra -C flag must again count
# one, and 'ls' must then succeed.
3790 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3793 echo "The name entry is there, but the MDT-object for such name "
3794 echo "entry does not exist. The namespace LFSCK should find out "
3795 echo "and repair the inconsistency as required."
3798 check_mount_and_prep
# d0 is created on MDT index 0 and its child d1 on MDT index 1.
3800 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3801 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3803 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3804 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
# The fail_loc is armed on mds2 only around the rmdir.
3805 do_facet mds2 $LCTL set_param fail_loc=0x1620
3806 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3807 do_facet mds2 $LCTL set_param fail_loc=0
# A successful ls is the failure case here.
3809 echo "'ls' should fail because of dangling name entry"
3810 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3812 echo "Trigger namespace LFSCK to find out dangling name entry"
3813 $START_NAMESPACE -A -r ||
3814 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing number is presumably the step number the helper
# reports on failure (the labels here jump from 5 to 7) - confirm.
3816 wait_all_targets_blocked namespace completed 6
3818 local repaired=$($SHOW_NAMESPACE |
3819 awk '/^dangling_repaired/ { print $2 }')
3820 [ $repaired -eq 1 ] ||
3821 error "(7) Fail to repair dangling name entry: $repaired"
3823 echo "'ls' should fail because not re-create MDT-object by default"
3824 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
# Second run adds -C; per the messages around it, this is the run expected
# to re-create the missing MDT-object.
3826 echo "Trigger namespace LFSCK again to repair dangling name entry"
3827 $START_NAMESPACE -A -r -C ||
3828 error "(9) Fail to start LFSCK for namespace"
3830 wait_all_targets_blocked namespace completed 10
3832 repaired=$($SHOW_NAMESPACE |
3833 awk '/^dangling_repaired/ { print $2 }')
3834 [ $repaired -eq 1 ] ||
3835 error "(11) Fail to repair dangling name entry: $repaired"
3837 echo "'ls' should success after namespace LFSCK repairing"
3838 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3840 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b: repair a dangling name entry whose target has other hard links.
#
# Creates f0 ("dummy") and f1 ("dead") under d0, then hard-links f0 as "foo"
# while fail_loc 0x1621 (OBD_FAIL_LFSCK_DANGLING3) is armed with fail_val set
# to f1's object ID.  Once f1 is unlinked, "foo" must be inaccessible.  A
# namespace LFSCK started with "-r -C" must then report exactly one
# dangling_repaired and one multiple_linked_repaired, and "foo" must read back
# as "dummy".
test_23b() {
	echo "#####"
	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. When the LFSCK"
	echo "comes to the second-stage scanning, it will find that the"
	echo "former re-creating object_C is not proper, and will try to"
	echo "replace the object_C with the real object_A."
	echo "#####"

	check_mount_and_prep

	$LFS mkdir -i 0 "$DIR/$tdir/d0" || error "(1) Fail to mkdir d0 on MDT0"
	$LFS path2fid "$DIR/$tdir/d0"

	createmany -o "$DIR/$tdir/d0/t" 10 || error "(1.5) Fail to creatmany"

	echo "dummy" > "$DIR/$tdir/d0/f0" || error "(2) Fail to touch on MDT0"
	$LFS path2fid "$DIR/$tdir/d0/f0"

	echo "dead" > "$DIR/$tdir/d0/f1" || error "(3) Fail to touch on MDT0"
	$LFS path2fid "$DIR/$tdir/d0/f1"

	# First ':'-separated field of the FID, i.e. the sequence part.
	local SEQ0=$($LFS path2fid "$DIR/$tdir/d0/f0" |
		     awk -F':' '{print $1}')
	local SEQ1=$($LFS path2fid "$DIR/$tdir/d0/f1" |
		     awk -F':' '{print $1}')

	if [ "$SEQ0" != "$SEQ1" ]; then
		# To guarantee that the f0 and f1 are in the same FID seq
		rm -f "$DIR/$tdir/d0/f0" ||
			error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
		echo "dummy" > "$DIR/$tdir/d0/f0" ||
			error "(3.2) Fail to touch on MDT0"
		$LFS path2fid "$DIR/$tdir/d0/f0"
	fi

	# Second FID field of f1, converted to decimal for fail_val.
	local OID=$($LFS path2fid "$DIR/$tdir/d0/f1" | awk -F':' '{print $2}')
	OID=$(printf %d "$OID")

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
	ln "$DIR/$tdir/d0/f0" "$DIR/$tdir/d0/foo" ||
		error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	# If there is creation after the dangling injection, it may re-use
	# the just released local object (inode) that is referenced by the
	# dangling name entry. It will fail the dangling injection.
	# So before deleting the target object for the dangling name entry,
	# remove some other objects to avoid the target object being reused
	# by some potential creations. LU-7429
	unlinkmany "$DIR/$tdir/d0/t" 10 || error "(5.0) Fail to unlinkmany"

	rm -f "$DIR/$tdir/d0/f1" ||
		error "(5) Fail to unlink $DIR/$tdir/d0/f1"

	echo "'ls' should fail because of dangling name entry"
	ls -ail "$DIR/$tdir/d0/foo" > /dev/null 2>&1 &&
		error "(6) ls should fail."

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -r -C ||
		error "(7) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state before failing, to ease diagnosis.
		$SHOW_NAMESPACE
		error "(8) unexpected status"
	}

	# Quoted so that a missing counter yields a clean test failure
	# instead of a "[: -eq: unary operator expected" syntax error.
	local repaired=$($SHOW_NAMESPACE |
			 awk '/^dangling_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(9) Fail to repair dangling name entry: $repaired"

	repaired=$($SHOW_NAMESPACE |
		   awk '/^multiple_linked_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(10) Fail to drop the former created object: $repaired"

	local data=$(cat "$DIR/$tdir/d0/foo")
	[ "$data" == "dummy" ] ||
		error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
}
run_test 23b "LFSCK can repair dangling name entry (2)"
# Cleanup helper for test 23c: clears fail_val/fail_loc on $SINGLEMDS, waits
# for the namespace LFSCK there to report "completed", then stops full debug
# logging.
# NOTE(review): helper name inferred from its role next to test 23c - confirm.
cleanup_23c() {
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state before failing, to ease diagnosis.
		$SHOW_NAMESPACE
		error "(10) unexpected status"
	}

	stop_full_debug_logging
}

# Test 23c: a re-created dangling target that was modified must not be
# replaced.
#
# Same injection as test 23b (fail_loc 0x1621 with fail_val = f1's object ID
# while hard-linking f0 as "foo", then unlinking f1).  The namespace LFSCK is
# started with fail_loc 0x1602 (OBD_FAIL_LFSCK_DELAY3, fail_val=10) armed.
# The test waits for "foo" to show up with size 0, appends "data" to it, and
# after cleanup expects dangling_repaired == 1 with "foo" NOT reading back as
# "dummy".  If LFSCK already relinked "foo" to f0 before the write, the test
# logs that and returns success.
test_23c() {
	echo "#####"
	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. And then others"
	echo "modified the re-created object_C. When the LFSCK comes to the"
	echo "second-stage scanning, it will find that the former re-creating"
	echo "object_C maybe wrong and try to replace the object_C with the"
	echo "real object_A. But because object_C has been modified, so the"
	echo "LFSCK cannot replace it."
	echo "#####"

	start_full_debug_logging

	check_mount_and_prep

	# Keep the FIDs function-local instead of leaking them as globals.
	local parent_fid
	local f0_fid
	local f1_fid

	$LFS mkdir -i 0 "$DIR/$tdir/d0" || error "(1) Fail to mkdir d0 on MDT0"
	parent_fid="$($LFS path2fid "$DIR/$tdir/d0")"
	echo "parent_fid=$parent_fid"

	createmany -o "$DIR/$tdir/d0/t" 10 || error "(1.5) Fail to creatmany"

	echo "dummy" > "$DIR/$tdir/d0/f0" || error "(2) Fail to touch on MDT0"
	f0_fid="$($LFS path2fid "$DIR/$tdir/d0/f0")"
	echo "f0_fid=$f0_fid"

	echo "dead" > "$DIR/$tdir/d0/f1" || error "(3) Fail to touch on MDT0"
	f1_fid="$($LFS path2fid "$DIR/$tdir/d0/f1")"
	echo "f1_fid=$f1_fid"

	# Compare the sequence part of both FIDs, i.e. the text before the
	# first ':'.  The previous check expanded the undefined variables
	# fid_f0/fid_f1 (always empty on both sides) and used a regex-looking
	# pattern that parameter expansion treats as a glob, so the guard
	# below could never trigger.
	if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
		# To guarantee that the f0 and f1 are in the same FID seq
		rm -f "$DIR/$tdir/d0/f0" ||
			error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
		echo "dummy" > "$DIR/$tdir/d0/f0" ||
			error "(3.2) Fail to touch on MDT0"
		f0_fid="$($LFS path2fid "$DIR/$tdir/d0/f0")"
		echo "f0_fid=$f0_fid (replaced)"
	fi

	# Second ':'-separated FID field of f1; printed as data rather than
	# being used as an awk printf format string.
	local oid=$(awk -F':' '{ print $2 }' <<< "$f1_fid")

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
	ln "$DIR/$tdir/d0/f0" "$DIR/$tdir/d0/foo" ||
		error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	# If there is creation after the dangling injection, it may re-use
	# the just released local object (inode) that is referenced by the
	# dangling name entry. It will fail the dangling injection.
	# So before deleting the target object for the dangling name entry,
	# remove some other objects to avoid the target object being reused
	# by some potential creations. LU-7429
	unlinkmany "$DIR/$tdir/d0/t" 10 || error "(5.0) Fail to unlinkmany"

	rm -f "$DIR/$tdir/d0/f1" ||
		error "(5) Fail to unlink $DIR/$tdir/d0/f1"

	echo "'ls' should fail because of dangling name entry"
	ls -ail "$DIR/$tdir/d0/foo" > /dev/null 2>&1 &&
		error "(6) ls should fail."

	#define OBD_FAIL_LFSCK_DELAY3		0x1602
	do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -r -C ||
		error "(7) Fail to start LFSCK for namespace"

	wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
		# While unexpected by the test, it is valid for LFSCK to repair
		# the link to the original object before any data is written.
		local size=$(stat -c %s "$DIR/$tdir/d0/foo")

		# "dummy" plus the trailing newline is 6 bytes.
		if [ "$size" = "6" ] &&
		   [ "$(<"$DIR/$tdir/d0/foo")" = "dummy" ]; then
			log "LFSCK repaired file prematurely"
			# NOTE(review): cleanup + early return assumed for
			# this branch - confirm against the full file.
			cleanup_23c
			return 0
		fi

		stat "$DIR/$tdir/d0/foo"
		$SHOW_NAMESPACE
		error "(8) unexpected size"
	}

	echo "data" >> "$DIR/$tdir/d0/foo" || error "(9) Fail to write"
	cancel_lru_locks osc

	# Release the injected delay and wait for LFSCK to complete before
	# the repair counters are inspected.
	cleanup_23c

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^dangling_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(11) Fail to repair dangling name entry: $repaired"

	local data=$(cat "$DIR/$tdir/d0/foo")
	[ "$data" != "dummy" ] ||
		error "(12) The $DIR/$tdir/d0/foo should not be recovered"
}
run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24: repair a multiple-referenced name entry.
#
# Needs at least two MDTs and no FILESET.  With fail_loc 0x1622
# (OBD_FAIL_LFSCK_MUL_REF) armed, d0/dummy/foo is created and removed again;
# its FID is recorded as cfid.  A namespace LFSCK on all targets must report
# multiple_referenced_repaired == 1, and an entry named "<cfid>-<pfid>-D-0"
# must be listable under .lustre/lost+found/MDT0000/.  pfid is the FID of
# d0/guard on non-ldiskfs backends and of d0/dummy on ldiskfs.
test_24() {
	[ "$MDSCOUNT" -lt 2 ] && skip "needs >= 2 MDTs" && return
	# Return after skip, consistent with the MDSCOUNT check above.
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	echo "#####"
	echo "Two MDT-objects back reference the same name entry via their"
	echo "each own linkEA entry, but the name entry only references one"
	echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
	echo "for the MDT-object that is not recognized. If such MDT-object"
	echo "has no other linkEA entry after the removing, then the LFSCK"
	echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
	echo "#####"

	check_mount_and_prep

	$LFS mkdir -i 1 "$DIR/$tdir/d0" || error "(1) Fail to mkdir d0"

	mkdir "$DIR/$tdir/d0/guard" || error "(1) Fail to mkdir guard"
	$LFS path2fid "$DIR/$tdir/d0/guard"

	mkdir "$DIR/$tdir/d0/dummy" || error "(2) Fail to mkdir dummy"
	$LFS path2fid "$DIR/$tdir/d0/dummy"

	# Parent FID expected in the orphan's name; depends on the backend.
	local pfid
	if [ "$(facet_fstype $SINGLEMDS)" != ldiskfs ]; then
		pfid=$($LFS path2fid "$DIR/$tdir/d0/guard")
	else
		pfid=$($LFS path2fid "$DIR/$tdir/d0/dummy")
	fi

	touch "$DIR/$tdir/d0/guard/foo" ||
		error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
	echo "that references $DIR/$tdir/d0/guard/foo."
	echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
	echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
	echo "there with the same linkEA entry as another MDT-object"
	echo "$DIR/$tdir/d0/guard/foo has"

	#define OBD_FAIL_LFSCK_MUL_REF		0x1622
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
	$LFS mkdir -i 0 "$DIR/$tdir/d0/dummy/foo" ||
		error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
	$LFS path2fid "$DIR/$tdir/d0/dummy/foo"
	local cfid=$($LFS path2fid "$DIR/$tdir/d0/dummy/foo")
	rmdir "$DIR/$tdir/d0/dummy/foo" ||
		error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "stat $DIR/$tdir/d0/dummy/foo should fail"
	stat "$DIR/$tdir/d0/dummy/foo" > /dev/null 2>&1 &&
		error "(6) stat successfully unexpectedly"

	echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
	$START_NAMESPACE -A -r ||
		error "(7) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 8

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^multiple_referenced_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
	error "(9) Fail to repair multiple referenced name entry: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d "$MOUNT/.lustre/lost+found/MDT0000" ] ||
		error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	# The name embeds FIDs, which are printed inside "[...]"; it must be
	# quoted or the shell treats the brackets as a glob character class.
	local cname="$cfid-$pfid-D-0"
	ls -ail "$MOUNT/.lustre/lost+found/MDT0000/$cname" ||
		error "(11) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25: repair a wrong file type stored in a name entry (ldiskfs only).
#
# d0/foo is created while fail_loc 0x1623 (OBD_FAIL_LFSCK_BAD_TYPE) is armed
# on $SINGLEMDS.  After a namespace LFSCK completes, bad_file_type_repaired
# must be 1 and d0 must be listable.
test_25() {
	[ "$(facet_fstype $SINGLEMDS)" != ldiskfs ] &&
		skip "ldiskfs only test" && return

	echo "#####"
	echo "The file type in the name entry does not match the file type"
	echo "claimed by the referenced object. Then the LFSCK will update"
	echo "the file type in the name entry."
	echo "#####"

	check_mount_and_prep

	$LFS mkdir -i 0 "$DIR/$tdir/d0" || error "(1) Fail to mkdir d0"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "the file type stored in the name entry is wrong."

	#define OBD_FAIL_LFSCK_BAD_TYPE		0x1623
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
	touch "$DIR/$tdir/d0/foo" ||
		error "(2) Fail to touch $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to repair bad file type in the name entry"
	$START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state before failing, to ease diagnosis.
		$SHOW_NAMESPACE
		error "(4) unexpected status"
	}

	# Quoted so that a missing counter fails the test cleanly.
	local repaired=$($SHOW_NAMESPACE |
			 awk '/^bad_file_type_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
	error "(5) Fail to repair bad file type in name entry: $repaired"

	ls -ail "$DIR/$tdir/d0" || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
}
run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a: restore a lost local name entry.
#
# d0/foo is created with a second hard link d0/dummy, then unlinked while
# fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed.  After a namespace
# LFSCK on all targets, lost_dirent_repaired must be 1 and d0/foo must be
# back with the same FID it had before.
test_26a() {
	echo "#####"
	echo "The local name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing local name entry back"
	echo "to the normal namespace."
	echo "#####"

	check_mount_and_prep

	$LFS mkdir -i 0 "$DIR/$tdir/d0" || error "(1) Fail to mkdir d0"
	touch "$DIR/$tdir/d0/foo" || error "(2) Fail to create foo"
	local foofid=$($LFS path2fid "$DIR/$tdir/d0/foo")

	ln "$DIR/$tdir/d0/foo" "$DIR/$tdir/d0/dummy" ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f "$DIR/$tdir/d0/foo" ||
		error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	ls -ail "$DIR/$tdir/d0/foo" > /dev/null 2>&1 &&
		error "(5) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	# Quoted so that a missing counter fails the test cleanly.
	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail "$DIR/$tdir/d0/foo" ||
		error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"

	# The restored entry must reference the original object.
	local foofid2=$($LFS path2fid "$DIR/$tdir/d0/foo")
	[ "$foofid" == "$foofid2" ] ||
		error "(10) foo's FID changed: $foofid, $foofid2"
}
run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b: restore a lost remote name entry (needs >= 2 MDTs).
#
# d0 is created with MDT index 1 and d0/foo with MDT index 0; foo is then
# removed while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed.
# After a namespace LFSCK on all targets, lost_dirent_repaired must be 1 and
# d0/foo must be back with its original FID.
test_26b() {
	[ "$MDSCOUNT" -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "#####"
	echo "The remote name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing remote name entry back"
	echo "to the normal namespace."
	echo "#####"

	check_mount_and_prep

	$LFS mkdir -i 1 "$DIR/$tdir/d0" || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 "$DIR/$tdir/d0/foo" || error "(2) Fail to mkdir foo"
	local foofid=$($LFS path2fid "$DIR/$tdir/d0/foo")

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir "$DIR/$tdir/d0/foo" ||
		error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	ls -ail "$DIR/$tdir/d0/foo" > /dev/null 2>&1 &&
		error "(4) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	# Quoted so that a missing counter fails the test cleanly.
	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair lost dirent: $repaired"

	ls -ail "$DIR/$tdir/d0/foo" ||
		error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"

	# The restored entry must reference the original object.
	local foofid2=$($LFS path2fid "$DIR/$tdir/d0/foo")
	[ "$foofid" == "$foofid2" ] ||
		error "(9) foo's FID changed: $foofid, $foofid2"
}
run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a: re-create a lost local parent directory as an orphan.
#
# Not run when FILESET is set.  d0/foo and its hard link d0/dummy are removed
# while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed, then d0 itself
# is removed.  After a namespace LFSCK on all targets, lost_dirent_repaired
# must be 1 and .lustre/lost+found/MDT0000/ must contain an entry whose name
# matches "*-P-*".
test_27a() {
	# Return after skip, consistent with the other skip checks here.
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	echo "#####"
	echo "The local parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."
	echo "#####"

	check_mount_and_prep

	$LFS mkdir -i 0 "$DIR/$tdir/d0" || error "(1) Fail to mkdir d0"
	touch "$DIR/$tdir/d0/foo" || error "(2) Fail to create foo"
	ln "$DIR/$tdir/d0/foo" "$DIR/$tdir/d0/dummy" ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "another hard link and the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f "$DIR/$tdir/d0/foo" ||
		error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	rm -f "$DIR/$tdir/d0/dummy" ||
		error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rm -rf "$DIR/$tdir/d0" || error "(5) Fail to unlink the dir d0"
	ls -ail "$DIR/$tdir/d0" > /dev/null 2>&1 &&
		error "(6) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the lost parent"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	# Quoted so that a missing counter fails the test cleanly.
	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d "$MOUNT/.lustre/lost+found/MDT0000" ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail "$MOUNT/.lustre/lost+found/MDT0000/"

	# The -name pattern is quoted so that find receives it verbatim; left
	# unquoted, the shell would expand it against the current directory
	# first whenever something there happens to match.
	local cname=$(find "$MOUNT/.lustre/lost+found/MDT0000/" -name '*-P-*')
	[ -n "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b: re-create a lost remote parent directory as an orphan.
#
# Needs at least two MDTs and no FILESET.  d0 is created with MDT index 1 and
# d0/foo with MDT index 0.  foo is removed while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed, then d0 is removed.  After a
# namespace LFSCK on all targets, lost_dirent_repaired must be 1 and
# .lustre/lost+found/MDT0001/ must contain an entry matching "*-P-*".
test_27b() {
	[ "$MDSCOUNT" -lt 2 ] && skip "needs >= 2 MDTs" && return
	# Return after skip, consistent with the MDSCOUNT check above.
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	echo "#####"
	echo "The remote parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."
	echo "#####"

	check_mount_and_prep

	$LFS mkdir -i 1 "$DIR/$tdir/d0" || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 "$DIR/$tdir/d0/foo" || error "(2) Fail to mkdir foo"

	$LFS path2fid "$DIR/$tdir/d0"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir "$DIR/$tdir/d0/foo" ||
		error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir "$DIR/$tdir/d0" || error "(4) Fail to unlink the dir d0"
	ls -ail "$DIR/$tdir/d0" > /dev/null 2>&1 &&
		error "(5) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	# Quoted so that a missing counter fails the test cleanly.
	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail "$MOUNT/.lustre/lost+found/"

	echo "There should be an orphan under .lustre/lost+found/MDT0001/"
	[ -d "$MOUNT/.lustre/lost+found/MDT0001" ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"

	ls -ail "$MOUNT/.lustre/lost+found/MDT0001/"

	# The -name pattern is quoted so that find receives it verbatim; left
	# unquoted, the shell would expand it against the current directory
	# first whenever something there happens to match.
	local cname=$(find "$MOUNT/.lustre/lost+found/MDT0001/" -name '*-P-*')
	[ -n "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0001/ should not be empty"
}
run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28: orphan handling is skipped on an MDT that failed during stage one.
#
# Needs at least two MDTs.  d1/a1 and d2/a2 are removed while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed on mds1 and mds2.  A first namespace
# LFSCK runs with fail_loc 0x161c (OBD_FAIL_LFSCK_BAD_NETWORK) armed on mds2:
# mds1 must end as "partial" with lost_dirent_repaired == 0, mds2 as
# "completed" with lost_dirent_repaired == 1.  A second run without the fault
# must report 1 on mds1 and 0 on mds2.
test_28() {
	[ "$MDSCOUNT" -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "#####"
	echo "The target name entry is lost. The LFSCK should insert the"
	echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
	echo "the MDT (on which the orphan MDT-object resides) has ever"
	echo "failed to respond some name entry verification during the"
	echo "first stage-scanning, then the LFSCK should skip to handle"
	echo "orphan MDT-object on this MDT. But other MDTs should not"
	# NOTE(review): closing line of this banner assumed - confirm wording.
	echo "be affected."
	echo "#####"

	check_mount_and_prep
	$LFS mkdir -i 0 "$DIR/$tdir/d1"
	$LFS mkdir -i 1 "$DIR/$tdir/d1/a1"
	$LFS mkdir -i 1 "$DIR/$tdir/d1/a2"

	$LFS mkdir -i 1 "$DIR/$tdir/d2"
	$LFS mkdir -i 0 "$DIR/$tdir/d2/a1"
	$LFS mkdir -i 0 "$DIR/$tdir/d2/a2"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "d1/a1's name entry will be removed, but the d1/a1's object"
	echo "and its linkEA are kept in the system. And the case that"
	echo "d2/a2's name entry will be removed, but the d2/a2's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet mds1 $LCTL set_param fail_loc=0x1624
	do_facet mds2 $LCTL set_param fail_loc=0x1624
	rmdir "$DIR/$tdir/d1/a1" ||
		error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
	rmdir "$DIR/$tdir/d2/a2" ||
		error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
	do_facet mds1 $LCTL set_param fail_loc=0
	do_facet mds2 $LCTL set_param fail_loc=0

	cancel_lru_locks mdc
	cancel_lru_locks osc

	echo "Inject failure, to simulate the MDT0 fail to handle"
	echo "MDT1 LFSCK request during the first-stage scanning."
	#define OBD_FAIL_LFSCK_BAD_NETWORK	0x161c
	do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0

	echo "Trigger namespace LFSCK on all devices to find out orphan object"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_update_facet mds1 "$LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "partial" 32 || {
		error "(4) mds1 is not the expected 'partial'"
	}

	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(5) mds2 is not the expected 'completed'"
	}

	do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0

	# The counters are quoted inside [ ] so that an empty value is a
	# clean failure; [[ ... -eq 0 ]] is deliberately avoided because it
	# would evaluate an empty string as 0 and pass.
	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_namespace |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(6) Expect 0 fixed on mds1, but got: $repaired"

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Expect 1 fixed on mds2, but got: $repaired"

	echo "Trigger namespace LFSCK on all devices again to cleanup"
	$START_NAMESPACE -r -A ||
		error "(8) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 9

	repaired=$(do_facet mds1 $LCTL get_param -n \
		   mdd.$(facet_svc mds1).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(10) Expect 1 fixed on mds1, but got: $repaired"

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(11) Expect 0 fixed on mds2, but got: $repaired"
}
run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a (currently not registered, see the note after the function):
# repair an nlink attribute that is larger than the known name entries.
#
# d0/foo gets a hard link h1 while fail_loc 0x1625
# (OBD_FAIL_LFSCK_MORE_NLINK) is armed; the test expects stat to show 3 links
# afterwards.  After a namespace LFSCK on all targets, nlinks_repaired must
# be 1 and the link count must be 2.
test_29a() {
	echo "#####"
	echo "The object's nlink attribute is larger than the object's known"
	echo "name entries count. The LFSCK will repair the object's nlink"
	echo "attribute to match the known name entries count"
	echo "#####"

	check_mount_and_prep

	$LFS mkdir -i 0 "$DIR/$tdir/d0" || error "(1) Fail to mkdir d0"
	touch "$DIR/$tdir/d0/foo" || error "(2) Fail to create foo"

	echo "Inject failure stub on MDT0 to simulate the case that foo's"
	echo "nlink attribute is larger than its name entries count."

	#define OBD_FAIL_LFSCK_MORE_NLINK	0x1625
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
	ln "$DIR/$tdir/d0/foo" "$DIR/$tdir/d0/h1" ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Drop cached metadata locks before reading the link count.
	cancel_lru_locks mdc
	local count=$(stat --format=%h "$DIR/$tdir/d0/foo")
	[ "$count" -eq 3 ] || error "(4) Cannot inject error: $count"

	echo "Trigger namespace LFSCK to repair the nlink count"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	# Quoted so that a missing counter fails the test cleanly.
	local repaired=$($SHOW_NAMESPACE |
			 awk '/^nlinks_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair nlink count: $repaired"

	cancel_lru_locks mdc
	count=$(stat --format=%h "$DIR/$tdir/d0/foo")
	[ "$count" -eq 2 ] || error "(8) Fail to repair nlink count: $count"
}
# Disable 29a, we only allow nlink to be updated if the known linkEA
# entries is larger than nlink count.
#
#run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b: repair an nlink attribute that is smaller than the known name
# entries.
#
# d0/foo gets a hard link h1 while fail_loc 0x1626
# (OBD_FAIL_LFSCK_LESS_NLINK) is armed; stat must then show 1 link.  After a
# namespace LFSCK on all targets, nlinks_repaired must be 1 and the link
# count must be 2.
test_29b() {
	echo "#####"
	echo "The object's nlink attribute is smaller than the object's known"
	echo "name entries count. The LFSCK will repair the object's nlink"
	echo "attribute to match the known name entries count"
	echo "#####"

	check_mount_and_prep

	$LFS mkdir -i 0 "$DIR/$tdir/d0" || error "(1) Fail to mkdir d0"
	touch "$DIR/$tdir/d0/foo" || error "(2) Fail to create foo"

	echo "Inject failure stub on MDT0 to simulate the case that foo's"
	echo "nlink attribute is smaller than its name entries count."

	#define OBD_FAIL_LFSCK_LESS_NLINK	0x1626
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
	ln "$DIR/$tdir/d0/foo" "$DIR/$tdir/d0/h1" ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Drop cached metadata locks before reading the link count.
	cancel_lru_locks mdc
	local count=$(stat --format=%h "$DIR/$tdir/d0/foo")
	[ "$count" -eq 1 ] || error "(4) Cannot inject error: $count"

	echo "Trigger namespace LFSCK to repair the nlink count"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	# Quoted so that a missing counter fails the test cleanly.
	local repaired=$($SHOW_NAMESPACE |
			 awk '/^nlinks_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair nlink count: $repaired"

	cancel_lru_locks mdc
	count=$(stat --format=%h "$DIR/$tdir/d0/foo")
	[ "$count" -eq 2 ] || error "(8) Fail to repair nlink count: $count"
}
run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c: linkEA size limitation and clearing of the overflow mark.
#
# Creates 150 hard links to guard/f0 under foo/, which the comments below
# state is enough to overflow the linkEA.  With >= 2 MDTs, migrating guard
# must fail and f0's FID must stay the same, both right after the overflow
# and after 100 of the links were removed.  After a namespace LFSCK on all
# targets, linkea_overflow_cleared must be 1 and nlinks_repaired 0; with
# >= 2 MDTs the migration must then succeed and change f0's FID.
test_29c()
{
	echo "#####"
	echo "The namespace LFSCK will create many hard links to the target"
	echo "file as to exceed the linkEA size limitation. Under such case"
	echo "the linkEA will be marked as overflow that will prevent the"
	echo "target file to be migrated. Then remove some hard links to"
	echo "make the left hard links to be held within the linkEA size"
	echo "limitation. But before the namespace LFSCK adding all the"
	echo "missed linkEA entries back, the overflow mark (timestamp)"
	echo "will not be cleared."
	echo "#####"

	check_mount_and_prep

	# Declared once; assigned in each of the MDSCOUNT >= 2 sections.
	local newfid

	mkdir -p "$DIR/$tdir/guard" || error "(0.1) Fail to mkdir"
	$LFS mkdir -i $((MDSCOUNT - 1)) "$DIR/$tdir/foo" ||
		error "(0.2) Fail to mkdir"
	touch "$DIR/$tdir/guard/f0" || error "(1) Fail to create"
	local oldfid=$($LFS path2fid "$DIR/$tdir/guard/f0")

	# define MAX_LINKEA_SIZE	4096
	# sizeof(link_ea_header) = 24
	# sizeof(link_ea_entry) = 18
	# nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
	#	      (sizeof(link_ea_entry) + name_length))
	# If the average name length is 12 bytes, then 150 hard links
	# is totally enough to overflow the linkEA
	echo "Create 150 hard links should succeed although the linkEA overflow"
	createmany -l "$DIR/$tdir/guard/f0" "$DIR/$tdir/foo/ttttttttttt" 150 ||
		error "(2) Fail to hard link"

	cancel_lru_locks mdc
	if [ "$MDSCOUNT" -ge 2 ]; then
		$LFS migrate -m 1 "$DIR/$tdir/guard" 2>/dev/null &&
			error "(3.1) Migrate should fail"

		echo "The object with linkEA overflow should NOT be migrated"
		newfid=$($LFS path2fid "$DIR/$tdir/guard/f0")
		[ "$newfid" == "$oldfid" ] ||
			error "(3.2) Migrate should fail: $newfid != $oldfid"
	fi

	# Remove 100 hard links, then the linkEA should have space
	# to hold the missed linkEA entries.
	echo "Remove 100 hard links to save space for the missed linkEA entries"
	unlinkmany "$DIR/$tdir/foo/ttttttttttt" 100 ||
		error "(4) Fail to unlink"

	if [ "$MDSCOUNT" -ge 2 ]; then
		$LFS migrate -m 1 "$DIR/$tdir/guard" 2>/dev/null &&
			error "(5.1) Migrate should fail"

		# The overflow timestamp is still there, so migration will fail.
		newfid=$($LFS path2fid "$DIR/$tdir/guard/f0")
		[ "$newfid" == "$oldfid" ] ||
			error "(5.2) Migrate should fail: $newfid != $oldfid"
	fi

	# sleep 3 seconds to guarantee that the overflow is recognized
	sleep 3

	echo "Trigger namespace LFSCK to clear the overflow timestamp"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	# Quoted inside [ ] so that a missing counter fails the test cleanly.
	local repaired=$($SHOW_NAMESPACE |
			 awk '/^linkea_overflow_cleared/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to clear linkea overflow: $repaired"

	repaired=$($SHOW_NAMESPACE |
		   awk '/^nlinks_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(9) Unexpected nlink repaired: $repaired"

	if [ "$MDSCOUNT" -ge 2 ]; then
		$LFS migrate -m 1 "$DIR/$tdir/guard" 2>/dev/null ||
			error "(10.1) Migrate failure"

		# Migration should succeed after clear the overflow timestamp.
		newfid=$($LFS path2fid "$DIR/$tdir/guard/f0")
		[ "$newfid" != "$oldfid" ] ||
			error "(10.2) Migrate should succeed"

		ls -l "$DIR/$tdir/foo" > /dev/null ||
			error "(11) 'ls' failed after migration"
	fi

	rm -f "$DIR/$tdir/guard/f0" || error "(12) Fail to unlink f0"
	rm -rf "$DIR/$tdir/foo" || error "(13) Fail to rmdir foo"
}
run_test 29c "verify linkEA size limitation"
# Test 30: recover orphans from the backend /lost+found (ldiskfs only, not
# run when FILESET is set).
#
# foo/d0 is created while fail_loc 0x161d (OBD_FAIL_LFSCK_NO_LINKEA) is
# armed; then d0/d1, d0/f1, d0 and foo/f0 are removed while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed.  The client is unmounted, MDT0 is
# stopped, e2fsck is run on its device, and MDT0 is restarted.  After a
# namespace LFSCK on all targets, local_lost_found_moved must be >= 4,
# foo/f0 must be reachable again, and d0 (holding d1 and f1) must exist as
# .lustre/lost+found/MDT0000/<cfid>-<pfid>-D-0.
test_30() {
	[ "$(facet_fstype $SINGLEMDS)" != ldiskfs ] &&
		skip "ldiskfs only test" && return
	# Return after skip, consistent with the backend check above.
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	echo "#####"
	echo "The namespace LFSCK will move the orphans from backend"
	echo "/lost+found directory to normal client visible namespace"
	echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
	echo "#####"

	check_mount_and_prep

	$LFS mkdir -i 0 "$DIR/$tdir/foo" || error "(1) Fail to mkdir foo"
	# The message now names f0, the file that is actually created here.
	touch "$DIR/$tdir/foo/f0" || error "(2) Fail to touch f0"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "directory d0 has no linkEA entry, then the LFSCK will"
	echo "move it into .lustre/lost+found/MDTxxxx/ later."

	#define OBD_FAIL_LFSCK_NO_LINKEA	0x161d
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
	mkdir "$DIR/$tdir/foo/d0" || error "(3) Fail to mkdir d0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	local pfid=$($LFS path2fid "$DIR/$tdir/foo")
	local cfid=$($LFS path2fid "$DIR/$tdir/foo/d0")

	touch "$DIR/$tdir/foo/d0/f1" || error "(4) Fail to touch f1"
	mkdir "$DIR/$tdir/foo/d0/d1" || error "(5) Fail to mkdir d1"

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "object's name entry will be removed, but not destroy the"
	echo "object. Then backend e2fsck will handle it as orphan and"
	echo "add them into the backend /lost+found directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir "$DIR/$tdir/foo/d0/d1" || error "(6) Fail to rmdir d1"
	rm -f "$DIR/$tdir/foo/d0/f1" || error "(7) Fail to unlink f1"
	rmdir "$DIR/$tdir/foo/d0" || error "(8) Fail to rmdir d0"
	rm -f "$DIR/$tdir/foo/f0" || error "(9) Fail to unlink f0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	umount_client $MOUNT || error "(10) Fail to stop client!"

	stop $SINGLEMDS || error "(11) Fail to stop MDT0"

	# NOTE(review): progress message assumed at this spot - confirm.
	echo "run e2fsck"
	run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
		error "(12) Fail to run e2fsck"

	start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
		error "(13) Fail to start MDT0"

	echo "Trigger namespace LFSCK to recover backend orphans"
	$START_NAMESPACE -r -A ||
		error "(14) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 15

	# Quoted so that a missing counter fails the test cleanly.
	local repaired=$($SHOW_NAMESPACE |
			 awk '/^local_lost_found_moved/ { print $2 }')
	[ "$repaired" -ge 4 ] ||
		error "(16) Fail to recover backend orphans: $repaired"

	mount_client $MOUNT || error "(17) Fail to start client!"

	stat "$DIR/$tdir/foo/f0" || error "(18) f0 is not recovered"

	ls -ail "$MOUNT/.lustre/lost+found/"

	echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
	[ -d "$MOUNT/.lustre/lost+found/MDT0000" ] ||
		error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail "$MOUNT/.lustre/lost+found/MDT0000/"

	# The path is assembled from FIDs, so a non-empty check can never
	# fail; test that the orphan directory really exists.  It is quoted
	# everywhere because FIDs are printed inside "[...]", which the shell
	# would otherwise treat as a glob character class.
	local cname="$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0"
	[ -d "$cname" ] || error "(20) d0 is not recovered"

	stat "${cname}/d1" || error "(21) d1 is not recovered"
	stat "${cname}/f1" || error "(22) f1 is not recovered"
}
run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test 31a: create MDSCOUNT sub-directories under a striped directory while
# fail_loc 0x1628 (fail_val=0) is set, run namespace LFSCK on all targets and
# require name_hash_repaired >= 1. Skipped when there are fewer than 2 MDTs.
4727 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4730 echo "For the name entry under a striped directory, if the name"
4731 echo "hash does not match the shard, then the LFSCK will repair"
4732 echo "the bad name entry"
4735 check_mount_and_prep
4737 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4738 error "(1) Fail to create striped directory"
4740 echo "Inject failure stub on client to simulate the case that"
4741 echo "some name entry should be inserted into other non-first"
4742 echo "shard, but inserted into the first shard by wrong"
# The fail_loc is set with a plain $LCTL call here (not through do_facet).
4744 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4745 $LCTL set_param fail_loc=0x1628 fail_val=0
4746 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4747 error "(2) Fail to create file under striped directory"
# Clear the injection before LFSCK is started.
4748 $LCTL set_param fail_loc=0 fail_val=0
4750 echo "Trigger namespace LFSCK to repair bad name hash"
4751 $START_NAMESPACE -r -A ||
4752 error "(3) Fail to start LFSCK for namespace"
# NOTE(review): the trailing number presumably is the error index the helper
# reports on failure (it fills the gap in this test's numbering) - confirm.
4754 wait_all_targets_blocked namespace completed 4
# Read the repair counter from the namespace LFSCK status output.
4756 local repaired=$($SHOW_NAMESPACE |
4757 awk '/^name_hash_repaired/ { print $2 }')
4758 [ $repaired -ge 1 ] ||
4759 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client, then check every entry can be stat'ed and removed.
4761 umount_client $MOUNT || error "(6) umount failed"
4762 mount_client $MOUNT || error "(7) mount failed"
4764 for ((i = 0; i < $MDSCOUNT; i++)); do
4765 stat $DIR/$tdir/striped_dir/d$i ||
4766 error "(8) Fail to stat d$i after LFSCK"
4767 rmdir $DIR/$tdir/striped_dir/d$i ||
4768 error "(9) Fail to unlink d$i after LFSCK"
4771 rmdir $DIR/$tdir/striped_dir ||
4772 error "(10) Fail to remove the striped directory after LFSCK"
4774 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test 31b: same scenario as the bad-name-hash case with fail_val=1; creates
# MDSCOUNT * 5 sub-directories and reads name_hash_repaired from mds2's
# lfsck_namespace output. Skipped when there are fewer than 2 MDTs.
4777 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4780 echo "For the name entry under a striped directory, if the name"
4781 echo "hash does not match the shard, then the LFSCK will repair"
4782 echo "the bad name entry"
4785 check_mount_and_prep
4787 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4788 error "(1) Fail to create striped directory"
4790 echo "Inject failure stub on client to simulate the case that"
4791 echo "some name entry should be inserted into other non-second"
4792 echo "shard, but inserted into the secod shard by wrong"
# The fail_loc is set with a plain $LCTL call here (not through do_facet).
4794 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4795 $LCTL set_param fail_loc=0x1628 fail_val=1
4796 createmany -d $DIR/$tdir/striped_dir/d $((MDSCOUNT * 5)) ||
4797 error "(2) Fail to create file under striped directory"
# Clear the injection before LFSCK is started.
4798 $LCTL set_param fail_loc=0 fail_val=0
4800 echo "Trigger namespace LFSCK to repair bad name hash"
4801 $START_NAMESPACE -r -A ||
4802 error "(3) Fail to start LFSCK for namespace"
4804 wait_all_targets_blocked namespace completed 4
# The counter is queried on mds2 rather than through $SHOW_NAMESPACE.
4806 local repaired=$(do_facet mds2 $LCTL get_param -n \
4807 mdd.$(facet_svc mds2).lfsck_namespace |
4808 awk '/^name_hash_repaired/ { print $2 }')
4809 echo "repaired $repaired name entries with bad hash"
4810 [ $repaired -ge 1 ] ||
4811 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client, then check every entry can be stat'ed and removed.
4813 umount_client $MOUNT || error "(6) umount failed"
4814 mount_client $MOUNT || error "(7) mount failed"
4816 for ((i = 0; i < $((MDSCOUNT * 5)); i++)); do
4817 stat $DIR/$tdir/striped_dir/d$i ||
4818 error "(8) Fail to stat d$i after LFSCK"
4819 rmdir $DIR/$tdir/striped_dir/d$i ||
4820 error "(9) Fail to unlink d$i after LFSCK"
4823 rmdir $DIR/$tdir/striped_dir ||
4824 error "(10) Fail to remove the striped directory after LFSCK"
4826 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test 31c: create a striped directory while fail_loc 0x1629 is set on
# $SINGLEMDS, run namespace LFSCK on all targets and require
# striped_dirs_repaired == 1. Skipped when there are fewer than 2 MDTs.
4829 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4832 echo "For some reason, the master MDT-object of the striped directory"
4833 echo "may lost its master LMV EA. If nobody created files under the"
4834 echo "master directly after the master LMV EA lost, then the LFSCK"
4835 echo "should re-generate the master LMV EA."
4838 check_mount_and_prep
4840 echo "Inject failure stub on MDT0 to simulate the case that the"
4841 echo "master MDT-object of the striped directory lost the LMV EA."
# The injection is active only while the striped directory is created.
4843 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4844 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4845 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4846 error "(1) Fail to create striped directory"
4847 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4849 echo "Trigger namespace LFSCK to re-generate master LMV EA"
4850 $START_NAMESPACE -r -A ||
4851 error "(2) Fail to start LFSCK for namespace"
4853 wait_all_targets_blocked namespace completed 3
4855 local repaired=$($SHOW_NAMESPACE |
4856 awk '/^striped_dirs_repaired/ { print $2 }')
4857 [ $repaired -eq 1 ] ||
4858 error "(4) Fail to re-generate master LMV EA: $repaired"
# Remount the client; the directory listing must then be empty.
4860 umount_client $MOUNT || error "(5) umount failed"
4861 mount_client $MOUNT || error "(6) mount failed"
4863 local empty=$(ls $DIR/$tdir/striped_dir/)
4864 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
4866 rmdir $DIR/$tdir/striped_dir ||
4867 error "(8) Fail to remove the striped directory after LFSCK"
4869 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# test 31d: create a striped directory while fail_loc 0x1629 is set on
# $SINGLEMDS, create a file in it after a client remount, then run namespace
# LFSCK and require striped_dirs_repaired == 0 and that further creates in
# the directory fail. Skipped when there are fewer than 2 MDTs.
4872 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4875 echo "For some reason, the master MDT-object of the striped directory"
4876 echo "may lost its master LMV EA. If somebody created files under the"
4877 echo "master directly after the master LMV EA lost, then the LFSCK"
4878 echo "should NOT re-generate the master LMV EA, instead, it should"
4879 echo "change the broken striped dirctory as read-only to prevent"
4880 echo "further damage"
4883 check_mount_and_prep
4885 echo "Inject failure stub on MDT0 to simulate the case that the"
4886 echo "master MDT-object of the striped directory lost the LMV EA."
# The injection is active only while the striped directory is created.
4888 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4889 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4890 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4891 error "(1) Fail to create striped directory"
4892 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# Remount the client and modify the directory before LFSCK runs.
4894 umount_client $MOUNT || error "(2) umount failed"
4895 mount_client $MOUNT || error "(3) mount failed"
4897 touch $DIR/$tdir/striped_dir/dummy ||
4898 error "(4) Fail to touch under broken striped directory"
4900 echo "Trigger namespace LFSCK to find out the inconsistency"
4901 $START_NAMESPACE -r -A ||
4902 error "(5) Fail to start LFSCK for namespace"
4904 wait_all_targets_blocked namespace completed 6
# No striped directory repair is expected in this scenario.
4906 local repaired=$($SHOW_NAMESPACE |
4907 awk '/^striped_dirs_repaired/ { print $2 }')
4908 [ $repaired -eq 0 ] ||
4909 error "(7) Re-generate master LMV EA unexpected: $repaired"
4911 stat $DIR/$tdir/striped_dir/dummy ||
4912 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# Inverted check: a successful touch is the failure case.
4914 touch $DIR/$tdir/striped_dir/foo &&
4915 error "(9) The broken striped directory should be read-only"
# Drop the immutable attribute so the directory can be removed.
4917 chattr -i $DIR/$tdir/striped_dir ||
4918 error "(10) Fail to chattr on the broken striped directory"
4920 rmdir $DIR/$tdir/striped_dir ||
4921 error "(11) Fail to remove the striped directory after LFSCK"
4923 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test 31e: create a striped directory while fail_loc 0x162a (fail_val=0) is
# set on $SINGLEMDS, run namespace LFSCK on all targets and require
# striped_shards_repaired == 1. Skipped when there are fewer than 2 MDTs.
4926 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4929 echo "For some reason, the slave MDT-object of the striped directory"
4930 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4931 echo "slave LMV EA."
4934 check_mount_and_prep
4936 echo "Inject failure stub on MDT0 to simulate the case that the"
4937 echo "slave MDT-object (that resides on the same MDT as the master"
4938 echo "MDT-object resides on) lost the LMV EA."
# The injection is active only while the striped directory is created.
4940 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4941 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
4942 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4943 error "(1) Fail to create striped directory"
4944 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4946 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4947 $START_NAMESPACE -r -A ||
4948 error "(2) Fail to start LFSCK for namespace"
4950 wait_all_targets_blocked namespace completed 3
# The counter is read through $SHOW_NAMESPACE.
4952 local repaired=$($SHOW_NAMESPACE |
4953 awk '/^striped_shards_repaired/ { print $2 }')
4954 [ $repaired -eq 1 ] ||
4955 error "(4) Fail to re-generate slave LMV EA: $repaired"
4957 rmdir $DIR/$tdir/striped_dir ||
4958 error "(5) Fail to remove the striped directory after LFSCK"
4960 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test 31f: create a striped directory while fail_loc 0x162a (fail_val=1) is
# set on $SINGLEMDS, run namespace LFSCK on all targets and require
# striped_shards_repaired == 1 as reported by mds2. Skipped when there are
# fewer than 2 MDTs.
4963 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4966 echo "For some reason, the slave MDT-object of the striped directory"
4967 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4968 echo "slave LMV EA."
4971 check_mount_and_prep
4973 echo "Inject failure stub on MDT0 to simulate the case that the"
4974 echo "slave MDT-object (that resides on different MDT as the master"
4975 echo "MDT-object resides on) lost the LMV EA."
# The injection is active only while the striped directory is created.
4977 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4978 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
4979 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4980 error "(1) Fail to create striped directory"
4981 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4983 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4984 $START_NAMESPACE -r -A ||
4985 error "(2) Fail to start LFSCK for namespace"
4987 wait_all_targets_blocked namespace completed 3
# The counter is queried on mds2 rather than through $SHOW_NAMESPACE.
4989 local repaired=$(do_facet mds2 $LCTL get_param -n \
4990 mdd.$(facet_svc mds2).lfsck_namespace |
4991 awk '/^striped_shards_repaired/ { print $2 }')
4992 [ $repaired -eq 1 ] ||
4993 error "(4) Fail to re-generate slave LMV EA: $repaired"
4995 rmdir $DIR/$tdir/striped_dir ||
4996 error "(5) Fail to remove the striped directory after LFSCK"
4998 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test 31g: create a striped directory while fail_loc 0x162b (fail_val=0) is
# set on $SINGLEMDS, run namespace LFSCK on all targets, require
# striped_shards_repaired == 1, then check the directory is usable after a
# client remount. Skipped when there are fewer than 2 MDTs.
5001 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5004 echo "For some reason, the stripe index in the slave LMV EA is"
5005 echo "corrupted. The LFSCK should repair the slave LMV EA."
5008 check_mount_and_prep
5010 echo "Inject failure stub on MDT0 to simulate the case that the"
5011 echo "slave LMV EA on the first shard of the striped directory"
5012 echo "claims the same index as the second shard claims"
# The injection is active only while the striped directory is created.
5014 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
5015 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
5016 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5017 error "(1) Fail to create striped directory"
5018 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5020 echo "Trigger namespace LFSCK to repair the slave LMV EA"
5021 $START_NAMESPACE -r -A ||
5022 error "(2) Fail to start LFSCK for namespace"
5024 wait_all_targets_blocked namespace completed 3
5026 local repaired=$($SHOW_NAMESPACE |
5027 awk '/^striped_shards_repaired/ { print $2 }')
5028 [ $repaired -eq 1 ] ||
5029 error "(4) Fail to repair slave LMV EA: $repaired"
# Remount the client, then create and remove a file in the directory.
5031 umount_client $MOUNT || error "(5) umount failed"
5032 mount_client $MOUNT || error "(6) mount failed"
5034 touch $DIR/$tdir/striped_dir/foo ||
5035 error "(7) Fail to touch file after the LFSCK"
5037 rm -f $DIR/$tdir/striped_dir/foo ||
5038 error "(8) Fail to unlink file after the LFSCK"
5040 rmdir $DIR/$tdir/striped_dir ||
5041 error "(9) Fail to remove the striped directory after LFSCK"
5043 run_test 31g "Repair the corrupted slave LMV EA"
# test 31h: create a striped directory while fail_loc 0x162c (fail_val=0) is
# set on $SINGLEMDS, run namespace LFSCK on all targets, require
# dirent_repaired == 1, then check the directory is usable after a client
# remount. Skipped when there are fewer than 2 MDTs.
5046 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5049 echo "For some reason, the shard's name entry in the striped"
5050 echo "directory may be corrupted. The LFSCK should repair the"
5051 echo "bad shard's name entry."
5054 check_mount_and_prep
5056 echo "Inject failure stub on MDT0 to simulate the case that the"
5057 echo "first shard's name entry in the striped directory claims"
5058 echo "the same index as the second shard's name entry claims."
# The injection is active only while the striped directory is created.
5060 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
5061 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5062 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5063 error "(1) Fail to create striped directory"
5064 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5066 echo "Trigger namespace LFSCK to repair the shard's name entry"
5067 $START_NAMESPACE -r -A ||
5068 error "(2) Fail to start LFSCK for namespace"
5070 wait_all_targets_blocked namespace completed 3
5072 local repaired=$($SHOW_NAMESPACE |
5073 awk '/^dirent_repaired/ { print $2 }')
5074 [ $repaired -eq 1 ] ||
5075 error "(4) Fail to repair shard's name entry: $repaired"
# Remount the client, then create and remove a file in the directory.
5077 umount_client $MOUNT || error "(5) umount failed"
5078 mount_client $MOUNT || error "(6) mount failed"
5080 touch $DIR/$tdir/striped_dir/foo ||
5081 error "(7) Fail to touch file after the LFSCK"
5083 rm -f $DIR/$tdir/striped_dir/foo ||
5084 error "(8) Fail to unlink file after the LFSCK"
5086 rmdir $DIR/$tdir/striped_dir ||
5087 error "(9) Fail to remove the striped directory after LFSCK"
5089 run_test 31h "Repair the corrupted shard's name entry"
# test 32a: start layout LFSCK with fail_loc 0x162d (fail_val=3) set on
# $SINGLEMDS, stop ost1 while the status is 'scanning-phase1', then require
# that $STOP_LFSCK succeeds before ost1 is started again.
5094 umount_client $MOUNT
5096 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5097 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5098 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
# LFSCK must still be in its first phase when the OST is taken down.
5100 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5101 [ "$STATUS" == "scanning-phase1" ] ||
5102 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
5105 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
# Clear the injection before asking LFSCK to stop.
5107 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
5111 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
5113 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5114 error "(5) Fail to start ost1"
5116 run_test 32a "stop LFSCK when some OST failed"
# test 32b: start namespace LFSCK on all targets with fail_loc 0x162d
# (fail_val=3) set on $SINGLEMDS, stop mds2 once the status reaches
# 'scanning-phase1', then require that $STOP_LFSCK succeeds before mds2 is
# started again. Skipped when there are fewer than 2 MDTs.
5120 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# One directory on MDT index 1 holding two directories striped over all MDTs.
5123 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5124 error "(1) Fail to create $DIR/$tdir/dp"
5125 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5126 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5127 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5128 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5129 umount_client $MOUNT
5131 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5132 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5133 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
# The awk field reference is backslash-escaped because the command string is
# handed to wait_update_facet rather than run directly here.
5135 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5136 mdd.${MDT_DEV}.lfsck_namespace |
5137 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5139 error "(5) unexpected status"
5143 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
# Clear the injection before asking LFSCK to stop.
5145 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
5149 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
5151 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5152 error "(8) Fail to start MDT2"
5154 run_test 32b "stop LFSCK when some MDT failed"
# test 33: run layout and namespace LFSCK with specific command line options
# and compare the 'param' line of each status output with the expected list.
5160 $START_LAYOUT --dryrun -o -r ||
5161 error "(1) Fail to start layout LFSCK"
5162 wait_all_targets_blocked layout completed 2
# Expected after starting with: --dryrun -o -r
5164 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5165 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5166 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
5168 $START_NAMESPACE -e abort -A -r ||
5169 error "(4) Fail to start namespace LFSCK"
5170 wait_all_targets_blocked namespace completed 5
# Expected after starting with: -e abort -A -r
5172 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5173 [ "$PARAMS" == "failout,all_targets" ] ||
5174 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5176 run_test 33 "check LFSCK paramters"
# test 34: create a directory on MDT index 1 while fail_loc 0x1630 is set on
# $SINGLEMDS, run namespace LFSCK and require dirent_repaired == 1; a second
# LFSCK run must then report dirent_repaired == 0. Skipped unless there are
# at least 2 MDTs and $SINGLEMDS uses a zfs backend.
5180 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5181 [ $(facet_fstype $SINGLEMDS) != zfs ] &&
5182 skip "Only valid for ZFS backend" && return
5186 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
5187 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5188 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5189 error "(1) Fail to create $DIR/$tdir/dummy"
# Clear the injection before LFSCK is started.
5191 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
5192 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
5193 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5194 mdd.${MDT_DEV}.lfsck_namespace |
5195 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5197 error "(3) unexpected status"
# First run: exactly one repair is expected.
5200 local repaired=$($SHOW_NAMESPACE |
5201 awk '/^dirent_repaired/ { print $2 }')
5202 [ $repaired -eq 1 ] ||
5203 error "(4) Fail to repair the lost agent object: $repaired"
# Second run: nothing should be left to repair.
5205 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5206 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5207 mdd.${MDT_DEV}.lfsck_namespace |
5208 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5210 error "(6) unexpected status"
5213 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5214 [ $repaired -eq 0 ] ||
5215 error "(7) Unexpected repairing: $repaired"
5217 run_test 34 "LFSCK can rebuild the lost agent object"
# test 35: create a directory on MDT index 1 while fail_loc 0x1631 is set on
# mds2, run namespace LFSCK on all targets and require
# agent_entries_repaired == 1 on mds2; after setupall a second LFSCK run must
# report agent_entries_repaired == 0. Skipped when there are fewer than 2 MDTs.
5221 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5225 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
5226 do_facet mds2 $LCTL set_param fail_loc=0x1631
5227 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5228 error "(1) Fail to create $DIR/$tdir/dummy"
# Clear the injection before LFSCK is started.
5231 do_facet mds2 $LCTL set_param fail_loc=0
5232 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
# Only mds2 is polled here, so the message names it directly; this test does
# not set any loop variable that could identify the MDS.
5233 wait_update_facet mds2 "$LCTL get_param -n \
5234 mdd.$(facet_svc mds2).lfsck_namespace |
5235 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5236 error "(3) MDS2 is not the expected 'completed'"
# First run: exactly one repair is expected on mds2.
5238 local repaired=$(do_facet mds2 $LCTL get_param -n \
5239 mdd.$(facet_svc mds2).lfsck_namespace |
5240 awk '/^agent_entries_repaired/ { print $2 }')
5241 [ $repaired -eq 1 ] ||
5242 error "(4) Fail to repair the lost agent entry: $repaired"
5244 echo "stopall to cleanup object cache"
5247 setupall > /dev/null
# Second run: nothing should be left to repair.
5249 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
5250 wait_update_facet mds2 "$LCTL get_param -n \
5251 mdd.$(facet_svc mds2).lfsck_namespace |
5252 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5253 error "(6) MDS2 is not the expected 'completed'"
5255 repaired=$(do_facet mds2 $LCTL get_param -n \
5256 mdd.$(facet_svc mds2).lfsck_namespace |
5257 awk '/^agent_entries_repaired/ { print $2 }')
5258 [ $repaired -eq 0 ] ||
5259 error "(7) Unexpected repairing: $repaired"
5261 run_test 35 "LFSCK can rebuild the lost agent entry"
# test 36a: create three mirrored files, split a different mirror from each
# one while fail_loc 0x1616 is set on mds1, then run layout LFSCK with -o and
# require repaired_orphan == 9 and that every file is back to 3 mirrors,
# including the mirror id that was split off. Skipped with fewer than 3 OSTs.
5264 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5267 echo "The target MDT-object's LOV EA corrupted as to lose one of the "
5268 echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
5269 echo "with the PFID EA of related OST-object(s) belong to the mirror."
5272 check_mount_and_prep
# Print grant and space information now and again when the trap fires.
5276 lctl get_param osc.*.*grant*
5277 stack_trap "lfs df $DIR; lfs df -i $DIR; lctl get_param osc.*.*grant*"
# f0, f1 and f2 are created with the same three-mirror layout.
5279 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5280 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5281 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5282 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5283 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
5284 error "(1) Fail to create mirror file $DIR/$tdir/f1"
5285 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5286 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
5287 error "(2) Fail to create mirror file $DIR/$tdir/f2"
# Write 4MB to each file, then resync its mirrors.
5289 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5290 error "(3) Fail to write $DIR/$tdir/f0"
5291 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
5292 error "(4) Fail to write $DIR/$tdir/f1"
5293 dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
5294 error "(5) Fail to write $DIR/$tdir/f2"
5296 $LFS mirror resync $DIR/$tdir/f0 ||
5297 error "(6) Fail to resync $DIR/$tdir/f0"
5298 $LFS mirror resync $DIR/$tdir/f1 ||
5299 error "(7) Fail to resync $DIR/$tdir/f1"
5300 $LFS mirror resync $DIR/$tdir/f2 ||
5301 error "(8) Fail to resync $DIR/$tdir/f2"
5303 cancel_lru_locks mdc
5304 cancel_lru_locks osc
5306 $LFS getstripe $DIR/$tdir/f0 ||
5307 error "(9) Fail to getstripe for $DIR/$tdir/f0"
5308 $LFS getstripe $DIR/$tdir/f1 ||
5309 error "(10) Fail to getstripe for $DIR/$tdir/f1"
5310 $LFS getstripe $DIR/$tdir/f2 ||
5311 error "(11) Fail to getstripe for $DIR/$tdir/f2"
5313 echo "Inject failure, to simulate the case of missing one mirror in LOV"
5314 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5315 do_facet mds1 $LCTL set_param fail_loc=0x1616
# A different mirror id is split (with -d) from each of the three files.
5317 $LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
5318 error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
5319 $LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
5320 error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
5321 $LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
5322 error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"
5326 do_facet mds1 $LCTL set_param fail_loc=0
# Inverted checks: finding the split mirror id in the layout is the failure.
5328 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
5329 error "(15) The 1st of mirror is not destroyed"
5330 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
5331 error "(16) The 2nd of mirror is not destroyed"
5332 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
5333 error "(17) The 3rd of mirror is not destroyed"
# Each file must be down to two mirrors before LFSCK runs.
5337 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5338 [ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
5339 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5340 [ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
5341 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5342 [ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"
5344 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5345 $START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"
5347 for k in $(seq $MDSCOUNT); do
5348 # The LFSCK status query internal is 30 seconds. For the case
5349 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5350 # time to guarantee the status sync up.
5351 wait_update_facet mds${k} "$LCTL get_param -n \
5352 mdd.$(facet_svc mds${k}).lfsck_layout |
5353 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5354 error "(22) MDS${k} is not the expected 'completed'"
5357 for k in $(seq $OSTCOUNT); do
5358 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5359 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5360 awk '/^status/ { print $2 }')
5361 [ "$cur_status" == "completed" ] ||
5362 error "(23) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): 9 presumably is 3 files x 3 OST-objects in the split mirror
# (-o 0,1 plus -o 2 style components) - confirm.
5365 local repaired=$(do_facet mds1 $LCTL get_param -n \
5366 mdd.$(facet_svc mds1).lfsck_layout |
5367 awk '/^repaired_orphan/ { print $2 }')
5368 [ $repaired -eq 9 ] ||
5369 error "(24) Expect 9 fixed on mds1, but got: $repaired"
# After LFSCK every file must report three mirrors again.
5371 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5372 [ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
5373 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5374 [ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
5375 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5376 [ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"
5378 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
5379 $LFS getstripe $DIR/$tdir/f0
5380 error "(28) The 1st of mirror is not recovered"
5383 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
5384 $LFS getstripe $DIR/$tdir/f1
5385 error "(29) The 2nd of mirror is not recovered"
5388 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
5389 $LFS getstripe $DIR/$tdir/f2
5390 error "(30) The 3rd of mirror is not recovered"
5393 run_test 36a "rebuild LOV EA for mirrored file (1)"
# test 36b: create one three-mirror file, remove it while fail_loc 0x1616 is
# set on mds1, then run layout LFSCK with -o and require repaired_orphan == 9
# and that the file named <fid>-R-0 under .lustre/lost+found/MDT0000 has
# 3 mirrors and 6 components. Skipped when FILESET is set or with fewer
# than 3 OSTs.
5396 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5397 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5400 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5401 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5402 echo "with the PFID EA of related OST-object(s) belong to the file. "
5405 check_mount_and_prep
5407 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5408 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5409 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# The FID is saved now because it forms the lost+found name checked later.
5411 local fid=$($LFS path2fid $DIR/$tdir/f0)
5413 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5414 error "(1) Fail to write $DIR/$tdir/f0"
5415 $LFS mirror resync $DIR/$tdir/f0 ||
5416 error "(2) Fail to resync $DIR/$tdir/f0"
5418 cancel_lru_locks mdc
5419 cancel_lru_locks osc
5421 $LFS getstripe $DIR/$tdir/f0 ||
5422 error "(3) Fail to getstripe for $DIR/$tdir/f0"
5424 echo "Inject failure, to simulate the case of missing the MDT-object"
5425 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5426 do_facet mds1 $LCTL set_param fail_loc=0x1616
5427 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
5431 do_facet mds1 $LCTL set_param fail_loc=0
5433 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5434 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
5436 for k in $(seq $MDSCOUNT); do
5437 # The LFSCK status query internal is 30 seconds. For the case
5438 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5439 # time to guarantee the status sync up.
5440 wait_update_facet mds${k} "$LCTL get_param -n \
5441 mdd.$(facet_svc mds${k}).lfsck_layout |
5442 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5443 error "(6) MDS${k} is not the expected 'completed'"
5446 for k in $(seq $OSTCOUNT); do
5447 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5448 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5449 awk '/^status/ { print $2 }')
5450 [ "$cur_status" == "completed" ] ||
5451 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): 9 presumably is 3 mirrors x 3 OST-objects each - confirm.
5454 local count=$(do_facet mds1 $LCTL get_param -n \
5455 mdd.$(facet_svc mds1).lfsck_layout |
5456 awk '/^repaired_orphan/ { print $2 }')
5457 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# The remaining checks inspect the file under lost+found; count is reused
# for the mirror and component counts.
5459 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5460 count=$($LFS getstripe --mirror-count $name)
5461 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5463 count=$($LFS getstripe --component-count $name)
5464 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
5466 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5467 $LFS getstripe $name
5468 error "(11) The 1st of mirror is not recovered"
5471 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5472 $LFS getstripe $name
5473 error "(12) The 2nd of mirror is not recovered"
5476 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5477 $LFS getstripe $name
5478 error "(13) The 3rd of mirror is not recovered"
5481 run_test 36b "rebuild LOV EA for mirrored file (2)"
# test 36c: create one two-mirror file, write it again after a resync, remove
# it while fail_loc 0x1616 is set on mds1, then run layout LFSCK with -o and
# require repaired_orphan == 6 and that the file named <fid>-R-0 under
# .lustre/lost+found/MDT0000 has 2 mirrors, 4 components and the lcme_flags
# saved before the removal. Skipped when FILESET is set or with fewer than
# 3 OSTs.
5484 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5485 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5488 echo "The mirrored file has been modified, not resynced yet, then "
5489 echo "lost its MDT-object, but relatd OST-objects are still there. "
5490 echo "The layout LFSCK should rebuild the LOV EA and relatd status "
5491 echo "with the PFID EA of related OST-object(s) belong to the file. "
5494 check_mount_and_prep
5496 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5498 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# The FID is saved now because it forms the lost+found name checked later.
5500 local fid=$($LFS path2fid $DIR/$tdir/f0)
5502 # The 1st dd && resync makes all related OST-objects have been written
5503 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5504 error "(1.1) Fail to write $DIR/$tdir/f0"
5505 $LFS mirror resync $DIR/$tdir/f0 ||
5506 error "(1.2) Fail to resync $DIR/$tdir/f0"
5507 # The 2nd dd makes one mirror to be stale
5508 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5509 error "(1.3) Fail to write $DIR/$tdir/f0"
5511 cancel_lru_locks mdc
5512 cancel_lru_locks osc
5514 $LFS getstripe $DIR/$tdir/f0 ||
5515 error "(2) Fail to getstripe for $DIR/$tdir/f0"
# Remember the lcme_flags found in the first and the last 10 lines of the
# getstripe output, for comparison after the rebuild.
5517 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5518 awk '/lcme_flags/ { print $2 }')
5519 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5520 awk '/lcme_flags/ { print $2 }')
5522 echo "Inject failure, to simulate the case of missing the MDT-object"
5523 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5524 do_facet mds1 $LCTL set_param fail_loc=0x1616
5525 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
5529 do_facet mds1 $LCTL set_param fail_loc=0
5531 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5532 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
5534 for k in $(seq $MDSCOUNT); do
5535 # The LFSCK status query internal is 30 seconds. For the case
5536 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5537 # time to guarantee the status sync up.
5538 wait_update_facet mds${k} "$LCTL get_param -n \
5539 mdd.$(facet_svc mds${k}).lfsck_layout |
5540 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5541 error "(5) MDS${k} is not the expected 'completed'"
5544 for k in $(seq $OSTCOUNT); do
5545 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5546 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5547 awk '/^status/ { print $2 }')
5548 [ "$cur_status" == "completed" ] ||
5549 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
# This file has two mirrors, so the expected repair count is 6; the failure
# message states the same number the comparison uses.
5552 local count=$(do_facet mds1 $LCTL get_param -n \
5553 mdd.$(facet_svc mds1).lfsck_layout |
5554 awk '/^repaired_orphan/ { print $2 }')
5555 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
# The remaining checks inspect the file under lost+found; count is reused
# for the mirror and component counts.
5557 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5558 count=$($LFS getstripe --mirror-count $name)
5559 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5561 count=$($LFS getstripe --component-count $name)
5562 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
5564 local flags=$($LFS getstripe $name | head -n 10 |
5565 awk '/lcme_flags/ { print $2 }')
5566 [ "$flags" == "$saved_flags1" ] || {
5567 $LFS getstripe $name
5568 error "(10) expect flags $saved_flags1, got $flags"
5571 flags=$($LFS getstripe $name | tail -n 10 |
5572 awk '/lcme_flags/ { print $2 }')
5573 [ "$flags" == "$saved_flags2" ] || {
5574 $LFS getstripe $name
5575 error "(11) expect flags $saved_flags2, got $flags"
5578 run_test 36c "rebuild LOV EA for mirrored file (3)"
# test 37: create a directory on MDT index 0, start a background multiop on
# it, then run namespace LFSCK on all targets and wait for completion.
5584 local t_dir="$DIR/$tdir/d0"
5585 check_mount_and_prep
5587 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
5588 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
# Signal the process in $PID before reporting the failure, so the signal is
# still sent if error does not return.
# NOTE(review): confirm whether error exits in test-framework.sh.
5592 $START_NAMESPACE -r -A || {
5593 kill -USR1 $PID; error "(3) Fail to start LFSCK for namespace!"; }
5595 wait_all_targets_blocked namespace completed 4
5600 run_test 37 "LFSCK must skip a ORPHAN"
# test 38: create a file with a foreign (daos) layout, verify its LOV EA
# fields, run namespace and layout LFSCK on all targets and require that
# neither reports a repair, then verify the same fields again and that
# setstripe, read and write on the file all fail before removing it.
# Skipped unless the MDS version is newer than 2.12.51.
5604 [[ $MDS1_VERSION -le $(version_code 2.12.51) ]] &&
5605 skip "Need MDS version newer than 2.12.51"
5607 test_mkdir $DIR/$tdir
# Two random UUIDs form the foreign layout value "<uuid1>@<uuid2>".
5608 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5609 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5611 # create foreign file
5612 $LFS setstripe --foreign=daos --flags 0xda05 \
5613 -x "${uuid1}@${uuid2}" $DIR/$tdir/$tfile ||
5614 error "$DIR/$tdir/$tfile: create failed"
5616 $LFS getstripe -v $DIR/$tdir/$tfile |
5617 grep "lfm_magic:.*0x0BD70BD0" ||
5618 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5619 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5620 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5621 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5622 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*daos" ||
5623 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5624 $LFS getstripe -v $DIR/$tdir/$tfile |
5625 grep "lfm_flags:.*0x0000DA05" ||
5626 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5627 $LFS getstripe $DIR/$tdir/$tfile |
5628 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5629 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5631 # modify striping should fail
5632 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5633 error "$DIR/$tdir/$tfile: setstripe should fail"
5635 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5637 wait_all_targets_blocked namespace completed 1
5639 # check that "global" namespace_repaired == 0 !!!
5640 local repaired=$(do_facet mds1 \
5641 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5642 awk '/^namespace_repaired/ { print \\\$2 }'")
5643 [ $repaired -eq 0 ] ||
5644 error "(2) Expect no namespace repair, but got: $repaired"
5646 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5648 wait_all_targets_blocked layout completed 2
5650 # check that "global" layout_repaired == 0 !!!
5651 local repaired=$(do_facet mds1 \
5652 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5653 awk '/^layout_repaired/ { print \\\$2 }'")
5654 [ $repaired -eq 0 ] ||
5655 error "(2) Expect no layout repair, but got: $repaired"
5657 echo "post-lfsck checks of foreign file"
5659 $LFS getstripe -v $DIR/$tdir/$tfile |
5660 grep "lfm_magic:.*0x0BD70BD0" ||
5661 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5662 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5663 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5664 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5665 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*daos" ||
5666 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5667 $LFS getstripe -v $DIR/$tdir/$tfile |
5668 grep "lfm_flags:.*0x0000DA05" ||
5669 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5670 $LFS getstripe $DIR/$tdir/$tfile |
5671 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5672 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5674 # modify striping should fail
5675 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5676 error "$DIR/$tdir/$tfile: setstripe should fail"
# Inverted checks: a successful read or write is the failure case, and both
# are reported through error.
5679 cat $DIR/$tdir/$tfile && error "$DIR/$tdir/$tfile: read should fail"
5680 cat /etc/passwd > $DIR/$tdir/$tfile &&
5681 error "$DIR/$tdir/$tfile: write should fail"
5683 #remove foreign file
5684 rm $DIR/$tdir/$tfile ||
5685 error "$DIR/$tdir/$tfile: remove of foreign file has failed"
5687 run_test 38 "LFSCK does not break foreign file and reverse is also true"
# Statements of the test registered below as 39: create a directory with a
# foreign LMV EA, run namespace and layout LFSCK, and verify that LFSCK
# repaired nothing, that the foreign LMV EA is unchanged, that file creation
# inside the foreign directory is still rejected, and that chmod, chown and
# rmdir on it still work.
5691 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.12.51) ]] &&
5692 skip "Need MDS version newer than 2.12.51"
5694 test_mkdir $DIR/$tdir
# Two kernel-generated UUIDs are joined as "<uuid1>@<uuid2>" and stored as the
# foreign directory's xattr value.
5695 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5696 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5698 # create foreign dir
5699 $LFS mkdir --foreign=daos --xattr="${uuid1}@${uuid2}" --flags=0xda05 \
5700 $DIR/$tdir/${tdir}2 ||
5701 error "$DIR/$tdir/${tdir}2: create failed"
# Pre-LFSCK sanity checks: each field printed by getdirstripe must match what
# was passed to "lfs mkdir" above.
5703 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5704 grep "lfm_magic:.*0x0CD50CD0" ||
5705 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
5706 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5707 # - sizeof(lfm_type) - sizeof(lfm_flags)
# 73 is the length of "<uuid1>@<uuid2>": 36 + 1 + 36 characters.
5708 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
5709 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
5710 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*daos" ||
5711 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
5712 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5713 grep "lfm_flags:.*0x0000DA05" ||
5714 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
5715 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
5716 grep "lfm_value.*${uuid1}@${uuid2}" ||
5717 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
5719 # file create in dir should fail
# The message has to be passed to error(); without it the shell would try to
# execute the message string as a command and an unexpected successful create
# would never fail the test.
5720 touch $DIR/$tdir/${tdir}2/$tfile &&
5721 error "$DIR/${tdir}2: file create should fail"
# Attribute changes on the foreign directory itself are expected to succeed.
5724 chmod 777 $DIR/$tdir/${tdir}2 ||
5725 error "$DIR/${tdir}2: chmod failed"
5728 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
5729 error "$DIR/${tdir}2: chown failed"
# Run the namespace LFSCK and wait for it to report "completed".
5731 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5733 wait_all_targets_blocked namespace completed 1
5735 # check that "global" namespace_repaired == 0 !!!
5736 local repaired=$(do_facet mds1 \
5737 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5738 awk '/^namespace_repaired/ { print \\\$2 }'")
5739 [ $repaired -eq 0 ] ||
5740 error "(2) Expect nothing to be repaired, but got: $repaired"
# Run the layout LFSCK and wait for it to report "completed".
5742 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5744 wait_all_targets_blocked layout completed 2
5746 # check that "global" layout_repaired == 0 !!!
5747 local repaired=$(do_facet mds1 \
5748 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5749 awk '/^layout_repaired/ { print \\\$2 }'")
5750 [ $repaired -eq 0 ] ||
5751 error "(2) Expect no layout repair, but got: $repaired"
# Post-LFSCK: repeat the same checks to prove the foreign directory was not
# altered by either scan.
5753 echo "post-lfsck checks of foreign dir"
5755 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5756 grep "lfm_magic:.*0x0CD50CD0" ||
5757 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
5758 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5759 # - sizeof(lfm_type) - sizeof(lfm_flags)
5760 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
5761 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
5762 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*daos" ||
5763 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
5764 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5765 grep "lfm_flags:.*0x0000DA05" ||
5766 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
5767 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
5768 grep "lfm_value.*${uuid1}@${uuid2}" ||
5769 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
5771 # file create in dir should fail
# As above, the message must go through error() so that an unexpected
# successful create fails the test.
5772 touch $DIR/$tdir/${tdir}2/$tfile &&
5773 error "$DIR/${tdir}2: file create should fail"
5776 chmod 777 $DIR/$tdir/${tdir}2 ||
5777 error "$DIR/${tdir}2: chmod failed"
5780 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
5781 error "$DIR/${tdir}2: chown failed"
# Finally the foreign directory must be removable.
5784 rmdir $DIR/$tdir/${tdir}2 ||
5785 error "$DIR/$tdir/${tdir}2: remove of foreign dir has failed"
5787 run_test 39 "LFSCK does not break foreign dir and reverse is also true"
# Statements of the test registered below as 40a: check that a file's layout
# parameters are the same before and after its parent directory is migrated
# from MDT1 to MDT0 and a layout LFSCK is run.
5790 [[ $MDSCOUNT -ge 2 ]] || skip "needs >= 2 MDTs"
5792 check_mount_and_prep
# Create dir1 on MDT index 1 and give it a three-component default layout
# (components ending at 1M, 128M and eof).
5793 $LFS mkdir -i 1 $DIR/$tdir/dir1
5794 $LFS setstripe -E 1M -c1 -S 1M -E 128M -c2 -S 4M -E eof $DIR/$tdir/dir1
# Create a file in dir1 and record its layout parameters as the baseline.
5796 touch $DIR/$tdir/dir1/f1
5797 local layout1=$(get_layout_param $DIR/$tdir/dir1/f1)
5799 echo "Migrate $DIR/$tdir/dir1 from MDT1 to MDT0"
5800 $LFS migrate -m 0 $DIR/$tdir/dir1
5802 echo "trigger LFSCK for layout"
5803 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -t layout -r
# Poll the lfsck_layout status line on the MDS until it reads "completed".
# NOTE(review): the trailing 32 is presumably the maximum wait in seconds --
# confirm against wait_update_facet.
5805 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5806 mdd.${MDT_DEV}.lfsck_layout |
5807 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5809 error "(2) unexpected status"
# Re-read the layout parameters after LFSCK; they must equal the baseline.
5812 local layout2=$(get_layout_param $DIR/$tdir/dir1/f1)
5814 [[ "$layout1" == "$layout2" ]] || error "layout lost after lfsck"
5816 run_test 40a "LFSCK correctly fixes lmm_oi in composite layout"
5818 # Restore the MDS/OST sizes and OST count that were saved before the tests
# shrank them (see the SAVED_* assignments near the top of the script).
5819 MDSSIZE=${SAVED_MDSSIZE}
5820 OSTSIZE=${SAVED_OSTSIZE}
5821 OSTCOUNT=${SAVED_OSTCOUNT}
5823 # Finally, clean up the system and set it up again with a reformat.
5824 REFORMAT="yes" cleanup_and_setup_lustre
5827 check_and_cleanup_lustre