3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
# Locate the Lustre tree (default: parent of the directory holding this script)
# and load the shared test framework from it.
11 LUSTRE=${LUSTRE:-$(dirname $0)/..}
12 . $LUSTRE/tests/test-framework.sh
# Seed the always-excluded test list from the caller-supplied SANITY_LFSCK_EXCEPT.
16 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT "
18 # DNE does not support striped directory on zfs-based backend yet.
19 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
20 #Bug number for excepting test
22 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
24 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
25 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
# Exclude tests 11-21 when ost1 is older than 2.5.55, and tests 2d, 2e, 3 and
# 22-31 when the MDS is older than 2.6.50.
27 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
28 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
30 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
31 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
# When SLOW is set to no, the slow-test exclusion list is emptied.
33 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
# Leave with status 0 when require_dsh_mds fails.
36 require_dsh_mds || exit 0
# Skip the suite when check_versions fails (interoperation mode, per the message).
40 if ! check_versions; then
41 skip "It is NOT necessary to test lfsck under interoperation mode"
# Skip and leave with status 0 when the MDS is older than 2.3.60.
45 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
46 skip "Need MDS version at least 2.3.60" && exit 0
# Keep the incoming sizing values before they are adjusted below.
50 SAVED_MDSSIZE=${MDSSIZE}
51 SAVED_OSTSIZE=${OSTSIZE}
52 SAVED_OSTCOUNT=${OSTCOUNT}
53 # use small MDS + OST size to speed formatting time
54 # do not use too small MDSSIZE/OSTSIZE, which affects the default journal size
# zfs-backed targets are given a size of 300000.
56 [ $(facet_fstype $SINGLEMDS) == zfs ] && MDSSIZE=300000
58 [ $(facet_fstype ost1) == zfs ] && OSTSIZE=300000
60 # no need for too many OSTs; cap the count to reduce the format/start/stop overhead
62 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
64 # build up a clean test environment.
65 REFORMAT="yes" check_and_setup_lustre
# Target names of the first MDT and the first OST, derived from FSNAME.
67 MDT_DEV="${FSNAME}-MDT0000"
68 OST_DEV="${FSNAME}-OST0000"
# Device name for SINGLEMDS: the facet name with every mds substring removed is
# handed to mdsdevname.
69 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Command prefixes kept as plain strings. The tests expand them unquoted and
# append extra options (for example -r) at the call site.
70 START_NAMESPACE="do_facet $SINGLEMDS \
71 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
72 START_LAYOUT="do_facet $SINGLEMDS \
73 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
74 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
75 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Read-only status queries for the namespace and layout LFSCK components.
76 SHOW_NAMESPACE="do_facet $SINGLEMDS \
77 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
78 SHOW_LAYOUT="do_facet $SINGLEMDS \
79 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
80 SHOW_LAYOUT_ON_OST="do_facet ost1 \
81 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount option sets passed to start when the tests restart SINGLEMDS.
82 MOUNT_OPTS_SCRUB="-o user_xattr"
83 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
84 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
# Fragment of the data-preparation helper (its header and the assignments of
# ndirs, nfiles and igif are not part of this listing). It fills $DIR/$tdir
# with the framework scripts plus entries whose counts come from ndirs and nfiles.
93 echo "preparing... $nfiles * $ndirs files will be created $(date)."
# When igif is non-empty, inject OBD_FAIL_FID_IGIF on the MDS for the creations below.
94 if [ ! -z $igif ]; then
95 #define OBD_FAIL_FID_IGIF 0x1504
96 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
99 cp $LUSTRE/tests/*.sh $DIR/$tdir/
100 if [ $ndirs -gt 0 ]; then
# ndirs entries each under the d and f prefixes; the d entries are used as
# parent directories by the loop below.
101 createmany -d $DIR/$tdir/d $ndirs
102 createmany -m $DIR/$tdir/f $ndirs
103 if [ $nfiles -gt 0 ]; then
# nfiles entries inside every d directory, with createmany output discarded.
104 for ((i = 0; i < $ndirs; i++)); do
105 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
106 /dev/null || error "createmany $nfiles"
109 createmany -d $DIR/$tdir/e $ndirs
# In the igif case, create one more file and then clear the injected failure.
112 if [ ! -z $igif ]; then
113 touch $DIR/$tdir/dummy
114 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
117 echo "prepared $(date)."
# Stop SINGLEMDS, run run_e2fsck with -n against the device of MDT 1 on the
# active MDS host, raise error (2) when an inconsistency is detected, and then
# restart the MDT with the noscrub mount options.
# Backends other than ldiskfs return immediately without doing anything.
120 run_e2fsck_on_mdt0() {
121 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
123 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
124 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
126 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
127 error "(2) Detected inconsistency on MDT0"
129 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
130 error "(3) Fail to start MDT0"
# Run one lfsck_query with -w for component $com through MDT0000 on mds1 and
# require that the count printed on the ${com}_mdts_${status} line equals
# MDSCOUNT. Otherwise dump the plain query output and raise error ($err).
# The assignments of com, status and err are not part of this listing.
# NOTE(review): -w presumably makes lfsck_query wait for LFSCK to finish - confirm.
133 wait_all_targets_blocked() {
138 local count=$(do_facet mds1 \
139 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
140 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
141 [[ $count -eq $MDSCOUNT ]] || {
142 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
143 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# Fragment of a second wait helper (header not part of this listing): hands the
# same query, without -w, to wait_update_facet with expected value MDSCOUNT and
# limit LTIME. On failure it dumps the query output and raises error ($err).
152 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
153 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
154 "$MDSCOUNT" $LTIME || {
155 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
156 error "($err) some MDTs are not in ${status}"
# Test 0 body (the function header is not part of this listing): drive the
# namespace LFSCK by hand - start, stop, resume, reset - and check the reported
# status after each step.
163 #define OBD_FAIL_LFSCK_DELAY1 0x1600
164 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
165 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
167 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
# With the delay fault injected, the scan is expected to still be in phase 1.
169 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
170 [ "$STATUS" == "scanning-phase1" ] ||
171 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
# An explicit stop must show up as status stopped.
173 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
175 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
176 [ "$STATUS" == "stopped" ] ||
177 error "(6) Expect 'stopped', but got '$STATUS'"
# Starting again without -r must bring the status back to scanning-phase1.
179 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
181 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
182 [ "$STATUS" == "scanning-phase1" ] ||
183 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fault injection and wait (limit argument 32) for status completed.
185 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
186 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
187 mdd.${MDT_DEV}.lfsck_namespace |
188 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
190 error "(9) unexpected status"
# Nothing was corrupted in this test, so updated_phase1 must be zero.
193 local repaired=$($SHOW_NAMESPACE |
194 awk '/^updated_phase1/ { print $2 }')
195 [ $repaired -eq 0 ] ||
196 error "(10) Expect nothing to be repaired, but got: $repaired"
# Record success_count, rerun with -r (reset), and expect the counter to have
# grown by exactly one once the second run completes.
198 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
199 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
200 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
201 mdd.${MDT_DEV}.lfsck_namespace |
202 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
204 error "(12) unexpected status"
207 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
208 [ $((scanned1 + 1)) -eq $scanned2 ] ||
209 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
211 echo "stopall, should NOT crash LU-3649"
212 stopall || error "(14) Fail to stopall"
214 run_test 0 "Control LFSCK manually"
# Test 1a body (header not part of this listing): create one file while
# OBD_FAIL_FID_INDIR is injected, run the namespace LFSCK, and expect exactly
# one repaired entry.
219 #define OBD_FAIL_FID_INDIR 0x1501
220 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
221 touch $DIR/$tdir/dummy
223 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Reset-start the namespace LFSCK and wait (limit argument 32) for status completed.
225 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
226 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
227 mdd.${MDT_DEV}.lfsck_namespace |
228 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
230 error "(4) unexpected status"
233 local repaired=$($SHOW_NAMESPACE |
234 awk '/^dirent_repaired/ { print $2 }')
235 # for interop with old server: fall back to updated_phase1 when dirent_repaired is absent
236 [ -z "$repaired" ] &&
237 repaired=$($SHOW_NAMESPACE |
238 awk '/^updated_phase1/ { print $2 }')
240 [ $repaired -eq 1 ] ||
241 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
245 mount_client $MOUNT || error "(6) Fail to start client!"
# With OBD_FAIL_FID_LOOKUP injected, listing the directory must still succeed.
247 #define OBD_FAIL_FID_LOOKUP 0x1505
248 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
249 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
251 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
253 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Test 1b body (header not part of this listing): ldiskfs only. Create one file
# while OBD_FAIL_FID_INLMA is injected, run the namespace LFSCK under
# OBD_FAIL_FID_NOLMA, and expect exactly one repaired entry.
257 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
258 skip "OI Scrub not implemented for ZFS" && return
262 #define OBD_FAIL_FID_INLMA 0x1502
263 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
264 touch $DIR/$tdir/dummy
266 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The second fault stays active for the whole scan and is cleared after the check.
268 #define OBD_FAIL_FID_NOLMA 0x1506
269 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
270 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
271 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
272 mdd.${MDT_DEV}.lfsck_namespace |
273 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
275 error "(4) unexpected status"
278 local repaired=$($SHOW_NAMESPACE |
279 awk '/^dirent_repaired/ { print $2 }')
280 # for interop with old server: fall back to updated_phase1 when dirent_repaired is absent
281 [ -z "$repaired" ] &&
282 repaired=$($SHOW_NAMESPACE |
283 awk '/^updated_phase1/ { print $2 }')
285 [ $repaired -eq 1 ] ||
286 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
288 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
291 mount_client $MOUNT || error "(6) Fail to start client!"
# With OBD_FAIL_FID_LOOKUP injected, stat of the file must still succeed.
293 #define OBD_FAIL_FID_LOOKUP 0x1505
294 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
295 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
297 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
299 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Test 1c body (header not part of this listing): create one file while
# OBD_FAIL_FID_IGIF is injected, run the namespace LFSCK, and expect exactly
# one repaired entry.
304 #define OBD_FAIL_FID_IGIF 0x1504
305 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
306 touch $DIR/$tdir/dummy
308 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Reset-start the namespace LFSCK and wait (limit argument 32) for status completed.
310 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
311 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
312 mdd.${MDT_DEV}.lfsck_namespace |
313 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
315 error "(4) unexpected status"
318 local repaired=$($SHOW_NAMESPACE |
319 awk '/^dirent_repaired/ { print $2 }')
320 # for interop with old server: fall back to updated_phase1 when dirent_repaired is absent
321 [ -z "$repaired" ] &&
322 repaired=$($SHOW_NAMESPACE |
323 awk '/^updated_phase1/ { print $2 }')
325 [ $repaired -eq 1 ] ||
326 error "(5) Fail to repair lost FID-in-dirent: $repaired"
330 mount_client $MOUNT || error "(6) Fail to start client!"
# With OBD_FAIL_FID_LOOKUP injected, listing the directory must still succeed.
332 #define OBD_FAIL_FID_LOOKUP 0x1505
333 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
334 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
336 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
338 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
# Test 2a body (header not part of this listing): create one file while
# OBD_FAIL_LFSCK_LINKEA_CRASH is injected, run the namespace LFSCK, expect one
# repaired linkEA, and check the path round trip through its FID.
343 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
344 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
345 touch $DIR/$tdir/dummy
347 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Reset-start the namespace LFSCK and wait (limit argument 32) for status completed.
349 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
350 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
351 mdd.${MDT_DEV}.lfsck_namespace |
352 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
354 error "(4) unexpected status"
357 local repaired=$($SHOW_NAMESPACE |
358 awk '/^linkea_repaired/ { print $2 }')
359 # for interop with old server: fall back to updated_phase2 when linkea_repaired is absent
360 [ -z "$repaired" ] &&
361 repaired=$($SHOW_NAMESPACE |
362 awk '/^updated_phase2/ { print $2 }')
364 [ $repaired -eq 1 ] ||
365 error "(5) Fail to repair crashed linkEA: $repaired"
369 mount_client $MOUNT || error "(6) Fail to start client!"
# The file must report a link count of one.
371 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
372 error "(7) Fail to stat $DIR/$tdir/dummy"
# Path to FID and back to path must return the original path.
374 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
375 local dummyname=$($LFS fid2path $DIR $dummyfid)
376 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
377 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
379 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Test 2b body (header not part of this listing): create one file while
# OBD_FAIL_LFSCK_LINKEA_MORE is injected, run the namespace LFSCK, expect
# updated_phase2 to be one, and check the path round trip through its FID.
385 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
386 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
387 touch $DIR/$tdir/dummy
389 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Reset-start the namespace LFSCK and wait (limit argument 32) for status completed.
391 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
392 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
393 mdd.${MDT_DEV}.lfsck_namespace |
394 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
396 error "(4) unexpected status"
399 local repaired=$($SHOW_NAMESPACE |
400 awk '/^updated_phase2/ { print $2 }')
401 [ $repaired -eq 1 ] ||
402 error "(5) Fail to repair crashed linkEA: $repaired"
406 mount_client $MOUNT || error "(6) Fail to start client!"
# The file must report a link count of one.
408 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
409 error "(7) Fail to stat $DIR/$tdir/dummy"
# Path to FID and back to path must return the original path.
411 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
412 local dummyname=$($LFS fid2path $DIR $dummyfid)
413 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
414 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
416 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Test 2c body (header not part of this listing): create one file while
# OBD_FAIL_LFSCK_LINKEA_MORE2 is injected, run the namespace LFSCK, expect
# updated_phase2 to be one, and check the path round trip through its FID.
422 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
423 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
424 touch $DIR/$tdir/dummy
426 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Reset-start the namespace LFSCK and wait (limit argument 32) for status completed.
428 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
429 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
430 mdd.${MDT_DEV}.lfsck_namespace |
431 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
433 error "(4) unexpected status"
436 local repaired=$($SHOW_NAMESPACE |
437 awk '/^updated_phase2/ { print $2 }')
438 [ $repaired -eq 1 ] ||
439 error "(5) Fail to repair crashed linkEA: $repaired"
443 mount_client $MOUNT || error "(6) Fail to start client!"
# The file must report a link count of one.
445 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
446 error "(7) Fail to stat $DIR/$tdir/dummy"
# Path to FID and back to path must return the original path.
448 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
449 local dummyname=$($LFS fid2path $DIR $dummyfid)
450 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
451 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
453 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Test 2d body (header not part of this listing): create one file while
# OBD_FAIL_LFSCK_NO_LINKEA is injected, run the namespace LFSCK, expect
# linkea_repaired to be one, and check the path round trip through its FID.
459 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
460 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
461 touch $DIR/$tdir/dummy
463 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Reset-start the namespace LFSCK and wait (limit argument 32) for status completed.
465 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
466 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
467 mdd.${MDT_DEV}.lfsck_namespace |
468 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
470 error "(4) unexpected status"
473 local repaired=$($SHOW_NAMESPACE |
474 awk '/^linkea_repaired/ { print $2 }')
475 [ $repaired -eq 1 ] ||
476 error "(5) Fail to repair crashed linkEA: $repaired"
480 mount_client $MOUNT || error "(6) Fail to start client!"
# The file must report a link count of one.
482 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
483 error "(7) Fail to stat $DIR/$tdir/dummy"
# Path to FID and back to path must return the original path.
485 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
486 local dummyname=$($LFS fid2path $DIR $dummyfid)
487 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
488 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
490 run_test 2d "LFSCK can recover the missing linkEA entry"
# Test 2e body (header not part of this listing): needs at least two MDTs.
# Creates d0 on MDT1 and, with OBD_FAIL_LFSCK_LINKEA_CRASH injected, d0/d1 on
# MDT0. It then runs the namespace LFSCK and expects one repaired linkEA.
494 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
498 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
500 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
501 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
502 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
503 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): -A presumably extends the run to all MDTs - confirm.
505 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
# Require every MDT to report the namespace component as completed.
507 wait_all_targets_blocked namespace completed 4
509 local repaired=$($SHOW_NAMESPACE |
510 awk '/^linkea_repaired/ { print $2 }')
511 [ $repaired -eq 1 ] ||
512 error "(5) Fail to repair crashed linkEA: $repaired"
# Path to FID and back to path must return the original path of d1.
514 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
515 local name=$($LFS fid2path $DIR $fid)
516 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
517 error "(6) Fail to repair linkEA: $fid $name"
519 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Test 3 body (header not part of this listing): build several hard-linked
# files, two of them with injected linkEA faults, then run the namespace LFSCK
# and check the phase-2 counters.
# Two clean hard links from d0 into a new directory.
525 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
526 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
527 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
# Two files in edir on MDT index 0; each is hard-linked under a different fault.
529 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
530 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
531 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
533 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
534 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
535 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
537 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
538 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
539 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
541 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Reset-start the namespace LFSCK and wait (limit argument 32) for status completed.
543 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
544 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
545 mdd.${MDT_DEV}.lfsck_namespace |
546 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
548 error "(10) unexpected status"
# At least the four multiply-linked files must have been checked in phase 2.
551 local checked=$($SHOW_NAMESPACE |
552 awk '/^checked_phase2/ { print $2 }')
553 [ $checked -ge 4 ] ||
554 error "(11) Fail to check multiple-linked object: $checked"
# At least the two fault-injected files must be counted as repaired.
556 local repaired=$($SHOW_NAMESPACE |
557 awk '/^multiple_linked_repaired/ { print $2 }')
558 [ $repaired -ge 2 ] ||
559 error "(12) Fail to repair multiple-linked object: $repaired"
561 run_test 3 "LFSCK can verify multiple-linked objects"
# Test 4 body (header not part of this listing): ldiskfs only. Run
# mds_backup_restore on the stopped MDT, restart it with noscrub, and check
# that the namespace LFSCK repairs the directory entries.
565 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
566 skip "OI Scrub not implemented for ZFS" && return
569 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
570 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
572 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
573 echo "start $SINGLEMDS with disabling OI scrub"
574 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
575 error "(2) Fail to start MDS!"
# Run the scan under OBD_FAIL_LFSCK_DELAY2 and wait for the flags line to read
# inconsistent while the status is still scanning-phase1.
577 #define OBD_FAIL_LFSCK_DELAY2 0x1601
578 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
579 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
580 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
581 mdd.${MDT_DEV}.lfsck_namespace |
582 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
584 error "(5) unexpected status"
587 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
588 [ "$STATUS" == "scanning-phase1" ] ||
589 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fault and wait (limit argument 32) for status completed.
591 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
592 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
593 mdd.${MDT_DEV}.lfsck_namespace |
594 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
596 error "(7) unexpected status"
# After completion the flags field must be empty again.
599 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
600 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
602 local repaired=$($SHOW_NAMESPACE |
603 awk '/^dirent_repaired/ { print $2 }')
604 # for interop with old server: fall back to updated_phase1 when dirent_repaired is absent
605 [ -z "$repaired" ] &&
606 repaired=$($SHOW_NAMESPACE |
607 awk '/^updated_phase1/ { print $2 }')
609 [ $repaired -ge 9 ] ||
610 error "(9) Fail to re-generate FID-in-dirent: $repaired"
614 mount_client $MOUNT || error "(10) Fail to start client!"
# With OBD_FAIL_FID_LOOKUP injected, listing the directory must still succeed.
616 #define OBD_FAIL_FID_LOOKUP 0x1505
617 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
618 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
619 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
621 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Test 5 body (header not part of this listing): ldiskfs only. Same flow as the
# backup/restore test, but mds_backup_restore gets an extra argument 1 and the
# flags are expected to read inconsistent,upgrade during the scan.
# NOTE(review): the extra argument presumably selects the IGIF variant - confirm.
625 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
626 skip "OI Scrub not implemented for ZFS" && return
629 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
630 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
632 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
633 echo "start $SINGLEMDS with disabling OI scrub"
634 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
635 error "(2) Fail to start MDS!"
# Run the scan under OBD_FAIL_LFSCK_DELAY2 and wait for the expected flags
# while the status is still scanning-phase1.
637 #define OBD_FAIL_LFSCK_DELAY2 0x1601
638 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
639 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
640 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
641 mdd.${MDT_DEV}.lfsck_namespace |
642 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
644 error "(5) unexpected status"
647 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
648 [ "$STATUS" == "scanning-phase1" ] ||
649 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fault and wait (limit argument 32) for status completed.
651 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
652 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
653 mdd.${MDT_DEV}.lfsck_namespace |
654 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
656 error "(7) unexpected status"
# After completion the flags field must be empty again.
659 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
660 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
662 local repaired=$($SHOW_NAMESPACE |
663 awk '/^dirent_repaired/ { print $2 }')
664 # for interop with old server: fall back to updated_phase1 when dirent_repaired is absent
665 [ -z "$repaired" ] &&
666 repaired=$($SHOW_NAMESPACE |
667 awk '/^updated_phase1/ { print $2 }')
669 [ $repaired -ge 2 ] ||
670 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
674 mount_client $MOUNT || error "(10) Fail to start client!"
# With OBD_FAIL_FID_LOOKUP injected, both stat and ls must still succeed.
676 #define OBD_FAIL_FID_LOOKUP 0x1505
677 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
678 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
680 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
682 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Path to FID and back to path must return the original path.
683 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
684 local dummyname=$($LFS fid2path $DIR $dummyfid)
685 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
686 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
688 run_test 5 "LFSCK can handle IGIF object upgrading"
# Test 6a body (header not part of this listing): start a delayed namespace
# scan, force it to fail, restart it without -r, and require that
# latest_start_position is beyond the recorded last_checkpoint_position.
693 #define OBD_FAIL_LFSCK_DELAY1 0x1600
694 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
695 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
697 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
698 [ "$STATUS" == "scanning-phase1" ] ||
699 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
701 # Sleep 3 sec to guarantee at least one object processed by LFSCK
703 # Fail the LFSCK to guarantee there is at least one checkpoint
704 #define OBD_FAIL_LFSCK_FATAL1 0x1608
705 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
706 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
707 mdd.${MDT_DEV}.lfsck_namespace |
708 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
710 error "(4) unexpected status"
# Checkpoint position recorded by the failed run.
713 local POS0=$($SHOW_NAMESPACE |
714 awk '/^last_checkpoint_position/ { print $2 }' |
717 #define OBD_FAIL_LFSCK_DELAY1 0x1600
718 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
719 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
721 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
722 [ "$STATUS" == "scanning-phase1" ] ||
723 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Start position of the resumed run; it must be greater than POS0.
725 local POS1=$($SHOW_NAMESPACE |
726 awk '/^latest_start_position/ { print $2 }' |
728 [[ $POS0 -lt $POS1 ]] ||
729 error "(7) Expect larger than: $POS0, but got $POS1"
# Clear the fault and wait (limit argument 32) for status completed.
731 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
732 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
733 mdd.${MDT_DEV}.lfsck_namespace |
734 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
736 error "(8) unexpected status"
739 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Test 6b body (header not part of this listing): same resume check as the
# previous test but under OBD_FAIL_LFSCK_DELAY2 and OBD_FAIL_LFSCK_FATAL2. It
# compares two fields (awk columns 2 and 4) of the position lines.
744 #define OBD_FAIL_LFSCK_DELAY2 0x1601
745 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
746 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
748 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
749 [ "$STATUS" == "scanning-phase1" ] ||
750 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
752 # Sleep 5 sec to guarantee that we are in the directory scanning
754 # Fail the LFSCK to guarantee there is at least one checkpoint
755 #define OBD_FAIL_LFSCK_FATAL2 0x1609
756 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
757 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
758 mdd.${MDT_DEV}.lfsck_namespace |
759 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
761 error "(4) unexpected status"
# Checkpoint position of the failed run: column 2 (O_POS0) and column 4 (D_POS0).
764 local O_POS0=$($SHOW_NAMESPACE |
765 awk '/^last_checkpoint_position/ { print $2 }' |
768 local D_POS0=$($SHOW_NAMESPACE |
769 awk '/^last_checkpoint_position/ { print $4 }')
771 #define OBD_FAIL_LFSCK_DELAY2 0x1601
772 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
773 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
775 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
776 [ "$STATUS" == "scanning-phase1" ] ||
777 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Start position of the resumed run, read from the same two columns.
779 local O_POS1=$($SHOW_NAMESPACE |
780 awk '/^latest_start_position/ { print $2 }' |
782 local D_POS1=$($SHOW_NAMESPACE |
783 awk '/^latest_start_position/ { print $4 }')
785 echo "Additional debug for 6b"
# When either column-4 value is N/A or 0x0, compare the column-2 values (7.1).
# The column-4 comparison (7.2) follows below.
787 if [ "$D_POS0" == "N/A" -o "$D_POS0" == "0x0" \
788 -o "$D_POS1" == "0x0" -o "$D_POS1" == "N/A" ]; then
789 [[ $O_POS0 -lt $O_POS1 ]] ||
790 error "(7.1) $O_POS1 is not larger than $O_POS0"
792 [[ $D_POS0 -lt $D_POS1 ]] ||
793 error "(7.2) $D_POS1 is not larger than $D_POS0"
# Clear the fault and wait (limit argument 32) for status completed.
796 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
797 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
798 mdd.${MDT_DEV}.lfsck_namespace |
799 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
801 error "(8) unexpected status"
804 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Test 7a body (header not part of this listing): stop the MDS while a delayed
# namespace scan is in scanning-phase1, restart it with the scrub mount
# options, and expect the scan to reach completed without being started again.
811 #define OBD_FAIL_LFSCK_DELAY2 0x1601
812 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
813 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
815 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
816 [ "$STATUS" == "scanning-phase1" ] ||
817 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
819 # Sleep 3 sec to guarantee at least one object processed by LFSCK
821 echo "stop $SINGLEMDS"
822 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
# The fault is cleared before the MDT is mounted again.
824 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
825 echo "start $SINGLEMDS"
826 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
827 error "(5) Fail to start MDS!"
# No lfsck_start is issued here; wait (limit argument 30) for status completed.
829 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
830 mdd.${MDT_DEV}.lfsck_namespace |
831 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
833 error "(6) unexpected status"
836 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Test 7b body (header not part of this listing): like the previous remount
# test, but the MDS is stopped once the scan has reached scanning-phase2.
# Twenty files are first created under OBD_FAIL_LFSCK_LINKEA_MORE.
842 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
843 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
844 for ((i = 0; i < 20; i++)); do
845 touch $DIR/$tdir/dummy${i}
# Start the scan under OBD_FAIL_LFSCK_DELAY3 and wait for scanning-phase2.
848 #define OBD_FAIL_LFSCK_DELAY3 0x1602
849 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
850 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
851 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
852 mdd.${MDT_DEV}.lfsck_namespace |
853 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
855 error "(4) unexpected status"
859 echo "stop $SINGLEMDS"
860 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
# The fault is cleared before the MDT is mounted again.
862 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
863 echo "start $SINGLEMDS"
864 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
865 error "(6) Fail to start MDS!"
# No lfsck_start is issued here; wait (limit argument 30) for status completed.
867 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
868 mdd.${MDT_DEV}.lfsck_namespace |
869 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
871 error "(7) unexpected status"
874 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Test 8 body (header not part of this listing): walk the namespace LFSCK
# through its states on a freshly formatted filesystem. The expected sequence
# is init, scanning-phase1, stopped, scanning-phase1, failed, scanning-phase1,
# crashed, scanning-phase1, paused, paused, scanning-phase2, completed.
879 formatall > /dev/null
# Before any run the status must read init.
885 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
886 [ "$STATUS" == "init" ] ||
887 error "(2) Expect 'init', but got '$STATUS'"
# Create one directory and five files under two different linkEA faults.
889 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
890 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
891 mkdir $DIR/$tdir/crashed
893 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
894 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
895 for ((i = 0; i < 5; i++)); do
896 touch $DIR/$tdir/dummy${i}
899 umount_client $MOUNT || error "(3) Fail to stop client!"
# First start under OBD_FAIL_LFSCK_DELAY2: scanning-phase1.
901 #define OBD_FAIL_LFSCK_DELAY2 0x1601
902 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
903 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
905 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
906 [ "$STATUS" == "scanning-phase1" ] ||
907 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
# Explicit stop: stopped.
909 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
911 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
912 [ "$STATUS" == "stopped" ] ||
913 error "(7) Expect 'stopped', but got '$STATUS'"
# Start again: scanning-phase1.
915 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
917 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
918 [ "$STATUS" == "scanning-phase1" ] ||
919 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
# Inject OBD_FAIL_LFSCK_FATAL2 and wait (limit argument 32) for status failed.
921 #define OBD_FAIL_LFSCK_FATAL2 0x1609
922 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
923 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
924 mdd.${MDT_DEV}.lfsck_namespace |
925 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
927 error "(10) unexpected status"
# Start after the failure under OBD_FAIL_LFSCK_DELAY1: scanning-phase1.
930 #define OBD_FAIL_LFSCK_DELAY1 0x1600
931 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
932 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
934 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
935 [ "$STATUS" == "scanning-phase1" ] ||
936 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
# Stop the MDS with OBD_FAIL_LFSCK_CRASH set, then remount with
# OBD_FAIL_LFSCK_NO_AUTO set; the status is expected to read crashed.
938 #define OBD_FAIL_LFSCK_CRASH 0x160a
939 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
942 echo "stop $SINGLEMDS"
943 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
945 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
946 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
948 echo "start $SINGLEMDS"
949 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
950 error "(14) Fail to start MDS!"
# Poll the MDT recovery_status until it is no longer RECOVERING or timer
# reaches the max_recovery_time limit; reaching the limit is an error.
952 local timeout=$(max_recovery_time)
955 while [ $timer -lt $timeout ]; do
956 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
957 mdt.${MDT_DEV}.recovery_status |
958 awk '/^status/ { print \\\$2 }'")
959 [ "$STATUS" != "RECOVERING" ] && break;
964 [ $timer != $timeout ] ||
965 error "(14.1) recovery timeout"
967 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
968 [ "$STATUS" == "crashed" ] ||
969 error "(15) Expect 'crashed', but got '$STATUS'"
# Start after the crash under OBD_FAIL_LFSCK_DELAY2: scanning-phase1.
971 #define OBD_FAIL_LFSCK_DELAY2 0x1601
972 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
973 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
975 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
976 [ "$STATUS" == "scanning-phase1" ] ||
977 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
# Plain stop of the MDS followed by a remount with OBD_FAIL_LFSCK_NO_AUTO set;
# the status is expected to read paused.
979 echo "stop $SINGLEMDS"
980 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
982 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
983 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
985 echo "start $SINGLEMDS"
986 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
987 error "(19) Fail to start MDS!"
# Same recovery wait as above.
990 while [ $timer -lt $timeout ]; do
991 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
992 mdt.${MDT_DEV}.recovery_status |
993 awk '/^status/ { print \\\$2 }'")
994 [ "$STATUS" != "RECOVERING" ] && break;
999 [ $timer != $timeout ] ||
1000 error "(19.1) recovery timeout"
1002 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1003 [ "$STATUS" == "paused" ] ||
1004 error "(20) Expect 'paused', but got '$STATUS'"
# Remount once more, this time with the skip_lfsck mount option; the status
# must still read paused.
1006 echo "stop $SINGLEMDS"
1007 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1009 echo "start $SINGLEMDS without resume LFSCK"
1010 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
1011 error "(20.2) Fail to start MDS!"
# Same recovery wait as above; timer is advanced by one per iteration.
1014 while [ $timer -lt $timeout ]; do
1015 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1016 mdt.${MDT_DEV}.recovery_status |
1017 awk '/^status/ { print \\\$2 }'")
1018 [ "$STATUS" != "RECOVERING" ] && break;
1020 timer=$((timer + 1))
1023 [ $timer != $timeout ] ||
1024 error "(20.3) recovery timeout"
1026 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1027 [ "$STATUS" == "paused" ] ||
1028 error "(20.4) Expect 'paused', but got '$STATUS'"
# Resume under OBD_FAIL_LFSCK_DELAY3 and wait for scanning-phase2, where the
# flags are expected to read scanned-once,inconsistent.
1030 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1031 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1033 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
1034 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1035 mdd.${MDT_DEV}.lfsck_namespace |
1036 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1038 error "(22) unexpected status"
1041 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1042 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1043 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
# Clear the fault, wait (limit argument 32) for completed, and require empty flags.
1045 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1046 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1047 mdd.${MDT_DEV}.lfsck_namespace |
1048 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1050 error "(24) unexpected status"
1053 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1054 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1056 run_test 8 "LFSCK state machine"
# Test 9a: speed control for the layout LFSCK. The guard below calls skip
# when /proc/cpuinfo contains no "processor ... : 1" line.
1059 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1060 skip "Testing on UP system, the speed may be inaccurate."
# Build the data set: one directory created with "lfs mkdir -i 0" holding
# 5000 files for the scan to walk.
1064 check_mount_and_prep
1065 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1066 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1067 createmany -o $DIR/$tdir/lfsck/f 5000
# Phase one: start the layout scan with "-r -s $BASE_SPEED1" and require the
# reported status to be 'scanning-phase1'.
1069 local BASE_SPEED1=100
1071 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
1074 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1075 [ "$STATUS" == "scanning-phase1" ] ||
1076 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# Read average_speed_phase1 and check it against an upper bound computed
# from BASE_SPEED1. RUN_TIME1 and TIME_DIFF are not assigned on any line
# visible in this block.
1078 local SPEED=$($SHOW_LAYOUT |
1079 awk '/^average_speed_phase1/ { print $2 }')
1081 # There may be time error, normally it should be less than 2 seconds.
1082 # We allow another 20% schedule error.
1084 # MAX_MARGIN = 1.3 = 13 / 10
1085 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1086 RUN_TIME1 * 13 / 10))
1087 [ $SPEED -lt $MAX_SPEED ] || {
1089 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1090 error "(4) Speed $SPEED, expected < $MAX_SPEED"
# Phase two: raise lfsck_speed_limit to BASE_SPEED2 on the MDS, then check
# the new average against a lower and an upper bound that weight both
# speed settings by their run times (RUN_TIME2 is likewise not assigned on
# a visible line).
1093 # adjust speed limit
1094 local BASE_SPEED2=300
1096 do_facet $SINGLEMDS \
1097 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1100 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1101 # MIN_MARGIN = 0.7 = 7 / 10
1102 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1103 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1104 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# A too-slow result is reported through error_ignore (tagged LU-5624) when
# the MDS backend is not ldiskfs.
1105 [ $SPEED -gt $MIN_SPEED ] || {
1106 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1107 error_ignore LU-5624 \
1108 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1111 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1115 # MAX_MARGIN = 1.3 = 13 / 10
1116 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1117 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1118 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1119 [ $SPEED -lt $MAX_SPEED ] || {
1121 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1122 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1123 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT and OST node, then wait for the
# layout scan to report 'completed' (last argument 30 is presumably a
# timeout in seconds — confirm against wait_update_facet).
1126 do_nodes $(comma_list $(mdts_nodes)) \
1127 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1128 do_nodes $(comma_list $(osts_nodes)) \
1129 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1131 wait_update_facet $SINGLEMDS \
1132 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1133 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1134 error "(7) Failed to get expected 'completed'"
1136 run_test 9a "LFSCK speed control (1)"
# Test 9b: speed control for the namespace LFSCK, measured on
# average_speed_phase2. Same CPU guard as the layout variant: skip is called
# when /proc/cpuinfo has no "processor ... : 1" line.
1139 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1140 skip "Testing on UP system, the speed may be inaccurate."
# Create 50 directories, 50 files, and 50 files inside each directory while
# fail_loc 0x1604 (OBD_FAIL_LFSCK_LINKEA_MORE) is set on the MDS.
1146 echo "Preparing another 50 * 50 files (with error) at $(date)."
1147 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1148 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1149 createmany -d $DIR/$tdir/d 50
1150 createmany -m $DIR/$tdir/f 50
1151 for ((i = 0; i < 50; i++)); do
1152 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# With fail_loc 0x160c (OBD_FAIL_LFSCK_NO_DOUBLESCAN) set, run a namespace
# scan with -r and wait until its status reads 'stopped'.
1155 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1156 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1157 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1158 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1159 mdd.${MDT_DEV}.lfsck_namespace |
1160 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1162 error "(5) unexpected status"
1165 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1166 echo "Prepared at $(date)."
# Start again (no -r) with "-s $BASE_SPEED1"; the scan is expected to be in
# 'scanning-phase2'.
1168 local BASE_SPEED1=50
1170 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1173 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1174 [ "$STATUS" == "scanning-phase2" ] ||
1175 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
# Upper-bound check on the phase-2 average speed. RUN_TIME1 and TIME_DIFF
# are not assigned on any line visible in this block.
1177 local SPEED=$($SHOW_NAMESPACE |
1178 awk '/^average_speed_phase2/ { print $2 }')
1179 # There may be time error, normally it should be less than 2 seconds.
1180 # We allow another 20% schedule error.
1182 # MAX_MARGIN = 1.3 = 13 / 10
1183 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1184 RUN_TIME1 * 13 / 10))
1185 [ $SPEED -lt $MAX_SPEED ] || {
1187 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1188 error "(8) Speed $SPEED, expected < $MAX_SPEED"
# Raise lfsck_speed_limit to BASE_SPEED2 and re-check the average against
# lower and upper bounds weighted by both run times (RUN_TIME2 is not
# assigned on a visible line either).
1191 # adjust speed limit
1192 local BASE_SPEED2=150
1194 do_facet $SINGLEMDS \
1195 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1198 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1199 # MIN_MARGIN = 0.7 = 7 / 10
1200 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1201 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1202 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# A too-slow result goes through error_ignore (tagged LU-5624) when the MDS
# backend is not ldiskfs.
1203 [ $SPEED -gt $MIN_SPEED ] || {
1204 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1205 error_ignore LU-5624 \
1206 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1209 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1213 # MAX_MARGIN = 1.3 = 13 / 10
1214 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1215 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1216 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1217 [ $SPEED -lt $MAX_SPEED ] || {
1219 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1220 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1221 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on all MDT and OST nodes and wait for the
# namespace scan to report 'completed'.
1224 do_nodes $(comma_list $(mdts_nodes)) \
1225 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1226 do_nodes $(comma_list $(osts_nodes)) \
1227 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1229 mdd.${MDT_DEV}.lfsck_namespace |
1230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1232 error "(11) unexpected status"
1235 run_test 9b "LFSCK speed control (2)"
# Test 10: ordinary file operations must keep working while a namespace
# LFSCK is in 'scanning-phase1'. Skipped when the MDS backend is not ldiskfs.
1239 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1240 skip "lookup(..)/linkea on ZFS issue" && return
# Populate even-numbered d${i}/f${i} entries under fail_loc 0x1603
# (OBD_FAIL_LFSCK_LINKEA_CRASH) and odd-numbered ones under fail_loc 0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE), five files per directory.
1244 echo "Preparing more files with error at $(date)."
1245 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1246 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1248 for ((i = 0; i < 1000; i = $((i+2)))); do
1249 mkdir -p $DIR/$tdir/d${i}
1250 touch $DIR/$tdir/f${i}
1251 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1254 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1255 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1257 for ((i = 1; i < 1000; i = $((i+2)))); do
1258 mkdir -p $DIR/$tdir/d${i}
1259 touch $DIR/$tdir/f${i}
1260 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1263 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1264 echo "Prepared at $(date)."
# Add one hard link, then remount the client before starting the scan.
1266 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
1268 umount_client $MOUNT
1269 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the namespace scan with "-r -s 100" and confirm it is in
# 'scanning-phase1'.
1271 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1274 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1275 [ "$STATUS" == "scanning-phase1" ] ||
1276 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Exercise ls, touch, mkdir, unlink, rm -rf, rename, hard link and symlink;
# each one must succeed.
1278 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1280 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1282 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1284 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1286 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1288 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1290 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1292 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1293 error "(14) Fail to softlink!"
# The scan must still be in 'scanning-phase1' after those operations.
1295 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1296 [ "$STATUS" == "scanning-phase1" ] ||
1297 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 on all MDT and OST nodes and wait for
# 'completed'.
1299 do_nodes $(comma_list $(mdts_nodes)) \
1300 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1301 do_nodes $(comma_list $(osts_nodes)) \
1302 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1303 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1304 mdd.${MDT_DEV}.lfsck_namespace |
1305 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1307 error "(16) unexpected status"
1310 run_test 10 "System is available during LFSCK scanning"
# Remove O/${idx}/LAST_ID and O/${idx}/d0/0 from the locally mounted backing
# filesystem of ost${ost}.
# The assignments of `ost` and `idx` are not on any visible line; the script
# invokes this helper with two positional arguments ("ost_remove_lastid 1 0"),
# so presumably ost=$1 and idx=$2 — confirm.
# Returns 1 if mount_fstype fails and 2 if unmount_fstype fails; the exit
# status of the rm itself is not checked.
1313 ost_remove_lastid() {
1316 local rcmd="do_facet ost${ost}"
1318 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1320 # step 1: local mount
1321 mount_fstype ost${ost} || return 1
1322 # step 2: remove the specified LAST_ID
1323 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1325 unmount_fstype ost${ost} || return 2
# Test 11a: a layout LFSCK started on the OST rebuilds a LAST_ID file that
# was removed from disk.
1329 check_mount_and_prep
1330 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1331 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Remove LAST_ID via the helper (arguments 1 and 0), then start ost1 with
# $MOUNT_OPTS_NOSCRUB.
1336 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1338 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1339 error "(2) Fail to start ost1"
# fail_val=3 with fail_loc 0x160e (OBD_FAIL_LFSCK_DELAY4) is set on ost1
# before the scan starts, so that the intermediate 'crashed_lastid' flag can
# be observed below.
1341 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1342 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1344 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1345 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1347 wait_update_facet ost1 "$LCTL get_param -n \
1348 obdfilter.${OST_DEV}.lfsck_layout |
1349 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1351 error "(5) unexpected status"
# Clear the failure injection and wait for the scan to report 'completed'.
1354 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1356 wait_update_facet ost1 "$LCTL get_param -n \
1357 obdfilter.${OST_DEV}.lfsck_layout |
1358 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1360 error "(6) unexpected status"
# After completion the flags field must be empty.
1363 echo "the LAST_ID(s) should have been rebuilt"
1364 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1365 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1367 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b: a layout LFSCK on the OST rebuilds an on-disk LAST_ID that is
# smaller than the value the OST reported before restart.
1370 check_mount_and_prep
1371 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# With fail_loc 0x160d (OBD_FAIL_LFSCK_SKIP_LASTID) set on ost1, create
# count + 32 files, where count comes from precreated_ost_obj_count 0 0.
1373 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1374 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1375 do_facet ost1 $LCTL set_param fail_loc=0x160d
1377 local count=$(precreated_ost_obj_count 0 0)
1379 createmany -o $DIR/$tdir/f $((count + 32))
# Record lastid1: the last_id entry on ost1 whose line matches the sequence
# read from the MDS-side prealloc_last_seq parameter.
1381 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1382 local seq=$(do_facet mds1 $LCTL get_param -n \
1383 osp.${proc_path}.prealloc_last_seq)
1384 local lastid1=$(do_facet ost1 "lctl get_param -n \
1385 obdfilter.${ost1_svc}.last_id" | grep $seq |
1386 awk -F: '{ print $2 }')
# Restart ost1 with fail_loc 0x215 (OBD_FAIL_OST_ENOSPC) set.
1388 umount_client $MOUNT
1389 stop ost1 || error "(1) Fail to stop ost1"
1391 #define OBD_FAIL_OST_ENOSPC 0x215
1392 do_facet ost1 $LCTL set_param fail_loc=0x215
1394 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1395 error "(2) Fail to start ost1"
# Poll (up to 60 iterations) until last_id yields a non-empty value for the
# same sequence; that value is lastid2.
1397 for ((i = 0; i < 60; i++)); do
1398 lastid2=$(do_facet ost1 "lctl get_param -n \
1399 obdfilter.${ost1_svc}.last_id" | grep $seq |
1400 awk -F: '{ print $2 }')
1401 [ ! -z $lastid2 ] && break;
1405 echo "the on-disk LAST_ID should be smaller than the expected one"
1406 [ $lastid1 -gt $lastid2 ] ||
1407 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
# Run the layout scan on the OST and wait for 'completed'.
1409 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1410 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1412 wait_update_facet ost1 "$LCTL get_param -n \
1413 obdfilter.${OST_DEV}.lfsck_layout |
1414 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1416 error "(6) unexpected status"
# Restart ost1 once more and require last_id for the sequence to equal
# lastid1 again; on failure the raw last_id output is dumped first.
1419 stop ost1 || error "(7) Fail to stop ost1"
1421 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1422 error "(8) Fail to start ost1"
1424 echo "the on-disk LAST_ID should have been rebuilt"
1425 wait_update_facet ost1 "$LCTL get_param -n \
1426 obdfilter.${ost1_svc}.last_id | grep $seq |
1427 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1428 do_facet ost1 $LCTL get_param -n \
1429 obdfilter.${ost1_svc}.last_id
1430 error "(9) expect lastid1 $seq:$lastid1"
# Clear the failure injection and stop everything.
1433 do_facet ost1 $LCTL set_param fail_loc=0
1434 stopall || error "(10) Fail to stopall"
1436 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12a: a single lctl command issued on mds1 with -A starts and stops
# LFSCK on all targets. Requires at least two MDTs.
1439 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Create one directory per MDT ("lfs mkdir -i <k-1>") with 100 files each.
1441 check_mount_and_prep
1442 for k in $(seq $MDSCOUNT); do
1443 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1444 createmany -o $DIR/$tdir/${k}/f 100 ||
1445 error "(0) Fail to create 100 files."
# Namespace LFSCK: start with "-s 1", expect 'scanning-phase1', stop,
# expect 'stopped', restart with "-s 0", expect 'completed'. The trailing
# numeric argument of the wait_all_targets* helpers continues the (N) error
# numbering of this test — presumably an error-message tag; confirm.
1448 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1449 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1450 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1452 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1453 wait_all_targets namespace scanning-phase1 3
1455 echo "Stop namespace LFSCK on all targets by single lctl command."
1456 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1457 error "(4) Fail to stop LFSCK on all devices!"
1459 echo "All the LFSCK targets should be in 'stopped' status."
1460 wait_all_targets_blocked namespace stopped 5
1462 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1463 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1464 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1466 echo "All the LFSCK targets should be in 'completed' status."
1467 wait_all_targets_blocked namespace completed 7
# Layout LFSCK: the same start / stop / restart sequence, run between
# start_full_debug_logging and stop_full_debug_logging.
1469 start_full_debug_logging
1471 echo "Start layout LFSCK on all targets by single command (-s 1)."
1472 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1473 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1475 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1476 wait_all_targets layout scanning-phase1 9
1478 echo "Stop layout LFSCK on all targets by single lctl command."
1479 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1480 error "(10) Fail to stop LFSCK on all devices!"
1482 echo "All the LFSCK targets should be in 'stopped' status."
1483 wait_all_targets_blocked layout stopped 11
# Additionally read lfsck_layout on every OST and require 'stopped'.
1485 for k in $(seq $OSTCOUNT); do
1486 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1487 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1488 awk '/^status/ { print $2 }')
1489 [ "$STATUS" == "stopped" ] ||
1490 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1493 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1494 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1495 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1497 echo "All the LFSCK targets should be in 'completed' status."
1498 wait_all_targets_blocked layout completed 14
1500 stop_full_debug_logging
1502 run_test 12a "single command to trigger LFSCK on all devices"
# Test 12b: lfsck_start without -M. With -A it must succeed and both the
# namespace and layout scans must reach 'completed'.
1505 check_mount_and_prep
1507 echo "Start LFSCK without '-M' specified."
1508 do_facet mds1 $LCTL lfsck_start -A -r ||
1509 error "(0) Fail to start LFSCK without '-M'"
1511 wait_all_targets_blocked namespace completed 1
1512 wait_all_targets_blocked layout completed 2
# Count the "lctl dl" entries on mds1 whose third column contains "mdt".
# When there is more than one, starting without -M and without -A is
# expected to fail; "|| true" keeps that expected failure from becoming the
# status of the && / || list.
1514 local count=$(do_facet mds1 $LCTL dl |
1515 awk '{ print $3 }' | grep mdt | wc -l)
1516 if [ $count -gt 1 ]; then
1518 echo "Start layout LFSCK on the node with multipe targets,"
1519 echo "but not specify '-M'/'-A' option. Should get failure."
1521 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1522 error "(3) Start layout LFSCK should fail" || true
1525 run_test 12b "auto detect Lustre device"
# Test 13: the layout LFSCK repairs files created with a bad lmm_oi.
1529 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1530 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1531 echo "MDT-object FID."
1534 check_mount_and_prep
# Create two files while fail_loc 0x160f (OBD_FAIL_LFSCK_BAD_LMMOI) is set on
# the MDS: one through createmany and one through "lfs setstripe -E ...".
1536 echo "Inject failure stub to simulate bad lmm_oi"
1537 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1538 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1539 createmany -o $DIR/$tdir/f 1
1540 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1541 error "(0) Fail to create PFL $DIR/$tdir/f1"
1542 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run the layout scan to completion.
1544 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1545 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1547 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1548 mdd.${MDT_DEV}.lfsck_layout |
1549 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1551 error "(2) unexpected status"
# repaired_others must equal 2, matching the two files created above.
1554 local repaired=$($SHOW_LAYOUT |
1555 awk '/^repaired_others/ { print $2 }')
1556 [ $repaired -eq 2 ] ||
1557 error "(3) Fail to repair crashed lmm_oi: $repaired"
1559 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14a: the layout LFSCK detects MDT-objects whose referenced OST-object
# is missing and, when started with -c, makes them usable again.
1563 echo "The OST-object referenced by the MDT-object should be there;"
1564 echo "otherwise, the LFSCK should re-create the missing OST-object."
1565 echo "without '--delay-create-ostobj' option."
1568 check_mount_and_prep
1569 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# With fail_loc 0x1610 (OBD_FAIL_LFSCK_DANGLING) set on ost1, create
# count + 16 plain files, guard0, 16 composite-layout files, and guard1.
1571 echo "Inject failure stub to simulate dangling referenced MDT-object"
1572 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1573 do_facet ost1 $LCTL set_param fail_loc=0x1610
1574 local count=$(precreated_ost_obj_count 0 0)
1576 createmany -o $DIR/$tdir/f $((count + 16)) ||
1577 error "(0.1) Fail to create $DIR/$tdir/fx"
1578 touch $DIR/$tdir/guard0
1580 for ((i = 0; i < 16; i++)); do
1581 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1582 $DIR/$tdir/f_comp${i} ||
1583 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1585 touch $DIR/$tdir/guard1
1587 do_facet ost1 $LCTL set_param fail_loc=0
1589 start_full_debug_logging
1591 # exhaust other pre-created dangling cases
1592 count=$(precreated_ost_obj_count 0 0)
1593 createmany -o $DIR/$tdir/a $count ||
1594 error "(0.5) Fail to create $count files."
1596 echo "'ls' should fail because of dangling referenced MDT-object"
1597 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass (-r only): at least 32 dangling references must be counted, yet
# stat on the guard files must still fail.
1599 echo "Trigger layout LFSCK to find out dangling reference"
1600 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1602 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1603 mdd.${MDT_DEV}.lfsck_layout |
1604 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1606 error "(3) unexpected status"
1609 local repaired=$($SHOW_LAYOUT |
1610 awk '/^repaired_dangling/ { print $2 }')
1611 [ $repaired -ge 32 ] ||
1612 error "(4) Fail to repair dangling reference: $repaired"
1614 echo "'stat' should fail because of not repair dangling by default"
1615 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1616 error "(5.1) stat should fail"
1617 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1618 error "(5.2) stat should fail"
# Second pass adds -c; afterwards stat on both guard files must report
# Size 0.
1620 echo "Trigger layout LFSCK to repair dangling reference"
1621 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1623 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1624 mdd.${MDT_DEV}.lfsck_layout |
1625 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1627 error "(7) unexpected status"
1630 # There may be some async LFSCK updates in processing, wait for
1631 # a while until the target reparation has been done. LU-4970.
1633 echo "'stat' should success after layout LFSCK repairing"
1634 wait_update_facet client "stat $DIR/$tdir/guard0 |
1635 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1636 stat $DIR/$tdir/guard0
1638 error "(8.1) unexpected size"
1641 wait_update_facet client "stat $DIR/$tdir/guard1 |
1642 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1643 stat $DIR/$tdir/guard1
1645 error "(8.2) unexpected size"
1648 repaired=$($SHOW_LAYOUT |
1649 awk '/^repaired_dangling/ { print $2 }')
1650 [ $repaired -ge 32 ] ||
1651 error "(9) Fail to repair dangling reference: $repaired"
1653 stop_full_debug_logging
# Restart the filesystem; the echo states the purpose (only setupall is on a
# visible line).
1655 echo "stopall to cleanup object cache"
1658 setupall > /dev/null
1660 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Test 14b: dangling-reference repair, with the extra "-o -d" flags on
# lfsck_start (the echo text ties this variant to '--delay-create-ostobj').
1664 echo "The OST-object referenced by the MDT-object should be there;"
1665 echo "otherwise, the LFSCK should re-create the missing OST-object."
1666 echo "with '--delay-create-ostobj' option."
1669 check_mount_and_prep
1670 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# With fail_loc 0x1610 (OBD_FAIL_LFSCK_DANGLING) set on ost1, create
# count + 31 files plus one guard file.
1672 echo "Inject failure stub to simulate dangling referenced MDT-object"
1673 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1674 do_facet ost1 $LCTL set_param fail_loc=0x1610
1675 local count=$(precreated_ost_obj_count 0 0)
1677 createmany -o $DIR/$tdir/f $((count + 31))
1678 touch $DIR/$tdir/guard
1679 do_facet ost1 $LCTL set_param fail_loc=0
1681 start_full_debug_logging
1683 # exhaust other pre-created dangling cases
1684 count=$(precreated_ost_obj_count 0 0)
1685 createmany -o $DIR/$tdir/a $count ||
1686 error "(0) Fail to create $count files."
1688 echo "'ls' should fail because of dangling referenced MDT-object"
1689 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass ("-r -o -d"): at least 32 dangling references must be counted,
# while stat on the guard file must still fail.
1691 echo "Trigger layout LFSCK to find out dangling reference"
1692 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1694 wait_all_targets_blocked layout completed 3
1696 local repaired=$($SHOW_LAYOUT |
1697 awk '/^repaired_dangling/ { print $2 }')
1698 [ $repaired -ge 32 ] ||
1699 error "(4) Fail to repair dangling reference: $repaired"
1701 echo "'stat' should fail because of not repair dangling by default"
1702 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second pass adds -c; afterwards stat on the guard file must report Size 0.
1704 echo "Trigger layout LFSCK to repair dangling reference"
1705 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1707 wait_all_targets_blocked layout completed 7
1709 # There may be some async LFSCK updates in processing, wait for
1710 # a while until the target reparation has been done. LU-4970.
1712 echo "'stat' should success after layout LFSCK repairing"
1713 wait_update_facet client "stat $DIR/$tdir/guard |
1714 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1715 stat $DIR/$tdir/guard
1717 error "(8) unexpected size"
1720 repaired=$($SHOW_LAYOUT |
1721 awk '/^repaired_dangling/ { print $2 }')
1722 [ $repaired -ge 32 ] ||
1723 error "(9) Fail to repair dangling reference: $repaired"
1725 stop_full_debug_logging
# Restart the filesystem; the echo states the purpose (only setupall is on a
# visible line).
1727 echo "stopall to cleanup object cache"
1730 setupall > /dev/null
1732 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Test 15a: the layout LFSCK repairs OST-objects that point back to a
# non-existent MDT-object.
1736 echo "If the OST-object referenced by the MDT-object back points"
1737 echo "to some non-exist MDT-object, then the LFSCK should repair"
1738 echo "the OST-object to back point to the right MDT-object."
1741 check_mount_and_prep
1742 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# With fail_loc 0x1611 (OBD_FAIL_LFSCK_UNMATCHED_PAIR1) set on every OST
# node, write f0 and create and write the composite-layout file f1.
1744 echo "Inject failure stub to make the OST-object to back point to"
1745 echo "non-exist MDT-object."
1746 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1748 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1749 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1750 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1752 error "(0) Fail to create PFL $DIR/$tdir/f1"
1753 # 'dd' will trigger punch RPC firstly on every OST-objects.
1754 # So even though some OST-object will not be write by 'dd',
1755 # as long as it is allocated (may be NOT allocated in pfl_3b)
1756 # its layout information will be set also.
1757 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1758 cancel_lru_locks osc
1759 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Run the layout scan to completion and require at least 3 repaired
# unmatched pairs.
1761 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1762 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1764 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1765 mdd.${MDT_DEV}.lfsck_layout |
1766 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1768 error "(2) unexpected status"
1771 local repaired=$($SHOW_LAYOUT |
1772 awk '/^repaired_unmatched_pair/ { print $2 }')
1773 [ $repaired -ge 3 ] ||
1774 error "(3) Fail to repair unmatched pair: $repaired"
1776 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b: the layout LFSCK repairs OST-objects that point back to a
# different MDT-object than the one referencing them.
1780 echo "If the OST-object referenced by the MDT-object back points"
1781 echo "to other MDT-object that doesn't recognize the OST-object,"
1782 echo "then the LFSCK should repair it to back point to the right"
1783 echo "MDT-object (the first one)."
# Create and write 0/guard before any failure injection.
1786 check_mount_and_prep
1787 mkdir -p $DIR/$tdir/0
1788 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1789 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1790 cancel_lru_locks osc
1792 echo "Inject failure stub to make the OST-object to back point to"
1793 echo "other MDT-object"
# stripes becomes 2 when at least two OSTs exist; its initial value is not
# assigned on any visible line.
1796 [ $OSTCOUNT -ge 2 ] && stripes=2
# With fail_loc 0x1612 (OBD_FAIL_LFSCK_UNMATCHED_PAIR2) set on every OST
# node, write 0/f0 and create and write the composite-layout file f1.
1798 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1799 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1800 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1801 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1803 error "(0) Fail to create PFL $DIR/$tdir/f1"
1804 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1805 cancel_lru_locks osc
1806 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Run the layout scan to completion and require exactly 4 repaired
# unmatched pairs.
1808 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1809 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1811 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1812 mdd.${MDT_DEV}.lfsck_layout |
1813 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1815 error "(2) unexpected status"
1818 local repaired=$($SHOW_LAYOUT |
1819 awk '/^repaired_unmatched_pair/ { print $2 }')
1820 [ $repaired -eq 4 ] ||
1821 error "(3) Fail to repair unmatched pair: $repaired"
1823 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c: a layout LFSCK racing with a delayed directory migration must
# not report multiple-referenced repairs. Requires at least two MDTs and is
# skipped when the MDS version is 2.7.55 or newer (LU-6437).
1826 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1828 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1829 skip "Skip the test after 2.7.55 see LU-6437" && return
1832 echo "According to current metadata migration implementation,"
1833 echo "before the old MDT-object is removed, both the new MDT-object"
1834 echo "and old MDT-object will reference the same LOV layout. Then if"
1835 echo "the layout LFSCK finds the new MDT-object by race, it will"
1836 echo "regard related OST-object(s) as multiple referenced case, and"
1837 echo "will try to create new OST-object(s) for the new MDT-object."
1838 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1839 echo "MDT-object before confirm the multiple referenced case."
# Create directory a1 with "lfs mkdir -i 1" and write one 1 MiB file in it.
1842 check_mount_and_prep
1843 $LFS mkdir -i 1 $DIR/$tdir/a1
1844 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1845 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1846 cancel_lru_locks osc
# Set fail_val=5 / fail_loc 0x1803 (OBD_FAIL_MIGRATE_DELAY) on mds2 and run
# the migration in the background so that the scan overlaps with it.
1848 echo "Inject failure stub on MDT1 to delay the migration"
1850 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1851 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1852 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1853 $LFS migrate -m 0 $DIR/$tdir/a1 &
1856 echo "Trigger layout LFSCK to race with the migration"
1857 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1859 wait_all_targets_blocked layout completed 2
# Expected counters: exactly 1 repaired unmatched pair and 0 repaired
# multiple references.
1861 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1862 local repaired=$($SHOW_LAYOUT |
1863 awk '/^repaired_unmatched_pair/ { print $2 }')
1864 [ $repaired -eq 1 ] ||
1865 error "(3) Fail to repair unmatched pair: $repaired"
1867 repaired=$($SHOW_LAYOUT |
1868 awk '/^repaired_multiple_referenced/ { print $2 }')
1869 [ $repaired -eq 0 ] ||
1870 error "(4) Unexpectedly repaird multiple references: $repaired"
1872 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16: the layout LFSCK repairs an OST-object whose owner differs from
# the owner stored in the MDT-object.
1876 echo "If the OST-object's owner information does not match the owner"
1877 echo "information stored in the MDT-object, then the LFSCK trust the"
1878 echo "MDT-object and update the OST-object's owner information."
1881 check_mount_and_prep
1882 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1883 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1884 cancel_lru_locks osc
# As $RUNAS, create 100 files in d1 that are never written or setattr'ed
# (the creation of d1 itself is not on a visible line).
1886 # created but no setattr or write to the file.
1888 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
1889 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
# chown f0 while fail_loc 0x1613 (OBD_FAIL_LFSCK_BAD_OWNER) is set on the
# MDS.
1891 echo "Inject failure stub to skip OST-object owner changing"
1892 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1893 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1894 chown 1.1 $DIR/$tdir/f0
1895 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run the layout scan to completion; exactly one inconsistent owner must be
# repaired.
1897 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1900 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1902 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1903 mdd.${MDT_DEV}.lfsck_layout |
1904 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1906 error "(2) unexpected status"
1909 local repaired=$($SHOW_LAYOUT |
1910 awk '/^repaired_inconsistent_owner/ { print $2 }')
1911 [ $repaired -eq 1 ] ||
1912 error "(3) Fail to repair inconsistent owner: $repaired"
1914 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17: the layout LFSCK separates MDT-objects that reference the same
# OST-object.
1918 echo "If more than one MDT-objects reference the same OST-object,"
1919 echo "and the OST-object only recognizes one MDT-object, then the"
1920 echo "LFSCK should create new OST-objects for such non-recognized"
1924 check_mount_and_prep
1925 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# With fail_val=0 / fail_loc 0x1614 (OBD_FAIL_LFSCK_MULTIPLE_REF) set on the
# MDS, write guard (1 MiB), then create f0 via createmany and the
# composite-layout file f1, cancelling mdc/osc locks between steps.
1927 echo "Inject failure stub to make two MDT-objects to refernce"
1928 echo "the OST-object"
1930 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1931 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1932 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1933 cancel_lru_locks mdc
1934 cancel_lru_locks osc
1936 createmany -o $DIR/$tdir/f 1
1937 cancel_lru_locks mdc
1938 cancel_lru_locks osc
1940 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
1942 error "(0) Fail to create PFL $DIR/$tdir/f1"
1943 cancel_lru_locks mdc
1944 cancel_lru_locks osc
1945 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Before repair both f0 and f1 show guard's size (1048576) although nothing
# was written to them in this block.
1947 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1948 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
1949 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1950 [ $size -eq 1048576 ] ||
1951 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
1953 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
1954 [ $size -eq 1048576 ] ||
1955 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
# Run the layout scan to completion; exactly 2 multiple references must be
# repaired.
1957 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1960 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1962 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1963 mdd.${MDT_DEV}.lfsck_layout |
1964 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1966 error "(3) unexpected status"
1969 local repaired=$($SHOW_LAYOUT |
1970 awk '/^repaired_multiple_referenced/ { print $2 }')
1971 [ $repaired -eq 2 ] ||
1972 error "(4) Fail to repair multiple references: $repaired"
# After repair, writing 2 MiB to f0 and then to f1 must leave guard's size
# at 1048576.
1974 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1975 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1976 error "(5) Fail to write f0."
1977 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1978 [ $size -eq 1048576 ] ||
1979 error "(6) guard size should be 1048576, but got $size"
1981 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
1982 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
1983 error "(7) Fail to write f1."
1984 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1985 [ $size -eq 1048576 ] ||
1986 error "(8) guard size should be 1048576, but got $size"
1988 run_test 17 "LFSCK can repair multiple references"
# Set the debug parameter to "+cache" through lctl, discarding stdout. The
# "+" prefix presumably adds the flag to the current debug mask rather than
# replacing it — confirm against the lctl documentation.
1990 $LCTL set_param debug=+cache > /dev/null
# test 18a: the MDT-object survives but its layout EA (stripe information)
# is lost. Layout LFSCK in orphan mode must regenerate the layout EA so the
# files report their original size again.
1994 echo "The target MDT-object is there, but related stripe information"
1995 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1996 echo "layout EA entries."
1999 check_mount_and_prep
# a1/f1: stripe count 1, 2MB of zeros.
2000 $LFS mkdir -i 0 $DIR/$tdir/a1
2001 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2002 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# "ls -il" prepends the inode number, so the size is column 6 (not 5).
2004 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
# FID and layout are printed for the test log only.
2006 $LFS path2fid $DIR/$tdir/a1/f1
2007 $LFS getstripe $DIR/$tdir/a1/f1
# With a second MDT, also create a2/f2: stripe count 2, 1M stripe size.
2009 if [ $MDSCOUNT -ge 2 ]; then
2010 $LFS mkdir -i 1 $DIR/$tdir/a2
2011 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2012 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2013 $LFS path2fid $DIR/$tdir/a2/f2
2014 $LFS getstripe $DIR/$tdir/a2/f2
# f3: PFL file with two components ([0, 1M) and [1M, EOF)).
2017 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2018 error "(0) Fail to create PFL $DIR/$tdir/f3"
2020 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2022 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2024 $LFS path2fid $DIR/$tdir/f3
2025 $LFS getstripe $DIR/$tdir/f3
2027 cancel_lru_locks osc
# With fail_loc 0x1615 armed on the MDS, the chown calls below are the
# operations that make each file lose its layout EA.
2029 echo "Inject failure, to make the MDT-object lost its layout EA"
2030 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2031 do_facet mds1 $LCTL set_param fail_loc=0x1615
2032 chown 1.1 $DIR/$tdir/a1/f1
2034 if [ $MDSCOUNT -ge 2 ]; then
2035 do_facet mds2 $LCTL set_param fail_loc=0x1615
2036 chown 1.1 $DIR/$tdir/a2/f2
2039 chown 1.1 $DIR/$tdir/f3
# Disarm the fail_loc and drop client locks so sizes are re-fetched.
2044 do_facet mds1 $LCTL set_param fail_loc=0
2045 if [ $MDSCOUNT -ge 2 ]; then
2046 do_facet mds2 $LCTL set_param fail_loc=0
2049 cancel_lru_locks mdc
2050 cancel_lru_locks osc
# Sanity check of the injection: sizes must now differ from the saved ones.
2052 echo "The file size should be incorrect since layout EA is lost"
2053 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2054 [ "$cur_size" != "$saved_size1" ] ||
2055 error "(1) Expect incorrect file1 size"
# f2 was written with the same dd as f1, so saved_size1 is reused for it.
2057 if [ $MDSCOUNT -ge 2 ]; then
2058 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2059 [ "$cur_size" != "$saved_size1" ] ||
2060 error "(2) Expect incorrect file2 size"
2063 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2064 [ "$cur_size" != "$saved_size2" ] ||
2065 error "(1.2) Expect incorrect file3 size"
2067 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2068 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2070 for k in $(seq $MDSCOUNT); do
2071 # The LFSCK status query internal is 30 seconds. For the case
2072 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2073 # time to guarantee the status sync up.
# NOTE(review): "\\\$2" is presumably escaped so that awk's $2 survives the
# re-evaluation done inside wait_update_facet - confirm against that helper.
2074 wait_update_facet mds${k} "$LCTL get_param -n \
2075 mdd.$(facet_svc mds${k}).lfsck_layout |
2076 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2077 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without waiting.
2080 for k in $(seq $OSTCOUNT); do
2081 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2082 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2083 awk '/^status/ { print $2 }')
2084 [ "$cur_status" == "completed" ] ||
2085 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): 3 on mds1 presumably = 1 object of a1/f1 + 2 of the
# two-component f3; 2 on mds2 = the two stripes of a2/f2 - confirm.
2088 local repaired=$(do_facet mds1 $LCTL get_param -n \
2089 mdd.$(facet_svc mds1).lfsck_layout |
2090 awk '/^repaired_orphan/ { print $2 }')
2091 [ $repaired -eq 3 ] ||
2092 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2094 if [ $MDSCOUNT -ge 2 ]; then
2095 repaired=$(do_facet mds2 $LCTL get_param -n \
2096 mdd.$(facet_svc mds2).lfsck_layout |
2097 awk '/^repaired_orphan/ { print $2 }')
2098 [ $repaired -eq 2 ] ||
2099 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print FID and layout again for the log, now after the repair.
2102 $LFS path2fid $DIR/$tdir/a1/f1
2103 $LFS getstripe $DIR/$tdir/a1/f1
2105 if [ $MDSCOUNT -ge 2 ]; then
2106 $LFS path2fid $DIR/$tdir/a2/f2
2107 $LFS getstripe $DIR/$tdir/a2/f2
2110 $LFS path2fid $DIR/$tdir/f3
2111 $LFS getstripe $DIR/$tdir/f3
# Final verdict: every file must report its saved size again.
2113 echo "The file size should be correct after layout LFSCK scanning"
2114 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2115 [ "$cur_size" == "$saved_size1" ] ||
2116 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2118 if [ $MDSCOUNT -ge 2 ]; then
2119 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2120 [ "$cur_size" == "$saved_size1" ] ||
2121 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is about f3, so the message names file3.
2124 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2125 [ "$cur_size" == "$saved_size2" ] ||
2126 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2128 run_test 18a "Find out orphan OST-object and repair it (1)"
# test 18b: the MDT-object itself is lost. Layout LFSCK in orphan mode must
# re-create it under .lustre/lost+found/MDTxxxx as "<fid>-R-0"; the test then
# moves the files back and checks their sizes.
2131 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2134 echo "The target MDT-object is lost. The LFSCK should re-create the"
2135 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2136 echo "can move it back to normal namespace manually."
2139 check_mount_and_prep
# a1/f1: stripe count 1, 2MB. Size and FID are saved for later checks.
2140 $LFS mkdir -i 0 $DIR/$tdir/a1
2141 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2142 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# "ls -il" prepends the inode number, so the size is column 6.
2143 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2144 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2146 $LFS getstripe $DIR/$tdir/a1/f1
# With a second MDT, also create a2/f2: stripe count 2, 1M stripe size.
2148 if [ $MDSCOUNT -ge 2 ]; then
2149 $LFS mkdir -i 1 $DIR/$tdir/a2
2150 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2151 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
# Declared local like fid1/fid3 so it does not leak into the global scope.
2152 local fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2154 $LFS getstripe $DIR/$tdir/a2/f2
# f3: PFL file with two components.
2157 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2158 error "(0) Fail to create PFL $DIR/$tdir/f3"
2160 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2162 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2163 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2165 $LFS getstripe $DIR/$tdir/f3
2167 cancel_lru_locks osc
# With fail_loc 0x1616 armed, the rm calls below simulate losing the
# MDT-object.
2169 echo "Inject failure, to simulate the case of missing the MDT-object"
2170 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2171 do_facet mds1 $LCTL set_param fail_loc=0x1616
2172 rm -f $DIR/$tdir/a1/f1
2174 if [ $MDSCOUNT -ge 2 ]; then
2175 do_facet mds2 $LCTL set_param fail_loc=0x1616
2176 rm -f $DIR/$tdir/a2/f2
# Disarm the fail_loc and drop client locks.
2184 do_facet mds1 $LCTL set_param fail_loc=0
2185 if [ $MDSCOUNT -ge 2 ]; then
2186 do_facet mds2 $LCTL set_param fail_loc=0
2189 cancel_lru_locks mdc
2190 cancel_lru_locks osc
2192 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2193 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2195 for k in $(seq $MDSCOUNT); do
2196 # The LFSCK status query internal is 30 seconds. For the case
2197 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2198 # time to guarantee the status sync up.
2199 wait_update_facet mds${k} "$LCTL get_param -n \
2200 mdd.$(facet_svc mds${k}).lfsck_layout |
2201 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2202 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without waiting.
2205 for k in $(seq $OSTCOUNT); do
2206 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2207 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2208 awk '/^status/ { print $2 }')
2209 [ "$cur_status" == "completed" ] ||
2210 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): 3 on mds1 presumably = 1 object of a1/f1 + 2 of the
# two-component f3; 2 on mds2 = the two stripes of a2/f2 - confirm.
2213 local repaired=$(do_facet mds1 $LCTL get_param -n \
2214 mdd.$(facet_svc mds1).lfsck_layout |
2215 awk '/^repaired_orphan/ { print $2 }')
2216 [ $repaired -eq 3 ] ||
2217 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2219 if [ $MDSCOUNT -ge 2 ]; then
2220 repaired=$(do_facet mds2 $LCTL get_param -n \
2221 mdd.$(facet_svc mds2).lfsck_layout |
2222 awk '/^repaired_orphan/ { print $2 }')
2223 [ $repaired -eq 2 ] ||
2224 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# Move each recovered "<fid>-R-0" entry back to its original path.
2227 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2228 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2229 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2231 if [ $MDSCOUNT -ge 2 ]; then
2232 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2233 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Distinct label so this failure is not confused with the f1 move, "(5)".
2236 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2237 error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
# FID and layout of the restored files, printed for the log.
2239 $LFS path2fid $DIR/$tdir/a1/f1
2240 $LFS getstripe $DIR/$tdir/a1/f1
2242 if [ $MDSCOUNT -ge 2 ]; then
2243 $LFS path2fid $DIR/$tdir/a2/f2
2244 $LFS getstripe $DIR/$tdir/a2/f2
2247 $LFS path2fid $DIR/$tdir/f3
2248 $LFS getstripe $DIR/$tdir/f3
# Final verdict: restored files must report their saved sizes.
2250 echo "The file size should be correct after layout LFSCK scanning"
2251 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2252 [ "$cur_size" == "$saved_size1" ] ||
2253 error "(7) Expect file1 size $saved_size1, but got $cur_size"
# f2 was written with the same dd as f1, so saved_size1 is reused for it.
2255 if [ $MDSCOUNT -ge 2 ]; then
2256 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2257 [ "$cur_size" == "$saved_size1" ] ||
2258 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is about f3, so the message names file3.
2261 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2262 [ "$cur_size" == "$saved_size2" ] ||
2263 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2265 run_test 18b "Find out orphan OST-object and repair it (2)"
# test 18c: the MDT-object is lost AND the OST-objects carry no parent FID.
# Layout LFSCK must re-create the MDT-objects with new FIDs, as "*-N-*"
# entries under .lustre/lost+found/MDT0000.
2268 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2271 echo "The target MDT-object is lost, and the OST-object FID is missing."
2272 echo "The LFSCK should re-create the MDT-object with new FID under the "
2273 echo "directory .lustre/lost+found/MDTxxxx."
2276 check_mount_and_prep
2277 $LFS mkdir -i 0 $DIR/$tdir/a1
2278 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
# fail_loc 0x1617 is armed on all OST nodes while the files are written,
# and cleared only after the writes and the lock cancel below.
2280 echo "Inject failure, to simulate the case of missing parent FID"
2281 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2282 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2284 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2285 $LFS getstripe $DIR/$tdir/a1/f1
# With a second MDT, a2 is created with "-i 1" but striped from OST index 0.
2287 if [ $MDSCOUNT -ge 2 ]; then
2288 $LFS mkdir -i 1 $DIR/$tdir/a2
2289 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2290 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2291 $LFS getstripe $DIR/$tdir/a2/f2
# f3: PFL file with two components.
2294 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2295 error "(0) Fail to create PFL $DIR/$tdir/f3"
2297 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2298 $LFS getstripe $DIR/$tdir/f3
2300 cancel_lru_locks osc
2301 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Second injection: with 0x1616 armed, the rm calls simulate losing the
# MDT-objects.
2303 echo "Inject failure, to simulate the case of missing the MDT-object"
2304 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2305 do_facet mds1 $LCTL set_param fail_loc=0x1616
2306 rm -f $DIR/$tdir/a1/f1
2308 if [ $MDSCOUNT -ge 2 ]; then
2309 do_facet mds2 $LCTL set_param fail_loc=0x1616
2310 rm -f $DIR/$tdir/a2/f2
2318 do_facet mds1 $LCTL set_param fail_loc=0
2319 if [ $MDSCOUNT -ge 2 ]; then
2320 do_facet mds2 $LCTL set_param fail_loc=0
2323 cancel_lru_locks mdc
2324 cancel_lru_locks osc
2326 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2327 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2329 for k in $(seq $MDSCOUNT); do
2330 # The LFSCK status query internal is 30 seconds. For the case
2331 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2332 # time to guarantee the status sync up.
2333 wait_update_facet mds${k} "$LCTL get_param -n \
2334 mdd.$(facet_svc mds${k}).lfsck_layout |
2335 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2336 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without waiting.
2339 for k in $(seq $OSTCOUNT); do
2340 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2341 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2342 awk '/^status/ { print $2 }')
2343 [ "$cur_status" == "completed" ] ||
2344 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): $expected depends on MDSCOUNT; its assignment is not on the
# lines shown here - confirm the values.
2347 if [ $MDSCOUNT -ge 2 ]; then
2353 local repaired=$(do_facet mds1 $LCTL get_param -n \
2354 mdd.$(facet_svc mds1).lfsck_layout |
2355 awk '/^repaired_orphan/ { print $2 }')
2356 [ $repaired -eq $expected ] ||
2357 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# mds2 is expected to repair nothing in this scenario.
2359 if [ $MDSCOUNT -ge 2 ]; then
2360 repaired=$(do_facet mds2 $LCTL get_param -n \
2361 mdd.$(facet_svc mds2).lfsck_layout |
2362 awk '/^repaired_orphan/ { print $2 }')
2363 [ $repaired -eq 0 ] ||
2364 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2367 ls -ail $MOUNT/.lustre/lost+found/
# The -name pattern is quoted so the shell cannot expand it against the
# current directory before find sees it.
2369 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2370 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2371 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-N-*")
2373 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2376 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2377 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2378 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2380 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-N-*")
2381 [ ! -z "$cname" ] ||
2382 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2384 run_test 18c "Find out orphan OST-object and repair it (3)"
# test 18d: the layout EA of f2/f4 is corrupted to point at another file's
# OST-object, which is then destroyed; the original OST-objects of f2/f4
# survive as orphans. Layout LFSCK must re-attach the orphans rather than
# create new OST-objects (repaired_orphan == 2, repaired_dangling == 0).
2388 echo "The target MDT-object layout EA is corrupted, but the right"
2389 echo "OST-object is still alive as orphan. The layout LFSCK will"
2390 echo "not create new OST-object to occupy such slot."
2393 check_mount_and_prep
# f1/f3 are the "guard" files whose objects f2/f4 will be redirected to;
# f4 is a PFL file, f2 a plain single-stripe file.
2395 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2396 echo "guard" > $DIR/$tdir/a1/f1
2397 echo "foo" > $DIR/$tdir/a1/f2
2399 echo "guard" > $DIR/$tdir/a1/f3
2400 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2401 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2402 echo "foo" > $DIR/$tdir/a1/f4
# "ls -il" prepends the inode number, so the size is column 6.
2404 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2405 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# FIDs and layouts are printed for the test log only.
2406 $LFS path2fid $DIR/$tdir/a1/f1
2407 $LFS getstripe $DIR/$tdir/a1/f1
2408 $LFS path2fid $DIR/$tdir/a1/f2
2409 $LFS getstripe $DIR/$tdir/a1/f2
2410 $LFS path2fid $DIR/$tdir/a1/f3
2411 $LFS getstripe $DIR/$tdir/a1/f3
2412 $LFS path2fid $DIR/$tdir/a1/f4
2413 $LFS getstripe $DIR/$tdir/a1/f4
2414 cancel_lru_locks osc
2416 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2417 echo "to reference the same OST-object (which is f1's OST-obejct)."
2418 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2419 echo "dangling reference case, but f2's old OST-object is there."
2421 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2422 echo "to reference the same OST-object (which is f3's OST-obejct)."
2423 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2424 echo "dangling reference case, but f4's old OST-object is there."
# With 0x1618 armed, the chown calls trigger the stripe change on f2/f4;
# the guard files are then removed.
2427 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2428 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2429 chown 1.1 $DIR/$tdir/a1/f2
2430 chown 1.1 $DIR/$tdir/a1/f4
2431 rm -f $DIR/$tdir/a1/f1
2432 rm -f $DIR/$tdir/a1/f3
2435 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2437 echo "stopall to cleanup object cache"
2440 setupall > /dev/null
# LFSCK is started with "-c -d" in addition to the "-r -o" used by 18a-18c.
2442 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2443 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
2445 for k in $(seq $MDSCOUNT); do
2446 # The LFSCK status query internal is 30 seconds. For the case
2447 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2448 # time to guarantee the status sync up.
2449 wait_update_facet mds${k} "$LCTL get_param -n \
2450 mdd.$(facet_svc mds${k}).lfsck_layout |
2451 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2452 error "(3) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without waiting.
2455 for k in $(seq $OSTCOUNT); do
2456 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2457 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2458 awk '/^status/ { print $2 }')
2459 [ "$cur_status" == "completed" ] ||
2460 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Two orphans (f2's and f4's original objects) must have been re-attached...
2463 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2464 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2465 awk '/^repaired_orphan/ { print $2 }')
2466 [ $repaired -eq 2 ] ||
2467 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
# ...and no dangling reference may have been "repaired".
2469 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2470 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2471 awk '/^repaired_dangling/ { print $2 }')
2472 [ $repaired -eq 0 ] ||
2473 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
2475 echo "The file size should be correct after layout LFSCK scanning"
2476 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2477 [ "$cur_size" == "$saved_size1" ] ||
2478 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2480 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2481 [ "$cur_size" == "$saved_size2" ] ||
2482 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# Content, FID and layout are dumped for the log; content is not asserted.
2484 echo "The LFSCK should find back the original data."
2485 cat $DIR/$tdir/a1/f2
2486 $LFS path2fid $DIR/$tdir/a1/f2
2487 $LFS getstripe $DIR/$tdir/a1/f2
2488 cat $DIR/$tdir/a1/f4
2489 $LFS path2fid $DIR/$tdir/a1/f4
2490 $LFS getstripe $DIR/$tdir/a1/f4
2492 run_test 18d "Find out orphan OST-object and repair it (4)"
# test 18e: same corruption as 18d, but while LFSCK is held in
# scanning-phase2 the test appends to f2/f4, modifying the replacement
# OST-objects. LFSCK must then keep the new data in f2/f4 and expose the
# old orphan objects as "*-C-*" stub files under .lustre/lost+found/MDT0000.
2495 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2498 echo "The target MDT-object layout EA slot is occpuied by some new"
2499 echo "created OST-object when repair dangling reference case. Such"
2500 echo "conflict OST-object has been modified by others. To keep the"
2501 echo "new data, the LFSCK will create a new file to refernece this"
2502 echo "old orphan OST-object."
2505 check_mount_and_prep
# f1/f3 are guard files; f2 is a plain file and f4 a PFL file.
2507 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2508 echo "guard" > $DIR/$tdir/a1/f1
2509 echo "foo" > $DIR/$tdir/a1/f2
2511 echo "guard" > $DIR/$tdir/a1/f3
2512 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2513 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2514 echo "foo" > $DIR/$tdir/a1/f4
# "ls -il" prepends the inode number, so the size is column 6.
2516 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2517 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# FIDs and layouts are printed for the test log only.
2519 $LFS path2fid $DIR/$tdir/a1/f1
2520 $LFS getstripe $DIR/$tdir/a1/f1
2521 $LFS path2fid $DIR/$tdir/a1/f2
2522 $LFS getstripe $DIR/$tdir/a1/f2
2523 $LFS path2fid $DIR/$tdir/a1/f3
2524 $LFS getstripe $DIR/$tdir/a1/f3
2525 $LFS path2fid $DIR/$tdir/a1/f4
2526 $LFS getstripe $DIR/$tdir/a1/f4
2527 cancel_lru_locks osc
2529 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2530 echo "to reference the same OST-object (which is f1's OST-obejct)."
2531 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2532 echo "dangling reference case, but f2's old OST-object is there."
2534 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2535 echo "to reference the same OST-object (which is f3's OST-obejct)."
2536 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2537 echo "dangling reference case, but f4's old OST-object is there."
# With 0x1618 armed, the chown calls trigger the stripe change on f2/f4;
# the guard files are then removed.
2540 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2541 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2542 chown 1.1 $DIR/$tdir/a1/f2
2543 chown 1.1 $DIR/$tdir/a1/f4
2544 rm -f $DIR/$tdir/a1/f1
2545 rm -f $DIR/$tdir/a1/f3
2548 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2550 echo "stopall to cleanup object cache"
2553 setupall > /dev/null
# Delay injection (fail_val=10) so the test can write to f2/f4 while LFSCK
# is still in scanning-phase2.
2555 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2556 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2558 start_full_debug_logging
2560 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2561 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2563 wait_update_facet mds1 "$LCTL get_param -n \
2564 mdd.$(facet_svc mds1).lfsck_layout |
2565 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2566 error "(3) MDS1 is not the expected 'scanning-phase2'"
2568 # to guarantee all updates are synced.
2572 echo "Write new data to f2/f4 to modify the new created OST-object."
2573 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2574 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
# Release the delay so LFSCK can finish.
2576 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2578 for k in $(seq $MDSCOUNT); do
2579 # The LFSCK status query internal is 30 seconds. For the case
2580 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2581 # time to guarantee the status sync up.
2582 wait_update_facet mds${k} "$LCTL get_param -n \
2583 mdd.$(facet_svc mds${k}).lfsck_layout |
2584 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2585 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without waiting.
2588 for k in $(seq $OSTCOUNT); do
2589 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2590 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2591 awk '/^status/ { print $2 }')
2592 [ "$cur_status" == "completed" ] ||
2593 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2596 stop_full_debug_logging
2598 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2599 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2600 awk '/^repaired_orphan/ { print $2 }')
2601 [ $repaired -eq 2 ] ||
2602 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2604 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2605 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2606 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Exactly two conflict stubs are expected; on mismatch they are listed
# before failing.
2608 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2609 if [ $count -ne 2 ]; then
2610 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2611 error "(8) Expect 2 stubs under lost+found, but got $count"
# The -name pattern is quoted so the shell cannot expand it against the
# current directory before find sees it. The first and last stub found must
# each have the size of the original f2 or f4.
2614 echo "The stub file should keep the original f2 or f4 data"
2615 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*" | head -n 1)
2616 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2617 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2618 error "(9) Got unexpected $cur_size"
2621 $LFS path2fid $cname
2622 $LFS getstripe $cname
2624 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*" | tail -n 1)
2625 cur_size=$(ls -il $cname | awk '{ print $6 }')
2626 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2627 error "(10) Got unexpected $cur_size"
2630 $LFS path2fid $cname
2631 $LFS getstripe $cname
# Content, FID and layout are dumped for the log; content is not asserted.
2633 echo "The f2/f4 should contains new data."
2634 cat $DIR/$tdir/a1/f2
2635 $LFS path2fid $DIR/$tdir/a1/f2
2636 $LFS getstripe $DIR/$tdir/a1/f2
2637 cat $DIR/$tdir/a1/f4
2638 $LFS path2fid $DIR/$tdir/a1/f4
2639 $LFS getstripe $DIR/$tdir/a1/f4
2641 run_test 18e "Find out orphan OST-object and repair it (5)"
# test 18f: MDT-objects are lost and, during the first LFSCK run, a failure
# is injected on mds1 (fail_loc 0x161c, fail_val=0). That run must end as
# "partial" on the MDTs and ost1 while ost2 completes; a second, clean run
# must end as "completed" everywhere.
2644 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2647 echo "The target MDT-object is lost. The LFSCK should re-create the"
2648 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2649 echo "to verify some OST-object(s) during the first stage-scanning,"
2650 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2651 echo "should not be affected."
2654 check_mount_and_prep
# a1: single-stripe files (guard + f1); a2: one two-stripe file (f2).
2655 $LFS mkdir -i 0 $DIR/$tdir/a1
2656 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2657 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2658 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2659 $LFS mkdir -i 0 $DIR/$tdir/a2
2660 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2661 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2662 $LFS getstripe $DIR/$tdir/a1/f1
2663 $LFS getstripe $DIR/$tdir/a2/f2
# With a second MDT, mirror the same setup as a3/a4 using "mkdir -i 1".
2665 if [ $MDSCOUNT -ge 2 ]; then
2666 $LFS mkdir -i 1 $DIR/$tdir/a3
2667 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2668 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2669 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2670 $LFS mkdir -i 1 $DIR/$tdir/a4
2671 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2672 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2673 $LFS getstripe $DIR/$tdir/a3/f3
2674 $LFS getstripe $DIR/$tdir/a4/f4
2677 cancel_lru_locks osc
# With 0x1616 armed, the rm calls simulate losing the MDT-objects; the
# guard files are left in place.
2679 echo "Inject failure, to simulate the case of missing the MDT-object"
2680 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2681 do_facet mds1 $LCTL set_param fail_loc=0x1616
2682 rm -f $DIR/$tdir/a1/f1
2683 rm -f $DIR/$tdir/a2/f2
2685 if [ $MDSCOUNT -ge 2 ]; then
2686 do_facet mds2 $LCTL set_param fail_loc=0x1616
2687 rm -f $DIR/$tdir/a3/f3
2688 rm -f $DIR/$tdir/a4/f4
2694 do_facet mds1 $LCTL set_param fail_loc=0
2695 if [ $MDSCOUNT -ge 2 ]; then
2696 do_facet mds2 $LCTL set_param fail_loc=0
2699 cancel_lru_locks mdc
2700 cancel_lru_locks osc
2702 echo "Inject failure, to simulate the OST0 fail to handle"
2703 echo "MDT0 LFSCK request during the first-stage scanning."
2704 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2705 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
# First run: expected to finish as "partial".
2707 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2708 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2710 for k in $(seq $MDSCOUNT); do
2711 # The LFSCK status query internal is 30 seconds. For the case
2712 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2713 # time to guarantee the status sync up.
2714 wait_update_facet mds${k} "$LCTL get_param -n \
2715 mdd.$(facet_svc mds${k}).lfsck_layout |
2716 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2717 error "(2) MDS${k} is not the expected 'partial'"
# ost1 must end as "partial", ost2 as "completed".
2720 wait_update_facet ost1 "$LCTL get_param -n \
2721 obdfilter.$(facet_svc ost1).lfsck_layout |
2722 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2723 error "(3) OST1 is not the expected 'partial'"
2726 wait_update_facet ost2 "$LCTL get_param -n \
2727 obdfilter.$(facet_svc ost2).lfsck_layout |
2728 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2729 error "(4) OST2 is not the expected 'completed'"
2732 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# NOTE(review): 1 orphan per MDT after the partial run is presumably the
# second-OST stripe of the two-stripe file - confirm.
2734 local repaired=$(do_facet mds1 $LCTL get_param -n \
2735 mdd.$(facet_svc mds1).lfsck_layout |
2736 awk '/^repaired_orphan/ { print $2 }')
2737 [ $repaired -eq 1 ] ||
2738 error "(5) Expect 1 fixed on mds1, but got: $repaired"
2740 if [ $MDSCOUNT -ge 2 ]; then
2741 repaired=$(do_facet mds2 $LCTL get_param -n \
2742 mdd.$(facet_svc mds2).lfsck_layout |
2743 awk '/^repaired_orphan/ { print $2 }')
2744 [ $repaired -eq 1 ] ||
2745 error "(6) Expect 1 fixed on mds2, but got: $repaired"
# Second run without any injected failure: expected to complete everywhere.
2748 echo "Trigger layout LFSCK on all devices again to cleanup"
2749 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2751 for k in $(seq $MDSCOUNT); do
2752 # The LFSCK status query internal is 30 seconds. For the case
2753 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2754 # time to guarantee the status sync up.
2755 wait_update_facet mds${k} "$LCTL get_param -n \
2756 mdd.$(facet_svc mds${k}).lfsck_layout |
2757 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2758 error "(8) MDS${k} is not the expected 'completed'"
# cur_status is declared local so it does not leak into the global scope.
2761 for k in $(seq $OSTCOUNT); do
2762 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2763 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2764 awk '/^status/ { print $2 }')
2765 [ "$cur_status" == "completed" ] ||
2766 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# "repaired" is already local to this function (declared after the first
# run), so it is only re-assigned here.
2770 repaired=$(do_facet mds1 $LCTL get_param -n \
2771 mdd.$(facet_svc mds1).lfsck_layout |
2772 awk '/^repaired_orphan/ { print $2 }')
2773 [ $repaired -eq 2 ] ||
2774 error "(10) Expect 2 fixed on mds1, but got: $repaired"
2776 if [ $MDSCOUNT -ge 2 ]; then
2777 repaired=$(do_facet mds2 $LCTL get_param -n \
2778 mdd.$(facet_svc mds2).lfsck_layout |
2779 awk '/^repaired_orphan/ { print $2 }')
2780 [ $repaired -eq 2 ] ||
2781 error "(11) Expect 2 fixed on mds2, but got: $repaired"
2784 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# test 18g: the MDT-object is lost but its OI mapping is kept (fail_loc
# 0x162e). Layout LFSCK must still re-create the object as "<fid>-R-0"
# under .lustre/lost+found/MDT0000, from where the test moves it back.
2787 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2790 echo "The target MDT-object is lost, but related OI mapping is there"
2791 echo "The LFSCK should recreate the lost MDT-object without affected"
2792 echo "by the stale OI mapping."
2795 check_mount_and_prep
# f1 is striped with "-c -1" and written with one 1M block per OST
# (count=$OSTCOUNT), matching the 1M stripe size.
2796 $LFS mkdir -i 0 $DIR/$tdir/a1
2797 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2798 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
# The FID is saved because the recovered entry is named after it.
2799 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2801 $LFS getstripe $DIR/$tdir/a1/f1
2802 cancel_lru_locks osc
# With 0x162e armed, the rm below simulates the loss.
2804 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2805 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2806 do_facet mds1 $LCTL set_param fail_loc=0x162e
2807 rm -f $DIR/$tdir/a1/f1
2809 do_facet mds1 $LCTL set_param fail_loc=0
2810 cancel_lru_locks mdc
2811 cancel_lru_locks osc
2813 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2814 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2816 for k in $(seq $MDSCOUNT); do
2817 # The LFSCK status query internal is 30 seconds. For the case
2818 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2819 # time to guarantee the status sync up.
2820 wait_update_facet mds${k} "$LCTL get_param -n \
2821 mdd.$(facet_svc mds${k}).lfsck_layout |
2822 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2823 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without waiting.
2826 for k in $(seq $OSTCOUNT); do
2827 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2828 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2829 awk '/^status/ { print $2 }')
2830 [ "$cur_status" == "completed" ] ||
2831 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# One repaired orphan is expected per OST.
2834 local repaired=$(do_facet mds1 $LCTL get_param -n \
2835 mdd.$(facet_svc mds1).lfsck_layout |
2836 awk '/^repaired_orphan/ { print $2 }')
2837 [ $repaired -eq $OSTCOUNT ] ||
2838 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
2840 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2841 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2842 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
# FID and layout of the restored file, printed for the log.
2844 $LFS path2fid $DIR/$tdir/a1/f1
2845 $LFS getstripe $DIR/$tdir/a1/f1
2847 run_test 18g "Find out orphan OST-object and repair it (7)"
# test 18h: a PFL file's extent range is corrupted (fail_loc 0x162f). Writes
# to it must fail until layout LFSCK rebuilds the PFL extents from the
# orphan OST-objects; afterwards the data must match an untouched copy.
2851 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
2852 echo "the layout LFSCK will keep the bad PFL file(s) there without"
2853 echo "scanning its OST-object(s). Then in the second stage scanning,"
2854 echo "the OST will return related OST-object(s) to the MDT as orphan."
2855 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
2856 echo "the 'orphan(s)' stripe information."
2859 check_mount_and_prep
# f0: two-component PFL file ([0, 2M) and [2M, EOF)).
2861 $LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
2862 error "(0) Fail to create PFL $DIR/$tdir/f0"
# Write test-framework.sh at offset 0 and again at offset 2M (seek=2 with
# bs=1M), so data lands in both components.
2864 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
2865 error "(1.1) Fail to write $DIR/$tdir/f0"
2867 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
2868 error "(1.2) Fail to write $DIR/$tdir/f0"
# Reference copy used for the data comparison after the repair.
2870 cp $DIR/$tdir/f0 $DIR/$tdir/guard
# With 0x162f armed, the chown below is the operation that corrupts the
# PFL range.
2872 echo "Inject failure stub to simulate bad PFL extent range"
2873 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
2874 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
2876 chown 1.1 $DIR/$tdir/f0
2878 cancel_lru_locks mdc
2879 cancel_lru_locks osc
2880 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Inverted check: a successful dd here is the failure case.
2882 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
2883 error "(2) Write to bad PFL file should fail"
2885 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
2886 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2888 for k in $(seq $MDSCOUNT); do
2889 # The LFSCK status query internal is 30 seconds. For the case
2890 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2891 # time to guarantee the status sync up.
2892 wait_update_facet mds${k} "$LCTL get_param -n \
2893 mdd.$(facet_svc mds${k}).lfsck_layout |
2894 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2895 error "(4.1) MDS${k} is not the expected 'completed'"
# cur_status is declared local so it does not leak into the global scope.
2898 for k in $(seq $OSTCOUNT); do
2899 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2900 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2901 awk '/^status/ { print $2 }')
2902 [ "$cur_status" == "completed" ] ||
2903 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): $SHOW_LAYOUT is defined outside this excerpt; presumably it
# prints the lfsck_layout status of the MDT - confirm.
2907 local repaired=$($SHOW_LAYOUT |
2908 awk '/^repaired_orphan/ { print $2 }')
2909 [ $repaired -eq 2 ] ||
2910 error "(5) Fail to repair crashed PFL range: $repaired"
2912 echo "Data in $DIR/$tdir/f0 should not be broken"
2913 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
2914 error "(6) Data in $DIR/$tdir/f0 is broken"
2916 echo "Write should succeed after LFSCK repairing the bad PFL range"
2917 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
2918 error "(7) Write should succeed after LFSCK"
2920 run_test 18h "LFSCK can repair crashed PFL extent range"
# Remove the "cache" flag from the debug parameter again (it was added
# before test 18a); command output is discarded.
2922 $LCTL set_param debug=-cache > /dev/null
# test 19a: with lfsck_verify_pfid enabled on OST0000 and fail_loc 0x1619
# set on the client, reads of a plain file (a0) and a PFL file (a1) are
# expected to fail.
2925 check_mount_and_prep
2926 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# Verification is switched off while the test files are created.
2928 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2929 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2931 echo "foo1" > $DIR/$tdir/a0
2932 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
2933 error "(0) Fail to create PFL $DIR/$tdir/a1"
2934 echo "foo2" > $DIR/$tdir/a1
2935 echo "guard" > $DIR/$tdir/a2
2936 cancel_lru_locks osc
# Switch verification on, then arm the fail_loc locally (plain $LCTL, no
# do_facet/do_nodes wrapper).
2938 echo "Inject failure, then client will offer wrong parent FID when read"
2939 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2940 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2942 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2943 $LCTL set_param fail_loc=0x1619
# Inverted checks: a successful cat is the failure case.
2945 echo "Read RPC with wrong parent FID should be denied"
2946 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
2947 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
2948 $LCTL set_param fail_loc=0
2950 run_test 19a "OST-object inconsistency self detect"
# test 19b: OST-objects are created with a parent FID pointing at a
# non-existent MDT-object (fail_loc 0x1611). While lfsck_verify_pfid is off
# nothing is repaired; once it is on, reading the two files must succeed
# and the OST must report 2 repaired objects.
2953 check_mount_and_prep
2954 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2956 echo "Inject failure stub to make the OST-object to back point to"
2957 echo "non-exist MDT-object"
# Verification is switched off while the bad objects are created.
2959 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2960 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
# 0x1611 is armed on all OST nodes while f0 (plain) and f1 (PFL) are
# written.
2962 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2963 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
2964 echo "foo1" > $DIR/$tdir/f0
2965 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
2966 error "(0) Fail to create PFL $DIR/$tdir/f1"
2967 echo "foo2" > $DIR/$tdir/f1
2968 cancel_lru_locks osc
2969 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Writing 0 again before reading the counter.
# NOTE(review): presumably this resets the "repaired" statistics - confirm.
2971 do_facet ost1 $LCTL set_param -n \
2972 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2973 echo "Nothing should be fixed since self detect and repair is disabled"
2974 local repaired=$(do_facet ost1 $LCTL get_param -n \
2975 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2976 awk '/^repaired/ { print $2 }')
2977 [ $repaired -eq 0 ] ||
2978 error "(1) Expected 0 repaired, but got $repaired"
2980 echo "Read RPC with right parent FID should be accepted,"
2981 echo "and cause parent FID on OST to be fixed"
2983 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2984 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2986 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
2987 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
# Two files were read, so two repairs are expected; the message states the
# same number the check enforces.
2989 repaired=$(do_facet ost1 $LCTL get_param -n \
2990 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2991 awk '/^repaired/ { print $2 }')
2992 [ $repaired -eq 2 ] ||
2993 error "(3) Expected 2 repaired, but got $repaired"
2995 run_test 19b "OST-object inconsistency self repair"
2997 PATTERN_WITH_HOLE="40000001"
2998 PATTERN_WITHOUT_HOLE="raid0"
# test_20a: files f0..f3 lose their MDT-object (f1..f3 additionally lose one
# OST-object each).  Layout LFSCK run with "-r -o" must re-create them as
# $MOUNT/.lustre/lost+found/MDT0000/<fid>-R-0, using a dummy LOV EA slot
# (pattern $PATTERN_WITH_HOLE) for a lost stripe, and ordinary tools must be
# able to operate on the result: I/O to the hole fails, I/O to the surviving
# stripes works.
#
# Globals read: OSTCOUNT, MDSCOUNT, FILESET, DIR, MOUNT, tdir, LFS, LCTL,
#		START_LAYOUT, PATTERN_WITH_HOLE, PATTERN_WITHOUT_HOLE,
#		RUNAS_ID, RUNAS_GID
# Globals written: LOV_PATTERN_F_HOLE
test_20a() {
	local bcount
	local fid3
	local k

	if [ $OSTCOUNT -lt 2 ]; then
		skip "needs >= 2 OSTs"
		return 0
	fi
	# Return after skipping, exactly like the OST-count guard; without
	# it the test body would still run with FILESET set.
	if [ -n "$FILESET" ]; then
		skip "Not functional for FILESET set"
		return 0
	fi

	echo "The target MDT-object and some of its OST-object are lost."
	echo "The LFSCK should find out the left OST-objects and re-create"
	echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
	echo "with the partial OST-objects (LOV EA hole)."

	echo "New client can access the file with LOV EA hole via normal"
	echo "system tools or commands without crash the system."

	echo "For old client, even though it cannot access the file with"
	echo "LOV EA hole, it should not cause the system crash."

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/a1
	# bcount is the number of 4096-byte blocks written to each file; the
	# values follow from the per-stripe block counts listed below.
	if [ $OSTCOUNT -gt 2 ]; then
		$LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
		bcount=513
	else
		$LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
		bcount=257
	fi

	# 256 blocks on the stripe0.
	# 1 block on the stripe1 for 2 OSTs case.
	# 256 blocks on the stripe1 for other cases.
	# 1 block on the stripe2 if OSTs > 2
	dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount

	# The FIDs name the files LFSCK re-creates under lost+found.
	local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
	local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
	local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)

	$LFS getstripe $DIR/$tdir/a1/f0
	$LFS getstripe $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a1/f2

	if [ $OSTCOUNT -gt 2 ]; then
		dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
		fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
		$LFS getstripe $DIR/$tdir/a1/f3
	fi

	cancel_lru_locks osc

	echo "Inject failure..."
	echo "To simulate f0 lost MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/a1/f0

	echo "To simulate f1 lost MDT-object and OST-object0"
	#define OBD_FAIL_LFSCK_LOST_SPEOBJ	0x161a
	do_facet mds1 $LCTL set_param fail_loc=0x161a
	rm -f $DIR/$tdir/a1/f1

	# fail_val selects which OST-object is lost for the next unlink.
	echo "To simulate f2 lost MDT-object and OST-object1"
	do_facet mds1 $LCTL set_param fail_val=1
	rm -f $DIR/$tdir/a1/f2

	if [ $OSTCOUNT -gt 2 ]; then
		echo "To simulate f3 lost MDT-object and OST-object2"
		do_facet mds1 $LCTL set_param fail_val=2
		rm -f $DIR/$tdir/a1/f3
	fi

	umount_client $MOUNT
	# NOTE(review): confirm whether the unlinks need to be flushed
	# (e.g. sync plus a short delay) before fail_loc is cleared.
	do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(2) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(3) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	if [ $OSTCOUNT -gt 2 ]; then
		[ $repaired -eq 9 ] ||
		error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
	else
		[ $repaired -eq 4 ] ||
		error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
	fi

	mount_client $MOUNT || error "(5.0) Fail to start client!"

	LOV_PATTERN_F_HOLE=0x40000000

	#
	# ${fid0}-R-0 is the old f0
	#
	local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
	echo "Check $name, which is the old f0"

	$LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"

	local pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(5.2) NOT expect pattern flag hole, but got $pattern"

	local stripes=$($LFS getstripe -c $name)
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(5.3.1) expect the stripe count is 3, but got $stripes"
	else
		[ $stripes -eq 2 ] ||
		error "(5.3.2) expect the stripe count is 2, but got $stripes"
	fi

	local size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(5.4) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null || error "(5.5) cannot read $name"

	echo "dummy" >> $name || error "(5.6) cannot write $name"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"

	touch $name || error "(5.8) cannot touch $name"

	rm -f $name || error "(5.9) cannot unlink $name"

	#
	# ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
	if [ $OSTCOUNT -gt 2 ]; then
		echo "Check $name, it contains the old f1's stripe1 and stripe2"
	else
		echo "Check $name, it contains the old f1's stripe1"
	fi

	$LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(6.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(6.3.1) expect the stripe count is 3, but got $stripes"
	else
		[ $stripes -eq 2 ] ||
		error "(6.3.2) expect the stripe count is 2, but got $stripes"
	fi

	size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(6.4) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null && error "(6.5) normal read $name should fail"

	# conv=sync,noerror makes dd continue past read errors and pad the
	# failed blocks, so each unreadable block yields one error line.
	local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			 bs=4096 2>&1 | grep -c "Input/output error")

	[ $failures -eq 256 ] ||
		error "(6.6) expect 256 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(6.7) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
		error "(6.8) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
		error "(6.9) write to normal stripe should NOT fail"

	echo "foo" >> $name && error "(6.10) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"

	touch $name || error "(6.12) cannot touch $name"

	rm -f $name || error "(6.13) cannot unlink $name"

	#
	# ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
	if [ $OSTCOUNT -gt 2 ]; then
		echo "Check $name, it contains the old f2's stripe0 and stripe2"
	else
		echo "Check $name, it contains the old f2's stripe0"
	fi

	$LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(7.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	size=$(stat $name | awk '/Size:/ { print $2 }')
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(7.3.1) expect the stripe count is 3, but got $stripes"

		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.4.1) expect size $((4096 * $bcount)), but got $size"

		cat $name > /dev/null &&
			error "(7.5.1) normal read $name should fail"

		failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			   bs=4096 2>&1 | grep -c "Input/output error")

		[ $failures -eq 256 ] ||
		error "(7.6) expect 256 IO failures, but get $failures"

		size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.7) expect the size $((4096 * $bcount)), but got $size"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=300 && error "(7.8.0) write to the LOV EA hole should fail"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
		error "(7.8.1) write to normal stripe should NOT fail"

		echo "foo" >> $name &&
			error "(7.8.3) append write $name should fail"

		chown $RUNAS_ID:$RUNAS_GID $name ||
			error "(7.9.1) cannot chown on $name"

		touch $name || error "(7.10.1) cannot touch $name"
	else
		[ $stripes -eq 2 ] ||
		error "(7.3.2) expect the stripe count is 2, but got $stripes"

		[ $size -eq $((4096 * (256 + 0))) ] ||
		error "(7.4.2) expect the size $((4096 * 256)), but got $size"

		cat $name > /dev/null &&
			error "(7.5.2) normal read $name should fail"

		failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			   bs=4096 2>&1 | grep -c "Input/output error")
		[ $failures -eq 256 ] ||
		error "(7.6.2) expect 256 IO failures, but get $failures"

		size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.7.2) expect the size $((4096 * $bcount)), got $size"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=256 && error "(7.8.2) write to the LOV EA hole should fail"

		chown $RUNAS_ID:$RUNAS_GID $name ||
			error "(7.9.2) cannot chown on $name"

		touch $name || error "(7.10.2) cannot touch $name"
	fi

	rm -f $name || error "(7.11) cannot unlink $name"

	# f3 only exists when there are more than two OSTs.
	[ $OSTCOUNT -le 2 ] && return

	#
	# ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
	echo "Check $name, which contains the old f3's stripe0 and stripe1"

	$LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	[ $stripes -eq 3 ] ||
		error "(8.3) expect the stripe count is 3, but got $stripes"

	size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * (256 + 256 + 0))) ] ||
		error "(8.4) expect the size $((4096 * 512)), but got $size"

	cat $name > /dev/null &&
		error "(8.5) normal read $name should fail"

	failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
		   bs=4096 2>&1 | grep -c "Input/output error")

	[ $failures -eq 256 ] ||
		error "(8.6) expect 256 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(8.7) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=512 && error "(8.8) write to the LOV EA hole should fail"

	chown $RUNAS_ID:$RUNAS_GID $name ||
		error "(8.9) cannot chown on $name"

	touch $name || error "(8.10) cannot touch $name"

	rm -f $name || error "(8.11) cannot unlink $name"
}
run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# test_20b: PFL variant of test_20a.  Three two-component PFL files (2M, then
# to EOF; two stripes per component) lose their MDT-object; f1 also loses the
# first and f2 the second OST-object of every component.  Layout LFSCK run
# with "-r -o" must re-create them as
# $MOUNT/.lustre/lost+found/MDT0000/<fid>-R-0 with the original component
# extents, using pattern $PATTERN_WITH_HOLE where a stripe is missing.
#
# Globals read: OSTCOUNT, MDSCOUNT, FILESET, SINGLEMDS, DIR, MOUNT, tdir, LFS,
#		LCTL, START_LAYOUT, PATTERN_WITH_HOLE, PATTERN_WITHOUT_HOLE,
#		RUNAS_ID, RUNAS_GID
test_20b() {
	local k

	if [ $OSTCOUNT -lt 2 ]; then
		skip "needs >= 2 OSTs"
		return 0
	fi
	# Return after skipping, exactly like the OST-count guard; without
	# it the test body would still run with FILESET set.
	if [ -n "$FILESET" ]; then
		skip "Not functional for FILESET set"
		return 0
	fi

	echo "The target MDT-object and some of its OST-object are lost."
	echo "The LFSCK should find out the left OST-objects and re-create"
	echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
	echo "with the partial OST-objects (LOV EA hole)."

	echo "New client can access the file with LOV EA hole via normal"
	echo "system tools or commands without crash the system - PFL case."

	check_mount_and_prep

	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
		error "(0) Fail to create PFL file $DIR/$tdir/f0"
	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
		error "(1) Fail to create PFL file $DIR/$tdir/f1"
	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
		error "(2) Fail to create PFL file $DIR/$tdir/f2"

	# Number of 4096-byte blocks written to each file: 3M plus one block.
	local bcount=$((256 * 3 + 1))

	dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount

	# The FIDs name the files LFSCK re-creates under lost+found.
	local fid0=$($LFS path2fid $DIR/$tdir/f0)
	local fid1=$($LFS path2fid $DIR/$tdir/f1)
	local fid2=$($LFS path2fid $DIR/$tdir/f2)

	$LFS getstripe $DIR/$tdir/f0
	$LFS getstripe $DIR/$tdir/f1
	$LFS getstripe $DIR/$tdir/f2

	cancel_lru_locks mdc
	cancel_lru_locks osc

	# Each injection is followed by the unlink of the file it targets,
	# the same sequence test_20a uses; setting fail_loc alone removes
	# nothing, so LFSCK would find no orphans to repair.
	echo "Inject failure..."
	echo "To simulate f0 lost MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/f0

	echo "To simulate the case of f1 lost MDT-object and "
	echo "the first OST-object in each PFL component"
	#define OBD_FAIL_LFSCK_LOST_SPEOBJ	0x161a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
	rm -f $DIR/$tdir/f1

	echo "To simulate the case of f2 lost MDT-object and "
	echo "the second OST-object in each PFL component"
	do_facet $SINGLEMDS $LCTL set_param fail_val=1
	rm -f $DIR/$tdir/f2

	# NOTE(review): confirm whether the unlinks need to be flushed
	# (e.g. sync plus a short delay) before fail_loc is cleared.
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(4) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(5) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	[ $repaired -eq 8 ] ||
		error "(6) Expect 8 fixed on mds1, but got: $repaired"

	#
	# ${fid0}-R-0 is the old f0
	#
	local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
	echo "Check $name, which is the old f0"

	$LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"

	local pattern=$($LFS getstripe -L -I1 $name)
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(7.2.1) NOT expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 $name)
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(7.2.2) NOT expect pattern flag hole, but got $pattern"

	local stripes=$($LFS getstripe -c -I1 $name)
	[ $stripes -eq 2 ] ||
		error "(7.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 $name)
	[ $stripes -eq 2 ] ||
		error "(7.3.2) expect 2 stripes, but got $stripes"

	local e_start=$($LFS getstripe -I1 $name |
			awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 0 ] ||
		error "(7.4.1) expect the COMP1 start at 0, got $e_start"

	local e_end=$($LFS getstripe -I1 $name |
		      awk '/lcme_extent.e_end:/ { print $2 }')
	[ $e_end -eq 2097152 ] ||
		error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 2097152 ] ||
		error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"

	local size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.6) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null || error "(7.7) cannot read $name"

	echo "dummy" >> $name || error "(7.8) cannot write $name"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"

	touch $name || error "(7.10) cannot touch $name"

	rm -f $name || error "(7.11) cannot unlink $name"

	#
	# ${fid1}-R-0 contains the old f1's second stripe in each COMP
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
	echo "Check $name, it contains f1's second OST-object in each COMP"

	$LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L -I1 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2.1) expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2.2) expect pattern flag hole, but got $pattern"

	# Label is 8.3.1 for COMP1 so the two stripe-count checks can be
	# told apart in a failure report.
	stripes=$($LFS getstripe -c -I1 $name)
	[ $stripes -eq 2 ] ||
		error "(8.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 $name)
	[ $stripes -eq 2 ] ||
		error "(8.3.2) expect 2 stripes, but got $stripes"

	e_start=$($LFS getstripe -I1 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 0 ] ||
		error "(8.4.1) expect the COMP1 start at 0, got $e_start"

	e_end=$($LFS getstripe -I1 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ $e_end -eq 2097152 ] ||
		error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 2097152 ] ||
		error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"

	size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(8.6) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null && error "(8.7) normal read $name should fail"

	# conv=sync,noerror makes dd continue past read errors and pad the
	# failed blocks, so each unreadable block yields one error line.
	local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			 bs=4096 2>&1 | grep -c "Input/output error")

	# The first stripe in each COMP was lost
	[ $failures -eq 512 ] ||
		error "(8.8) expect 512 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(8.9) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
		error "(8.10) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
		error "(8.11) write to normal stripe should NOT fail"

	echo "foo" >> $name && error "(8.12) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"

	touch $name || error "(8.14) cannot touch $name"

	rm -f $name || error "(8.15) cannot unlink $name"

	#
	# ${fid2}-R-0 contains the old f2's first stripe in each COMP
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
	echo "Check $name, it contains f2's first stripe in each COMP"

	$LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L -I1 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(9.2.1) expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(9.2.2) expect pattern flag hole, but got $pattern"

	# Label is 9.3.1 for COMP1 so the two stripe-count checks can be
	# told apart in a failure report.
	stripes=$($LFS getstripe -c -I1 $name)
	[ $stripes -eq 2 ] ||
		error "(9.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 $name)
	[ $stripes -eq 2 ] ||
		error "(9.3.2) expect 2 stripes, but got $stripes"

	e_start=$($LFS getstripe -I1 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 0 ] ||
		error "(9.4.1) expect the COMP1 start at 0, got $e_start"

	e_end=$($LFS getstripe -I1 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ $e_end -eq 2097152 ] ||
		error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 2097152 ] ||
		error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"

	size=$(stat $name | awk '/Size:/ { print $2 }')
	# The second stripe in COMP was lost, so we do not know there
	# have ever been some data before. 'stat' will regard it as
	# no data on the lost stripe.
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(9.6) expect size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null &&
		error "(9.7) normal read $name should fail"

	failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
		   bs=4096 2>&1 | grep -c "Input/output error")
	# The message now states the same count (512) the check enforces.
	[ $failures -eq 512 ] ||
		error "(9.8) expect 512 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	# The second stripe in COMP was lost, so we do not know there
	# have ever been some data before. Since 'dd' skip failure,
	# it will regard the lost stripe contains data.
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(9.9) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=300 && error "(9.10) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
		error "(9.11) write to normal stripe should NOT fail"

	echo "foo" >> $name &&
		error "(9.12) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name ||
		error "(9.13) cannot chown on $name"

	touch $name || error "(9.14) cannot touch $name"

	rm -f $name || error "(9.15) cannot unlink $name"
}
run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# test_21: lfsck_start without "-t" must start both the namespace and the
# layout component; both are expected in 'scanning-phase1' while the scan is
# started with "-s 1", after which lfsck_stop without "-t" stops them.
#
# Globals read: SINGLEMDS, FSNAME, DIR, tdir, LCTL, SHOW_NAMESPACE, SHOW_LAYOUT
test_21() {
	local mdt=${FSNAME}-MDT0000
	local state

	if [[ $(lustre_version_code $SINGLEMDS) -lt \
	      $(version_code 2.5.59) ]]; then
		skip "ignore the test if MDS is older than 2.5.59" && return
	fi

	check_mount_and_prep
	if ! createmany -o $DIR/$tdir/f 100; then
		error "(0) Fail to create 100 files"
	fi

	echo "Start all LFSCK components by default (-s 1)"
	if ! do_facet mds1 $LCTL lfsck_start -M $mdt -s 1 -r; then
		error "Fail to start LFSCK"
	fi

	echo "namespace LFSCK should be in 'scanning-phase1' status"
	state=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
	if [ "$state" != "scanning-phase1" ]; then
		error "Expect namespace 'scanning-phase1', but got '$state'"
	fi

	echo "layout LFSCK should be in 'scanning-phase1' status"
	state=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
	if [ "$state" != "scanning-phase1" ]; then
		error "Expect layout 'scanning-phase1', but got '$state'"
	fi

	echo "Stop all LFSCK components by default"
	if ! do_facet mds1 $LCTL lfsck_stop -M $mdt; then
		error "Fail to stop LFSCK"
	fi
}
run_test 21 "run all LFSCK components by default"
# test_22a: directory foo/dummy is created on MDT0 under fail_loc 0x161e so
# that its ".." entry references "guard" instead of its real parent; "guard"
# is then removed.  Namespace LFSCK ("-A -r") must report exactly one
# unmatched pair repaired, after which listing the directory works.
#
# Globals read: MDSCOUNT, SINGLEMDS, DIR, tdir, LFS, LCTL, START_NAMESPACE,
#		SHOW_NAMESPACE
test_22a() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	# The inner quotes are escaped: left bare they end the string, and
	# the banner prints .. without the quotes it is meant to show.
	echo "The parent_A references the child directory via some name entry,"
	echo "but the child directory back references another parent_B via its"
	echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
	echo "LFSCK will repair the child directory's \"..\" name entry."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "The dummy's dotdot name entry references the guard."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "'ls' should success after namespace LFSCK repairing"
	ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
		error "(8) ls should success."
}
run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b: like test_22a, but "guard" is kept, so the parent that dummy's
# ".." entry references exists while holding no name entry for dummy.
# fid2path on dummy's FID must not resolve to foo/dummy before the repair;
# after namespace LFSCK ("-A -r") reports one unmatched pair repaired it must.
#
# Globals read: MDSCOUNT, SINGLEMDS, DIR, tdir, LFS, LCTL, START_NAMESPACE,
#		SHOW_NAMESPACE
test_22b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	# The inner quotes are escaped: left bare they end the string, and
	# the banner prints .. without the quotes it is meant to show.
	echo "The parent_A references the child directory via the name entry_B,"
	echo "but the child directory back references another parent_C via its"
	echo "\"..\" name entry. The parent_C exists, but there is no the name"
	echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
	echo "the child directory's \"..\" name entry and its linkEA."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "and bad linkEA. The dummy's dotdot name entry references the"
	echo "guard. The dummy's linkEA references n non-exist name entry."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
	echo "fid2path should NOT work on the dummy's FID $dummyfid"
	local dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
		error "(4) fid2path works unexpectedly."

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
	# dummyname is already local; plain assignment is enough here.
	dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
		error "(8) fid2path does not work"
}
run_test 22b "LFSCK can repair unmatched pairs (2)"
# test_23a: d0/d1 (d1 on MDT1) is removed under fail_loc 0x1620 on mds2,
# leaving a dangling name entry.  A first namespace LFSCK ("-A -r") must count
# one dangling entry while "ls" keeps failing; a second run with "-C" must
# make "ls" succeed.
#
# Globals read: MDSCOUNT, DIR, tdir, LFS, LCTL, START_NAMESPACE, SHOW_NAMESPACE
test_23a() {
	local parent=$DIR/$tdir/d0
	local victim=$DIR/$tdir/d0/d1
	local fixed

	if [ $MDSCOUNT -lt 2 ]; then
		skip "needs >= 2 MDTs" && return
	fi

	echo "The name entry is there, but the MDT-object for such name "
	echo "entry does not exist. The namespace LFSCK should find out "
	echo "and repair the inconsistency as required."

	check_mount_and_prep

	if ! $LFS mkdir -i 0 $parent; then
		error "(1) Fail to mkdir d0 on MDT0"
	fi
	if ! $LFS mkdir -i 1 $victim; then
		error "(2) Fail to mkdir d1 on MDT1"
	fi

	echo "Inject failure stub on MDT1 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING2	0x1620
	do_facet mds2 $LCTL set_param fail_loc=0x1620
	if ! rmdir $victim; then
		error "(3) Fail to rmdir d1"
	fi
	do_facet mds2 $LCTL set_param fail_loc=0

	echo "'ls' should fail because of dangling name entry"
	if ls -ail $victim > /dev/null 2>&1; then
		error "(4) ls should fail."
	fi

	echo "Trigger namespace LFSCK to find out dangling name entry"
	if ! $START_NAMESPACE -A -r; then
		error "(5) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 6

	fixed=$($SHOW_NAMESPACE | awk '/^dangling_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(7) Fail to repair dangling name entry: $fixed"
	fi

	echo "'ls' should fail because not re-create MDT-object by default"
	if ls -ail $victim > /dev/null 2>&1; then
		error "(8) ls should fail."
	fi

	echo "Trigger namespace LFSCK again to repair dangling name entry"
	if ! $START_NAMESPACE -A -r -C; then
		error "(9) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 10

	fixed=$($SHOW_NAMESPACE | awk '/^dangling_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(11) Fail to repair dangling name entry: $fixed"
	fi

	echo "'ls' should success after namespace LFSCK repairing"
	if ! ls -ail $victim > /dev/null; then
		error "(12) ls should success."
	fi
}
run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b: dangling name entry that LFSCK first re-creates and later replaces.
# As the statements below show, "foo" is hard-linked to f0 while fail_loc
# 0x1621 is armed with fail_val set to f1's OID, and f1 is then unlinked so
# that listing "foo" fails. After a namespace LFSCK run the test requires
# dangling_repaired == 1, multiple_linked_repaired == 1, and that "foo" reads
# back the content originally written to f0 ("dummy").
3823 echo "The objectA has multiple hard links, one of them corresponding"
3824 echo "to the name entry_B. But there is something wrong for the name"
3825 echo "entry_B and cause entry_B to references non-exist object_C."
3826 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3827 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3828 echo "comes to the second-stage scanning, it will find that the"
3829 echo "former re-creating object_C is not proper, and will try to"
3830 echo "replace the object_C with the real object_A."
3833 check_mount_and_prep
# Work directory on MDT index 0; the bare path2fid calls only print the FID
# into the test log.
3835 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3836 $LFS path2fid $DIR/$tdir/d0
# Ten filler files t0..t9; they are unlinked again before f1 is removed
# (see the LU-7429 comment further down).
3838 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3840 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3841 $LFS path2fid $DIR/$tdir/d0/f0
3843 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3844 $LFS path2fid $DIR/$tdir/d0/f1
# First ':'-separated field of each FID string, compared below.
3846 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3847 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
3849 if [ "$SEQ0" != "$SEQ1" ]; then
3850 # To guarantee that the f0 and f1 are in the same FID seq
3851 rm -f $DIR/$tdir/d0/f0 ||
3852 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3853 echo "dummy" > $DIR/$tdir/d0/f0 ||
3854 error "(3.2) Fail to touch on MDT0"
3855 $LFS path2fid $DIR/$tdir/d0/f0
# Second ':'-separated field of f1's FID, normalised to decimal by printf %d
# before it is handed to fail_val.
3858 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3859 OID=$(printf %d $OID)
3861 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3862 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3863 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3864 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3865 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3867 # If there is creation after the dangling injection, it may re-use
3868 # the just released local object (inode) that is referenced by the
3869 # dangling name entry. It will fail the dangling injection.
3870 # So before deleting the target object for the dangling name entry,
3871 # remove some other objects to avoid the target object being reused
3872 # by some potential creations. LU-7429
3873 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3875 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3877 echo "'ls' should fail because of dangling name entry"
3878 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3879 error "(6) ls should fail."
3881 echo "Trigger namespace LFSCK to find out dangling name entry"
3882 $START_NAMESPACE -r -C ||
3883 error "(7) Fail to start LFSCK for namespace"
# Poll the MDS-side lfsck_namespace "status" field until it reads "completed";
# 32 is the last argument to wait_update_facet (presumably a timeout in
# seconds -- confirm against the helper).
3885 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3886 mdd.${MDT_DEV}.lfsck_namespace |
3887 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3889 error "(8) unexpected status"
# Exactly one dangling entry must have been repaired ...
3892 local repaired=$($SHOW_NAMESPACE |
3893 awk '/^dangling_repaired/ { print $2 }')
3894 [ $repaired -eq 1 ] ||
3895 error "(9) Fail to repair dangling name entry: $repaired"
# ... and exactly one multiple-linked repair must have been counted.
3897 repaired=$($SHOW_NAMESPACE |
3898 awk '/^multiple_linked_repaired/ { print $2 }')
3899 [ $repaired -eq 1 ] ||
3900 error "(10) Fail to drop the former created object: $repaired"
# "foo" must now expose the data that was written to f0.
3902 local data=$(cat $DIR/$tdir/d0/foo)
3903 [ "$data" == "dummy" ] ||
3904 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3906 run_test 23b "LFSCK can repair dangling name entry (2)"
# Cleanup statements that precede test 23c in this listing: reset
# fail_val/fail_loc on the MDS, wait for the namespace LFSCK status to read
# "completed" (32 is the last argument to wait_update_facet, presumably a
# timeout in seconds -- confirm against the helper), then stop full debug
# logging.
3909 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3910 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3911 mdd.${MDT_DEV}.lfsck_namespace |
3912 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3914 error "(10) unexpected status"
3917 stop_full_debug_logging
# Test 23c: same dangling-entry injection as 23b (fail_loc 0x1621 with
# fail_val = OID of f1), but the LFSCK is slowed with fail_loc 0x1602
# (OBD_FAIL_LFSCK_DELAY3, fail_val=10) so the test can append data to the
# re-created object "foo" before the scan finishes. The test then requires
# dangling_repaired == 1 and that "foo" does NOT read back as "dummy".
3922 echo "The objectA has multiple hard links, one of them corresponding"
3923 echo "to the name entry_B. But there is something wrong for the name"
3924 echo "entry_B and cause entry_B to references non-exist object_C."
3925 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3926 echo "as dangling, and re-create the lost object_C. And then others"
3927 echo "modified the re-created object_C. When the LFSCK comes to the"
3928 echo "second-stage scanning, it will find that the former re-creating"
3929 echo "object_C maybe wrong and try to replace the object_C with the"
3930 echo "real object_A. But because object_C has been modified, so the"
3931 echo "LFSCK cannot replace it."
3934 start_full_debug_logging
3936 check_mount_and_prep
3938 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
# FIDs are captured (and echoed for the log) so they can be compared below;
# they are function-local so they do not leak into later tests.
3939 local parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
3940 echo "parent_fid=$parent_fid"
3942 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3944 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3945 local f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
3946 echo "f0_fid=$f0_fid"
3948 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3949 local f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
3950 echo "f1_fid=$f1_fid"
# Compare everything before the first ':' of each FID (the sequence part).
# The previous form referenced the never-assigned names fid_f0/fid_f1 and
# used a regex-looking pattern inside a glob substitution, so both sides were
# always empty and this guard could never trigger.
3952 if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
3953 # To guarantee that the f0 and f1 are in the same FID seq
3954 rm -f $DIR/$tdir/d0/f0 ||
3955 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3956 echo "dummy" > $DIR/$tdir/d0/f0 ||
3957 error "(3.2) Fail to touch on MDT0"
3958 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
3959 echo "f0_fid=$f0_fid (replaced)"
# Second ':'-separated field of f1's FID; print (not printf) so the FID text
# is never interpreted as a format string.
3962 local oid=$(awk -F':' '{ print $2 }' <<< "$f1_fid")
3964 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3965 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3966 do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
3967 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3968 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3970 # If there is creation after the dangling injection, it may re-use
3971 # the just released local object (inode) that is referenced by the
3972 # dangling name entry. It will fail the dangling injection.
3973 # So before deleting the target object for the dangling name entry,
3974 # remove some other objects to avoid the target object being reused
3975 # by some potential creations. LU-7429
3976 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3978 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3980 echo "'ls' should fail because of dangling name entry"
3981 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3982 error "(6) ls should fail."
3984 #define OBD_FAIL_LFSCK_DELAY3 0x1602
3985 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3987 echo "Trigger namespace LFSCK to find out dangling name entry"
3988 $START_NAMESPACE -r -C ||
3989 error "(7) Fail to start LFSCK for namespace"
# Wait until "foo" is visible with size 0, i.e. the re-created empty object.
# NOTE(review): $LTIME is not defined in the visible part of the file.
3991 wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
3992 # While unexpected by the test, it is valid for LFSCK to repair
3993 # the link to the original object before any data is written.
3994 local size=$(stat -c %s $DIR/$tdir/d0/foo)
3996 if [ "$size" = "6" -a "$(<$DIR/$tdir/d0/foo)" = "dummy" ]; then
3997 log "LFSCK repaired file prematurely"
4002 stat $DIR/$tdir/d0/foo
4004 error "(8) unexpected size"
# Modify the re-created object so that LFSCK must not replace it afterwards.
4007 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
4008 cancel_lru_locks osc
4012 local repaired=$($SHOW_NAMESPACE |
4013 awk '/^dangling_repaired/ { print $2 }')
4014 [ $repaired -eq 1 ] ||
4015 error "(11) Fail to repair dangling name entry: $repaired"
4017 local data=$(cat $DIR/$tdir/d0/foo)
4018 [ "$data" != "dummy" ] ||
4019 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
4021 run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24: multiple-referenced name entry. With fail_loc 0x1622
# (OBD_FAIL_LFSCK_MUL_REF) armed, d0/dummy/foo is created on MDT index 0 and
# its name entry removed again; the namespace LFSCK must then report
# multiple_referenced_repaired == 1 and an entry named "<cfid>-<pfid>-D-0"
# must be listable under .lustre/lost+found/MDT0000/.
4024 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Leave the test after skipping, exactly like the MDSCOUNT guard above.
# NOTE(review): whether skip() itself terminates the test is not visible
# here; the explicit return is harmless either way.
4025 [ -n "$FILESET" ] && skip "Not functional for FILESET set" && return
4028 echo "Two MDT-objects back reference the same name entry via their"
4029 echo "each own linkEA entry, but the name entry only references one"
4030 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
4031 echo "for the MDT-object that is not recognized. If such MDT-object"
4032 echo "has no other linkEA entry after the removing, then the LFSCK"
4033 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
4036 check_mount_and_prep
# d0 lives on MDT index 1; guard/ and dummy/ are created below it and their
# FIDs are printed for the log.
4038 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4040 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
4041 $LFS path2fid $DIR/$tdir/d0/guard
4043 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
4044 $LFS path2fid $DIR/$tdir/d0/dummy
# The parent FID used in the expected orphan name depends on the backend
# type: one of the two assignments below is chosen by the fstype check.
4047 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
4048 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
4050 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
4053 touch $DIR/$tdir/d0/guard/foo ||
4054 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
4056 echo "Inject failure stub on MDT0 to simulate the case that"
4057 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
4058 echo "that references $DIR/$tdir/d0/guard/foo."
4059 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
4060 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
4061 echo "there with the same linkEA entry as another MDT-object"
4062 echo "$DIR/$tdir/d0/guard/foo has"
4064 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
4065 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
4066 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
4067 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
4068 $LFS path2fid $DIR/$tdir/d0/dummy/foo
# Remember the child FID before the name entry is removed; it is part of the
# orphan name checked at the end.
4069 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
4070 rmdir $DIR/$tdir/d0/dummy/foo ||
4071 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
4072 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4074 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4075 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4076 error "(6) stat successfully unexpectedly"
4078 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4079 $START_NAMESPACE -A -r ||
4080 error "(7) Fail to start LFSCK for namespace"
4082 wait_all_targets_blocked namespace completed 8
4084 local repaired=$($SHOW_NAMESPACE |
4085 awk '/^multiple_referenced_repaired/ { print $2 }')
4086 [ $repaired -eq 1 ] ||
4087 error "(9) Fail to repair multiple referenced name entry: $repaired"
4089 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4090 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4091 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Expected orphan name: child FID, parent FID, then the "-D-0" suffix.
4093 local cname="$cfid-$pfid-D-0"
4094 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4095 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4097 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25 (skipped unless the MDS backend is ldiskfs): a file is created
# while fail_loc 0x1623 (OBD_FAIL_LFSCK_BAD_TYPE) is armed; the namespace
# LFSCK must then report bad_file_type_repaired == 1 and the directory must
# still be listable.
4100 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4101 skip "ldiskfs only test" && return
4104 echo "The file type in the name entry does not match the file type"
4105 echo "claimed by the referenced object. Then the LFSCK will update"
4106 echo "the file type in the name entry."
4109 check_mount_and_prep
4111 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4113 echo "Inject failure stub on MDT0 to simulate the case that"
4114 echo "the file type stored in the name entry is wrong."
# The fail_loc is armed only around the single touch below.
4116 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
4117 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4118 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4119 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4121 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4122 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Poll the MDS-side lfsck_namespace "status" field until it reads "completed";
# 32 is the last argument to wait_update_facet (presumably a timeout in
# seconds -- confirm against the helper).
4124 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4125 mdd.${MDT_DEV}.lfsck_namespace |
4126 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4128 error "(4) unexpected status"
4131 local repaired=$($SHOW_NAMESPACE |
4132 awk '/^bad_file_type_repaired/ { print $2 }')
4133 [ $repaired -eq 1 ] ||
4134 error "(5) Fail to repair bad file type in name entry: $repaired"
4136 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4138 run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a: lost local name entry. "foo" (which also has a hard link "dummy")
# is unlinked while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed;
# after the namespace LFSCK the test requires lost_dirent_repaired == 1, that
# "foo" is listable again, and that its FID is unchanged.
4142 echo "The local name entry back referenced by the MDT-object is lost."
4143 echo "The namespace LFSCK will add the missing local name entry back"
4144 echo "to the normal namespace."
4147 check_mount_and_prep
4149 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4150 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# FID before the injection; compared with the FID after the repair.
4151 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4153 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4154 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4156 echo "Inject failure stub on MDT0 to simulate the case that"
4157 echo "foo's name entry will be removed, but the foo's object"
4158 echo "and its linkEA are kept in the system."
4160 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4161 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4162 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4163 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4165 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4166 error "(5) 'ls' should fail"
# This test exercises the local name entry (d0 and foo are both on MDT index
# 0), so the progress message says "local", matching the banner above.
4168 echo "Trigger namespace LFSCK to repair the missing local name entry"
4169 $START_NAMESPACE -r -A ||
4170 error "(6) Fail to start LFSCK for namespace"
4172 wait_all_targets_blocked namespace completed 7
4174 local repaired=$($SHOW_NAMESPACE |
4175 awk '/^lost_dirent_repaired/ { print $2 }')
4176 [ $repaired -eq 1 ] ||
4177 error "(8) Fail to repair lost dirent: $repaired"
4179 ls -ail $DIR/$tdir/d0/foo ||
4180 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
4182 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4183 [ "$foofid" == "$foofid2" ] ||
4184 error "(10) foo's FID changed: $foofid, $foofid2"
4186 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b (needs at least two MDTs): lost remote name entry. Directory "foo"
# is created on MDT index 0 below d0 on MDT index 1, then removed while
# fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed. After the namespace
# LFSCK the test requires lost_dirent_repaired == 1, that "foo" is listable
# again, and that its FID is unchanged.
4189 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4192 echo "The remote name entry back referenced by the MDT-object is lost."
4193 echo "The namespace LFSCK will add the missing remote name entry back"
4194 echo "to the normal namespace."
4197 check_mount_and_prep
4199 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4200 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# FID before the injection; compared with the FID after the repair.
4201 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4203 echo "Inject failure stub on MDT0 to simulate the case that"
4204 echo "foo's name entry will be removed, but the foo's object"
4205 echo "and its linkEA are kept in the system."
4207 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4208 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4209 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4210 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4212 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4213 error "(4) 'ls' should fail"
4215 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4216 $START_NAMESPACE -r -A ||
4217 error "(5) Fail to start LFSCK for namespace"
4219 wait_all_targets_blocked namespace completed 6
4221 local repaired=$($SHOW_NAMESPACE |
4222 awk '/^lost_dirent_repaired/ { print $2 }')
4223 [ $repaired -eq 1 ] ||
4224 error "(7) Fail to repair lost dirent: $repaired"
4226 ls -ail $DIR/$tdir/d0/foo ||
4227 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
4229 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4230 [ "$foofid" == "$foofid2" ] ||
4231 error "(9) foo's FID changed: $foofid, $foofid2"
4233 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a: lost local parent. "foo" and its hard link "dummy" are unlinked
# while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed, then the
# parent directory d0 is removed. After the namespace LFSCK the test requires
# lost_dirent_repaired == 1 and at least one "*-P-*" entry under
# .lustre/lost+found/MDT0000/.
# Leave the test after skipping, like the other skip guards in this file.
# NOTE(review): whether skip() itself terminates the test is not visible
# here; the explicit return is harmless either way.
4236 [ -n "$FILESET" ] && skip "Not functional for FILESET set" && return
4239 echo "The local parent referenced by the MDT-object linkEA is lost."
4240 echo "The namespace LFSCK will re-create the lost parent as orphan."
4243 check_mount_and_prep
4245 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4246 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4247 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4248 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4250 echo "Inject failure stub on MDT0 to simulate the case that"
4251 echo "foo's name entry will be removed, but the foo's object"
4252 echo "and its linkEA are kept in the system. And then remove"
4253 echo "another hard link and the parent directory."
# Both unlinks happen while the fail_loc is armed.
4255 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4256 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4257 rm -f $DIR/$tdir/d0/foo ||
4258 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4259 rm -f $DIR/$tdir/d0/dummy ||
4260 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4261 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4263 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
4264 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
4266 echo "Trigger namespace LFSCK to repair the lost parent"
4267 $START_NAMESPACE -r -A ||
4268 error "(6) Fail to start LFSCK for namespace"
4270 wait_all_targets_blocked namespace completed 7
4272 local repaired=$($SHOW_NAMESPACE |
4273 awk '/^lost_dirent_repaired/ { print $2 }')
4274 [ $repaired -eq 1 ] ||
4275 error "(8) Fail to repair lost dirent: $repaired"
4277 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4278 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4279 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4281 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so find receives it verbatim; unquoted, the shell
# could expand it against files in the current working directory first.
4283 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-P-*")
4284 [ ! -z "$cname" ] ||
4285 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4287 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b (needs at least two MDTs): lost remote parent. Directory "foo" on
# MDT index 0 is removed from d0 (MDT index 1) while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed, then d0 itself is removed. After the
# namespace LFSCK the test requires lost_dirent_repaired == 1 and at least
# one "*-P-*" entry under .lustre/lost+found/MDT0001/.
4290 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Leave the test after skipping, exactly like the MDSCOUNT guard above.
# NOTE(review): whether skip() itself terminates the test is not visible
# here; the explicit return is harmless either way.
4291 [ -n "$FILESET" ] && skip "Not functional for FILESET set" && return
4294 echo "The remote parent referenced by the MDT-object linkEA is lost."
4295 echo "The namespace LFSCK will re-create the lost parent as orphan."
4298 check_mount_and_prep
4300 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4301 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Printed for the test log only.
4303 $LFS path2fid $DIR/$tdir/d0
4305 echo "Inject failure stub on MDT0 to simulate the case that"
4306 echo "foo's name entry will be removed, but the foo's object"
4307 echo "and its linkEA are kept in the system. And then remove"
4308 echo "the parent directory."
4310 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4311 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4312 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4313 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4315 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4316 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
4318 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4319 $START_NAMESPACE -r -A ||
4320 error "(6) Fail to start LFSCK for namespace"
4322 wait_all_targets_blocked namespace completed 7
4324 local repaired=$($SHOW_NAMESPACE |
4325 awk '/^lost_dirent_repaired/ { print $2 }')
4326 [ $repaired -eq 1 ] ||
4327 error "(8) Fail to repair lost dirent: $repaired"
4329 ls -ail $MOUNT/.lustre/lost+found/
4331 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4332 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4333 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4335 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The pattern is quoted so find receives it verbatim; unquoted, the shell
# could expand it against files in the current working directory first.
4337 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-P-*")
4338 [ ! -z "$cname" ] ||
4339 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4341 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28 (needs at least two MDTs): orphan handling when one MDT failed
# during the first scanning stage. Two cross-MDT directory trees are built,
# d1/a1 and d2/a2 are removed while fail_loc 0x1624 is armed on mds1 and
# mds2, and a first LFSCK run is made with fail_loc 0x161c
# (OBD_FAIL_LFSCK_BAD_NETWORK) armed on mds2. That run must end "partial" on
# mds1 with 0 lost dirents repaired and "completed" on mds2 with 1 repaired.
# A second, undisturbed run must then repair 1 on mds1 and 0 on mds2.
4344 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4347 echo "The target name entry is lost. The LFSCK should insert the"
4348 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4349 echo "the MDT (on which the orphan MDT-object resides) has ever"
4350 echo "failed to respond some name entry verification during the"
4351 echo "first stage-scanning, then the LFSCK should skip to handle"
4352 echo "orphan MDT-object on this MDT. But other MDTs should not"
4356 check_mount_and_prep
# d1 on MDT index 0 with children on index 1; d2 on index 1 with children on
# index 0.
4357 $LFS mkdir -i 0 $DIR/$tdir/d1
4358 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4359 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
4361 $LFS mkdir -i 1 $DIR/$tdir/d2
4362 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4363 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4365 echo "Inject failure stub on MDT0 to simulate the case that"
4366 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4367 echo "and its linkEA are kept in the system. And the case that"
4368 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4369 echo "and its linkEA are kept in the system."
4371 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4372 do_facet mds1 $LCTL set_param fail_loc=0x1624
4373 do_facet mds2 $LCTL set_param fail_loc=0x1624
4374 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4375 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4376 do_facet mds1 $LCTL set_param fail_loc=0
4377 do_facet mds2 $LCTL set_param fail_loc=0
4379 cancel_lru_locks mdc
4380 cancel_lru_locks osc
4382 echo "Inject failure, to simulate the MDT0 fail to handle"
4383 echo "MDT1 LFSCK request during the first-stage scanning."
4384 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
4385 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4387 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4388 $START_NAMESPACE -r -A ||
4389 error "(3) Fail to start LFSCK for namespace"
# First run: mds1 is expected to finish as "partial", mds2 as "completed".
4391 wait_update_facet mds1 "$LCTL get_param -n \
4392 mdd.$(facet_svc mds1).lfsck_namespace |
4393 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4394 error "(4) mds1 is not the expected 'partial'"
4397 wait_update_facet mds2 "$LCTL get_param -n \
4398 mdd.$(facet_svc mds2).lfsck_namespace |
4399 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4400 error "(5) mds2 is not the expected 'completed'"
4403 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Per-MDT repair counters after the first run: 0 on mds1, 1 on mds2.
4405 local repaired=$(do_facet mds1 $LCTL get_param -n \
4406 mdd.$(facet_svc mds1).lfsck_namespace |
4407 awk '/^lost_dirent_repaired/ { print $2 }')
4408 [ $repaired -eq 0 ] ||
4409 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4411 repaired=$(do_facet mds2 $LCTL get_param -n \
4412 mdd.$(facet_svc mds2).lfsck_namespace |
4413 awk '/^lost_dirent_repaired/ { print $2 }')
4414 [ $repaired -eq 1 ] ||
4415 error "(7) Expect 1 fixed on mds2, but got: $repaired"
4417 echo "Trigger namespace LFSCK on all devices again to cleanup"
4418 $START_NAMESPACE -r -A ||
4419 error "(8) Fail to start LFSCK for namespace"
4421 wait_all_targets_blocked namespace completed 9
# Per-MDT repair counters after the second run: 1 on mds1, 0 on mds2.
4423 local repaired=$(do_facet mds1 $LCTL get_param -n \
4424 mdd.$(facet_svc mds1).lfsck_namespace |
4425 awk '/^lost_dirent_repaired/ { print $2 }')
4426 [ $repaired -eq 1 ] ||
4427 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4429 repaired=$(do_facet mds2 $LCTL get_param -n \
4430 mdd.$(facet_svc mds2).lfsck_namespace |
4431 awk '/^lost_dirent_repaired/ { print $2 }')
4432 [ $repaired -eq 0 ] ||
4433 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4435 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a (its run_test line at the end is commented out, so it is not
# registered): a hard link is created while fail_loc 0x1625
# (OBD_FAIL_LFSCK_MORE_NLINK) is armed, the test checks that stat then
# reports 3 links, and after the namespace LFSCK it requires
# nlinks_repaired == 1 and a link count of 2.
4439 echo "The object's nlink attribute is larger than the object's known"
4440 echo "name entries count. The LFSCK will repair the object's nlink"
4441 echo "attribute to match the known name entries count"
4444 check_mount_and_prep
4446 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4447 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4449 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4450 echo "nlink attribute is larger than its name entries count."
4452 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
4453 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4454 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4455 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4456 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop cached MDC locks before reading the link count, then verify that the
# injection produced the inflated value.
4458 cancel_lru_locks mdc
4459 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4460 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4462 echo "Trigger namespace LFSCK to repair the nlink count"
4463 $START_NAMESPACE -r -A ||
4464 error "(5) Fail to start LFSCK for namespace"
4466 wait_all_targets_blocked namespace completed 6
4468 local repaired=$($SHOW_NAMESPACE |
4469 awk '/^nlinks_repaired/ { print $2 }')
4470 [ $repaired -eq 1 ] ||
4471 error "(7) Fail to repair nlink count: $repaired"
4473 cancel_lru_locks mdc
4474 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4475 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4477 # Disable 29a, we only allow nlink to be updated if the known linkEA
4478 # entries is larger than nlink count.
4480 #run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b: a hard link is created while fail_loc 0x1626
# (OBD_FAIL_LFSCK_LESS_NLINK) is armed, the test checks that stat then
# reports 1 link, and after the namespace LFSCK it requires
# nlinks_repaired == 1 and a link count of 2.
4484 echo "The object's nlink attribute is smaller than the object's known"
4485 echo "name entries count. The LFSCK will repair the object's nlink"
4486 echo "attribute to match the known name entries count"
4489 check_mount_and_prep
4491 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4492 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4494 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4495 echo "nlink attribute is smaller than its name entries count."
4497 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
4498 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4499 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4500 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4501 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop cached MDC locks before reading the link count, then verify that the
# injection produced the deflated value.
4503 cancel_lru_locks mdc
4504 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4505 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4507 echo "Trigger namespace LFSCK to repair the nlink count"
4508 $START_NAMESPACE -r -A ||
4509 error "(5) Fail to start LFSCK for namespace"
4511 wait_all_targets_blocked namespace completed 6
4513 local repaired=$($SHOW_NAMESPACE |
4514 awk '/^nlinks_repaired/ { print $2 }')
4515 [ $repaired -eq 1 ] ||
4516 error "(7) Fail to repair nlink count: $repaired"
4518 cancel_lru_locks mdc
4519 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4520 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4522 run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c: linkEA size limitation. 150 hard links to guard/f0 are created in
# foo/, 100 of them are removed again, and the namespace LFSCK must then
# report linkea_overflow_cleared == 1 and nlinks_repaired == 0. When at least
# two MDTs exist, "lfs migrate -m 1" of guard/ is additionally expected to
# fail (FID of f0 unchanged) before the LFSCK run and to succeed (FID
# changed) after it.
4527 echo "The namespace LFSCK will create many hard links to the target"
4528 echo "file as to exceed the linkEA size limitation. Under such case"
4529 echo "the linkEA will be marked as overflow that will prevent the"
4530 echo "target file to be migrated. Then remove some hard links to"
4531 echo "make the left hard links to be held within the linkEA size"
4532 echo "limitation. But before the namespace LFSCK adding all the"
4533 echo "missed linkEA entries back, the overflow mark (timestamp)"
4534 echo "will not be cleared."
4537 check_mount_and_prep
# guard/ holds the target file; foo/ (on the last MDT index) holds the links.
4539 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
4540 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4541 error "(0.2) Fail to mkdir"
4542 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# Reference FID used to detect whether a migration actually happened.
4543 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4545 # define MAX_LINKEA_SIZE 4096
4546 # sizeof(link_ea_header) = 24
4547 # sizeof(link_ea_entry) = 18
4548 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4549 # (sizeof(link_ea_entry) + name_length))
4550 # If the average name length is 12 bytes, then 150 hard links
4551 # is totally enough to overflow the linkEA
4552 echo "Create 150 hard links should succeed although the linkEA overflow"
4553 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4554 error "(2) Fail to hard link"
4556 cancel_lru_locks mdc
4557 if [ $MDSCOUNT -ge 2 ]; then
4558 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4559 error "(3.1) Migrate should fail"
4561 echo "The object with linkEA overflow should NOT be migrated"
4562 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4563 [ "$newfid" == "$oldfid" ] ||
4564 error "(3.2) Migrate should fail: $newfid != $oldfid"
4567 # Remove 100 hard links, then the linkEA should have space
4568 # to hold the missed linkEA entries.
4569 echo "Remove 100 hard links to save space for the missed linkEA entries"
4570 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
4572 if [ $MDSCOUNT -ge 2 ]; then
4573 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4574 error "(5.1) Migrate should fail"
4576 # The overflow timestamp is still there, so migration will fail.
4577 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4578 [ "$newfid" == "$oldfid" ] ||
4579 error "(5.2) Migrate should fail: $newfid != $oldfid"
4582 # sleep 3 seconds to guarantee that the overflow is recognized
4585 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4586 $START_NAMESPACE -r -A ||
4587 error "(6) Fail to start LFSCK for namespace"
4589 wait_all_targets_blocked namespace completed 7
4591 local repaired=$($SHOW_NAMESPACE |
4592 awk '/^linkea_overflow_cleared/ { print $2 }')
4593 [ $repaired -eq 1 ] ||
4594 error "(8) Fail to clear linkea overflow: $repaired"
4596 repaired=$($SHOW_NAMESPACE |
4597 awk '/^nlinks_repaired/ { print $2 }')
4598 [ $repaired -eq 0 ] ||
4599 error "(9) Unexpected nlink repaired: $repaired"
4601 if [ $MDSCOUNT -ge 2 ]; then
4602 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4603 error "(10.1) Migrate failure"
4605 # Migration should succeed after clear the overflow timestamp.
4606 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4607 [ "$newfid" != "$oldfid" ] ||
4608 error "(10.2) Migrate should succeed"
4610 ls -l $DIR/$tdir/foo > /dev/null ||
4611 error "(11) 'ls' failed after migration"
# Final cleanup of the files created by this test.
4614 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4615 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4617 run_test 29c "verify linkEA size limitation"
# Test 30 (skipped unless the MDS backend is ldiskfs): recovery of orphans
# from the backend /lost+found. d0 is created while fail_loc 0x161d
# (OBD_FAIL_LFSCK_NO_LINKEA) is armed; d0/d1, d0/f1, d0 and f0 are then
# removed while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed. The
# client is unmounted, the MDT stopped, e2fsck -y run on the MDT device, and
# the MDT restarted. After the namespace LFSCK the test requires
# local_lost_found_moved >= 4, that foo/f0 can be stat'ed again, and that d0
# (with d1 and f1 inside) appears under .lustre/lost+found/MDT0000/ as
# "<cfid>-<pfid>-D-0".
4620 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4621 skip "ldiskfs only test" && return
# Leave the test after skipping, exactly like the fstype guard above.
# NOTE(review): whether skip() itself terminates the test is not visible
# here; the explicit return is harmless either way.
4622 [ -n "$FILESET" ] && skip "Not functional for FILESET set" && return
4625 echo "The namespace LFSCK will move the orphans from backend"
4626 echo "/lost+found directory to normal client visible namespace"
4627 echo "or to global visible .lustre/lost+found/MDTxxxx/ directory"
4630 check_mount_and_prep
4632 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
4633 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
4635 echo "Inject failure stub on MDT0 to simulate the case that"
4636 echo "directory d0 has no linkEA entry, then the LFSCK will"
4637 echo "move it into .lustre/lost+found/MDTxxxx/ later."
4639 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4640 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4641 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4642 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Parent and child FIDs make up the orphan name that is checked at the end.
4644 local pfid=$($LFS path2fid $DIR/$tdir/foo)
4645 local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)
4647 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4648 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4650 echo "Inject failure stub on MDT0 to simulate the case that the"
4651 echo "object's name entry will be removed, but not destroy the"
4652 echo "object. Then backend e2fsck will handle it as orphan and"
4653 echo "add them into the backend /lost+found directory."
4655 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4656 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4657 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4658 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4659 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4660 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4661 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Take the filesystem offline so e2fsck can run on the MDT device.
4663 umount_client $MOUNT || error "(10) Fail to stop client!"
4665 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
4668 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
4669 error "(12) Fail to run e2fsck"
4671 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
4672 error "(13) Fail to start MDT0"
4674 echo "Trigger namespace LFSCK to recover backend orphans"
4675 $START_NAMESPACE -r -A ||
4676 error "(14) Fail to start LFSCK for namespace"
4678 wait_all_targets_blocked namespace completed 15
4680 local repaired=$($SHOW_NAMESPACE |
4681 awk '/^local_lost_found_moved/ { print $2 }')
4682 [ $repaired -ge 4 ] ||
4683 error "(16) Fail to recover backend orphans: $repaired"
4685 mount_client $MOUNT || error "(17) Fail to start client!"
4687 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
4689 ls -ail $MOUNT/.lustre/lost+found/
4691 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4692 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4693 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4695 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
4697 local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
# cname is a freshly built path and is never empty, so test that the orphan
# directory really exists instead of testing the string for emptiness.
4698 [ -d "$cname" ] || error "(20) d0 is not recovered"
4700 stat ${cname}/d1 || error "(21) d1 is not recovered"
4701 stat ${cname}/f1 || error "(22) f1 is not recovered"
4703 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# Test 31a (needs at least two MDTs): bad name hash in a striped directory.
# $MDSCOUNT sub-directories are created in a directory striped over all MDTs
# while the client-side fail_loc 0x1628 (OBD_FAIL_LFSCK_BAD_NAME_HASH) is
# armed with fail_val=0. After the namespace LFSCK the test requires
# name_hash_repaired >= 1, remounts the client, and then stats and removes
# every sub-directory and finally the striped directory itself.
4706 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4709 echo "For the name entry under a striped directory, if the name"
4710 echo "hash does not match the shard, then the LFSCK will repair"
4711 echo "the bad name entry"
4714 check_mount_and_prep
4716 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4717 error "(1) Fail to create striped directory"
4719 echo "Inject failure stub on client to simulate the case that"
4720 echo "some name entry should be inserted into other non-first"
4721 echo "shard, but inserted into the first shard by wrong"
# Note: set_param runs locally here (no do_facet), i.e. on the client.
4723 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4724 $LCTL set_param fail_loc=0x1628 fail_val=0
4725 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4726 error "(2) Fail to create file under striped directory"
4727 $LCTL set_param fail_loc=0 fail_val=0
4729 echo "Trigger namespace LFSCK to repair bad name hash"
4730 $START_NAMESPACE -r -A ||
4731 error "(3) Fail to start LFSCK for namespace"
4733 wait_all_targets_blocked namespace completed 4
4735 local repaired=$($SHOW_NAMESPACE |
4736 awk '/^name_hash_repaired/ { print $2 }')
4737 [ $repaired -ge 1 ] ||
4738 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before verifying the entries.
4740 umount_client $MOUNT || error "(6) umount failed"
4741 mount_client $MOUNT || error "(7) mount failed"
# Every d<i> created above must be reachable and removable after the repair.
4743 for ((i = 0; i < $MDSCOUNT; i++)); do
4744 stat $DIR/$tdir/striped_dir/d$i ||
4745 error "(8) Fail to stat d$i after LFSCK"
4746 rmdir $DIR/$tdir/striped_dir/d$i ||
4747 error "(9) Fail to unlink d$i after LFSCK"
4750 rmdir $DIR/$tdir/striped_dir ||
4751 error "(10) Fail to remove the striped directory after LFSCK"
4753 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# Test 31b: same scenario as the bad-name-hash test with fail_val=0, but the
# entries are forced into the second shard (fail_val=1) and the repair counter
# is read from mds2 instead of through $SHOW_NAMESPACE.
# Requires at least 2 MDTs.
4756 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4759 echo "For the name entry under a striped directory, if the name"
4760 echo "hash does not match the shard, then the LFSCK will repair"
4761 echo "the bad name entry"
4764 check_mount_and_prep
# Stripe count is $MDSCOUNT, starting from MDT index 0.
4766 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4767 error "(1) Fail to create striped directory"
# NOTE(review): "secod" in the message below is a typo for "second"; it is
# left untouched here because it is runtime output.
4769 echo "Inject failure stub on client to simulate the case that"
4770 echo "some name entry should be inserted into other non-second"
4771 echo "shard, but inserted into the secod shard by wrong"
# Client-side injection; fail_val=1 presumably selects the second shard,
# matching the message above -- TODO confirm.
4773 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4774 $LCTL set_param fail_loc=0x1628 fail_val=1
4775 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4776 error "(2) Fail to create file under striped directory"
# Clear the injection before LFSCK runs.
4777 $LCTL set_param fail_loc=0 fail_val=0
4779 echo "Trigger namespace LFSCK to repair bad name hash"
4780 $START_NAMESPACE -r -A ||
4781 error "(3) Fail to start LFSCK for namespace"
4783 wait_all_targets_blocked namespace completed 4
# The repair is expected to be accounted on mds2, so query that facet directly.
4785 local repaired=$(do_facet mds2 $LCTL get_param -n \
4786 mdd.$(facet_svc mds2).lfsck_namespace |
4787 awk '/^name_hash_repaired/ { print $2 }')
4788 [ $repaired -ge 1 ] ||
4789 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before verifying (presumably to drop cached state).
4791 umount_client $MOUNT || error "(6) umount failed"
4792 mount_client $MOUNT || error "(7) mount failed"
# Every directory created above must be visible and removable after the repair.
4794 for ((i = 0; i < $MDSCOUNT; i++)); do
4795 stat $DIR/$tdir/striped_dir/d$i ||
4796 error "(8) Fail to stat d$i after LFSCK"
4797 rmdir $DIR/$tdir/striped_dir/d$i ||
4798 error "(9) Fail to unlink d$i after LFSCK"
4801 rmdir $DIR/$tdir/striped_dir ||
4802 error "(10) Fail to remove the striped directory after LFSCK"
4804 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Test 31c: the master object of a striped directory is created without its
# master LMV EA (fail_loc 0x1629 on $SINGLEMDS). Nothing is created under the
# directory afterwards, so namespace LFSCK is expected to re-generate the EA
# and report exactly one repaired striped directory.
# Requires at least 2 MDTs.
4807 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4810 echo "For some reason, the master MDT-object of the striped directory"
4811 echo "may lost its master LMV EA. If nobody created files under the"
4812 echo "master directly after the master LMV EA lost, then the LFSCK"
4813 echo "should re-generate the master LMV EA."
4816 check_mount_and_prep
4818 echo "Inject failure stub on MDT0 to simulate the case that the"
4819 echo "master MDT-object of the striped directory lost the LMV EA."
# The injection is active only while the striped directory is being created.
4821 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4822 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4823 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4824 error "(1) Fail to create striped directory"
4825 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4827 echo "Trigger namespace LFSCK to re-generate master LMV EA"
4828 $START_NAMESPACE -r -A ||
4829 error "(2) Fail to start LFSCK for namespace"
4831 wait_all_targets_blocked namespace completed 3
# Exactly one striped directory must be reported as repaired.
4833 local repaired=$($SHOW_NAMESPACE |
4834 awk '/^striped_dirs_repaired/ { print $2 }')
4835 [ $repaired -eq 1 ] ||
4836 error "(4) Fail to re-generate master LMV EA: $repaired"
# Remount the client before verifying (presumably to drop cached state).
4838 umount_client $MOUNT || error "(5) umount failed"
4839 mount_client $MOUNT || error "(6) mount failed"
# A repaired striped directory must list as empty; any output from ls is
# treated as evidence that the master LMV EA was not restored.
4841 local empty=$(ls $DIR/$tdir/striped_dir/)
4842 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
4844 rmdir $DIR/$tdir/striped_dir ||
4845 error "(8) Fail to remove the striped directory after LFSCK"
4847 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Test 31d: like the lost-master-LMV-EA case, but a file is created under the
# broken directory before LFSCK runs. LFSCK must then NOT re-generate the EA
# (striped_dirs_repaired stays 0) and must leave the directory read-only.
# Requires at least 2 MDTs.
4850 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4853 echo "For some reason, the master MDT-object of the striped directory"
4854 echo "may lost its master LMV EA. If somebody created files under the"
4855 echo "master directly after the master LMV EA lost, then the LFSCK"
4856 echo "should NOT re-generate the master LMV EA, instead, it should"
4857 echo "change the broken striped dirctory as read-only to prevent"
4858 echo "further damage"
4861 check_mount_and_prep
4863 echo "Inject failure stub on MDT0 to simulate the case that the"
4864 echo "master MDT-object of the striped directory lost the LMV EA."
# The injection is active only while the striped directory is being created.
4866 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4867 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4868 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4869 error "(1) Fail to create striped directory"
4870 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# Remount, then modify the broken directory so it no longer qualifies for
# automatic re-generation of the master LMV EA.
4872 umount_client $MOUNT || error "(2) umount failed"
4873 mount_client $MOUNT || error "(3) mount failed"
4875 touch $DIR/$tdir/striped_dir/dummy ||
4876 error "(4) Fail to touch under broken striped directory"
4878 echo "Trigger namespace LFSCK to find out the inconsistency"
4879 $START_NAMESPACE -r -A ||
4880 error "(5) Fail to start LFSCK for namespace"
4882 wait_all_targets_blocked namespace completed 6
# No striped directory may be reported as repaired in this scenario.
4884 local repaired=$($SHOW_NAMESPACE |
4885 awk '/^striped_dirs_repaired/ { print $2 }')
4886 [ $repaired -eq 0 ] ||
4887 error "(7) Re-generate master LMV EA unexpected: $repaired"
# The pre-existing file stays readable ...
4889 stat $DIR/$tdir/striped_dir/dummy ||
4890 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# ... but creating a new one must fail (success here is the error case).
4892 touch $DIR/$tdir/striped_dir/foo &&
4893 error "(9) The broken striped directory should be read-only"
# chattr -i clears the immutable attribute, which is presumably how LFSCK
# enforced read-only; it is cleared so the directory can be removed.
4895 chattr -i $DIR/$tdir/striped_dir ||
4896 error "(10) Fail to chattr on the broken striped directory"
4898 rmdir $DIR/$tdir/striped_dir ||
4899 error "(11) Fail to remove the striped directory after LFSCK"
4901 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Test 31e: a slave object of a striped directory is created without its
# slave LMV EA (fail_loc 0x162a, fail_val=0: the shard on the same MDT as the
# master, per the message below). Namespace LFSCK must report exactly one
# repaired shard. Requires at least 2 MDTs.
4904 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4907 echo "For some reason, the slave MDT-object of the striped directory"
4908 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4909 echo "slave LMV EA."
4912 check_mount_and_prep
4914 echo "Inject failure stub on MDT0 to simulate the case that the"
4915 echo "slave MDT-object (that resides on the same MDT as the master"
4916 echo "MDT-object resides on) lost the LMV EA."
# The injection is active only while the striped directory is being created.
4918 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4919 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
4920 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4921 error "(1) Fail to create striped directory"
4922 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4924 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4925 $START_NAMESPACE -r -A ||
4926 error "(2) Fail to start LFSCK for namespace"
4928 wait_all_targets_blocked namespace completed 3
# Exactly one shard must be reported as repaired.
4930 local repaired=$($SHOW_NAMESPACE |
4931 awk '/^striped_shards_repaired/ { print $2 }')
4932 [ $repaired -eq 1 ] ||
4933 error "(4) Fail to re-generate slave LMV EA: $repaired"
4935 rmdir $DIR/$tdir/striped_dir ||
4936 error "(5) Fail to remove the striped directory after LFSCK"
4938 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Test 31f: lost slave LMV EA on a shard that lives on a different MDT than
# the master (fail_loc 0x162a, fail_val=1). The repair counter is therefore
# read from mds2. Requires at least 2 MDTs.
4941 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4944 echo "For some reason, the slave MDT-object of the striped directory"
4945 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4946 echo "slave LMV EA."
4949 check_mount_and_prep
4951 echo "Inject failure stub on MDT0 to simulate the case that the"
4952 echo "slave MDT-object (that resides on different MDT as the master"
4953 echo "MDT-object resides on) lost the LMV EA."
# The injection is active only while the striped directory is being created.
4955 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4956 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
4957 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4958 error "(1) Fail to create striped directory"
4959 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4961 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4962 $START_NAMESPACE -r -A ||
4963 error "(2) Fail to start LFSCK for namespace"
4965 wait_all_targets_blocked namespace completed 3
# Exactly one shard must be reported as repaired, as seen from mds2.
4967 local repaired=$(do_facet mds2 $LCTL get_param -n \
4968 mdd.$(facet_svc mds2).lfsck_namespace |
4969 awk '/^striped_shards_repaired/ { print $2 }')
4970 [ $repaired -eq 1 ] ||
4971 error "(4) Fail to re-generate slave LMV EA: $repaired"
4973 rmdir $DIR/$tdir/striped_dir ||
4974 error "(5) Fail to remove the striped directory after LFSCK"
4976 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Test 31g: the slave LMV EA of the first shard is created with a wrong
# stripe index (fail_loc 0x162b). Namespace LFSCK must report one repaired
# shard, after which the directory must accept create/unlink again.
# Requires at least 2 MDTs.
4979 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4982 echo "For some reason, the stripe index in the slave LMV EA is"
4983 echo "corrupted. The LFSCK should repair the slave LMV EA."
4986 check_mount_and_prep
4988 echo "Inject failure stub on MDT0 to simulate the case that the"
4989 echo "slave LMV EA on the first shard of the striped directory"
4990 echo "claims the same index as the second shard claims"
# The injection is active only while the striped directory is being created.
4992 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
4993 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
4994 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4995 error "(1) Fail to create striped directory"
4996 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4998 echo "Trigger namespace LFSCK to repair the slave LMV EA"
4999 $START_NAMESPACE -r -A ||
5000 error "(2) Fail to start LFSCK for namespace"
5002 wait_all_targets_blocked namespace completed 3
# Exactly one shard must be reported as repaired.
5004 local repaired=$($SHOW_NAMESPACE |
5005 awk '/^striped_shards_repaired/ { print $2 }')
5006 [ $repaired -eq 1 ] ||
5007 error "(4) Fail to repair slave LMV EA: $repaired"
# Remount the client, then prove the directory is usable again.
5009 umount_client $MOUNT || error "(5) umount failed"
5010 mount_client $MOUNT || error "(6) mount failed"
5012 touch $DIR/$tdir/striped_dir/foo ||
5013 error "(7) Fail to touch file after the LFSCK"
5015 rm -f $DIR/$tdir/striped_dir/foo ||
5016 error "(8) Fail to unlink file after the LFSCK"
5018 rmdir $DIR/$tdir/striped_dir ||
5019 error "(9) Fail to remove the striped directory after LFSCK"
5021 run_test 31g "Repair the corrupted slave LMV EA"
# Test 31h: the name entry of the first shard is created with a wrong index
# (fail_loc 0x162c). Namespace LFSCK must report one repaired dirent, after
# which the directory must accept create/unlink again.
# Requires at least 2 MDTs.
5024 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5027 echo "For some reason, the shard's name entry in the striped"
5028 echo "directory may be corrupted. The LFSCK should repair the"
5029 echo "bad shard's name entry."
5032 check_mount_and_prep
5034 echo "Inject failure stub on MDT0 to simulate the case that the"
5035 echo "first shard's name entry in the striped directory claims"
5036 echo "the same index as the second shard's name entry claims."
# The injection is active only while the striped directory is being created.
5038 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
5039 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5040 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5041 error "(1) Fail to create striped directory"
5042 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5044 echo "Trigger namespace LFSCK to repair the shard's name entry"
5045 $START_NAMESPACE -r -A ||
5046 error "(2) Fail to start LFSCK for namespace"
5048 wait_all_targets_blocked namespace completed 3
# This case is accounted under dirent_repaired, not striped_shards_repaired.
5050 local repaired=$($SHOW_NAMESPACE |
5051 awk '/^dirent_repaired/ { print $2 }')
5052 [ $repaired -eq 1 ] ||
5053 error "(4) Fail to repair shard's name entry: $repaired"
# Remount the client, then prove the directory is usable again.
5055 umount_client $MOUNT || error "(5) umount failed"
5056 mount_client $MOUNT || error "(6) mount failed"
5058 touch $DIR/$tdir/striped_dir/foo ||
5059 error "(7) Fail to touch file after the LFSCK"
5061 rm -f $DIR/$tdir/striped_dir/foo ||
5062 error "(8) Fail to unlink file after the LFSCK"
5064 rmdir $DIR/$tdir/striped_dir ||
5065 error "(9) Fail to remove the striped directory after LFSCK"
5067 run_test 31h "Repair the corrupted shard's name entry"
# Test 32a: layout LFSCK must still be stoppable after an OST has gone away
# while the scan is in scanning-phase1.
5072 umount_client $MOUNT
# Presumably slows the LFSCK engine (fail_val=3) so the scan is still in
# phase 1 when it is checked below -- TODO confirm the unit of fail_val.
5074 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5075 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5076 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
5078 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5079 [ "$STATUS" == "scanning-phase1" ] ||
5080 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
# Take ost1 down while the scan is in progress, then remove the delay.
5083 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
5085 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The actual assertion: stopping LFSCK must succeed with ost1 unavailable.
5089 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
# Bring ost1 back for the following tests.
5091 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5092 error "(5) Fail to start ost1"
5094 run_test 32a "stop LFSCK when some OST failed"
# Test 32b: namespace LFSCK must still be stoppable after an MDT (mds2) has
# gone away while the scan is in scanning-phase1. Requires at least 2 MDTs.
5098 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Build a small cross-MDT tree: a remote directory on MDT index 1 holding two
# striped directories that start at MDT index 0.
5101 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5102 error "(1) Fail to create $DIR/$tdir/dp"
5103 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5104 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5105 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5106 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5107 umount_client $MOUNT
# Presumably slows the LFSCK engine so the scan stays in phase 1 long enough.
5109 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5110 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5111 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
# Poll the status on $SINGLEMDS until it reads "scanning-phase1" (32 is
# presumably the timeout in seconds). \\\$2 becomes \$2 after this shell's
# double-quote processing, so a later evaluation still hands awk a literal $2.
5113 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5114 mdd.${MDT_DEV}.lfsck_namespace |
5115 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5117 error "(5) unexpected status"
# Take mds2 down while the scan is in progress, then remove the delay.
5121 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
5123 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The actual assertion: stopping LFSCK must succeed with mds2 unavailable.
5127 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
# Bring mds2 back for the following tests.
5129 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5130 error "(8) Fail to start MDT2"
5132 run_test 32b "stop LFSCK when some MDT failed"
# Test 33: the command-line options given to lfsck_start must be reflected
# in the "param" line of the corresponding LFSCK status output.
# Layout LFSCK with --dryrun -o: expected params "dryrun,all_targets,orphan".
5138 $START_LAYOUT --dryrun -o -r ||
5139 error "(1) Fail to start layout LFSCK"
5140 wait_all_targets_blocked layout completed 2
5142 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5143 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5144 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# Namespace LFSCK with -e abort -A: expected params "failout,all_targets".
5146 $START_NAMESPACE -e abort -A -r ||
5147 error "(4) Fail to start namespace LFSCK"
5148 wait_all_targets_blocked namespace completed 5
5150 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5151 [ "$PARAMS" == "failout,all_targets" ] ||
5152 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5154 run_test 33 "check LFSCK paramters"
# Test 34: a remote directory is created while fail_loc 0x1630
# (OBD_FAIL_LFSCK_NO_AGENTOBJ) is set on $SINGLEMDS. The first namespace
# LFSCK run must report one repaired dirent; a second run must report none.
# Only runs with at least 2 MDTs and a ZFS backend on $SINGLEMDS.
5158 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5159 [ $(facet_fstype $SINGLEMDS) != zfs ] &&
5160 skip "Only valid for ZFS backend" && return
# The injection is active only while the remote directory is being created.
5164 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
5165 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5166 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5167 error "(1) Fail to create $DIR/$tdir/dummy"
5169 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# First pass: poll the status on $SINGLEMDS until it reads "completed".
5170 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
5171 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5172 mdd.${MDT_DEV}.lfsck_namespace |
5173 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5175 error "(3) unexpected status"
# Exactly one dirent must have been repaired by the first pass.
5178 local repaired=$($SHOW_NAMESPACE |
5179 awk '/^dirent_repaired/ { print $2 }')
5180 [ $repaired -eq 1 ] ||
5181 error "(4) Fail to repair the lost agent object: $repaired"
# Second pass: nothing should be left to repair.
5183 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5184 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5185 mdd.${MDT_DEV}.lfsck_namespace |
5186 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5188 error "(6) unexpected status"
5191 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5192 [ $repaired -eq 0 ] ||
5193 error "(7) Unexpected repairing: $repaired"
5195 run_test 34 "LFSCK can rebuild the lost agent object"
# Test 35: a remote directory is created while fail_loc 0x1631
# (OBD_FAIL_LFSCK_NO_AGENTENT) is set on mds2. The first namespace LFSCK run
# must report one repaired agent entry on mds2; a second run, after the
# targets were set up again, must report none. Requires at least 2 MDTs.
5199 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# The injection is active only while the remote directory is being created.
5203 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
5204 do_facet mds2 $LCTL set_param fail_loc=0x1631
5205 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5206 error "(1) Fail to create $DIR/$tdir/dummy"
5209 do_facet mds2 $LCTL set_param fail_loc=0
# First pass: poll the status on mds2 until it reads "completed".
# The failure message names MDS2 explicitly; this test has no loop variable
# that could identify the facet.
5210 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
5211 wait_update_facet mds2 "$LCTL get_param -n \
5212 mdd.$(facet_svc mds2).lfsck_namespace |
5213 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5214 error "(3) MDS2 is not the expected 'completed'"
# Exactly one agent entry must have been repaired on mds2.
5216 local repaired=$(do_facet mds2 $LCTL get_param -n \
5217 mdd.$(facet_svc mds2).lfsck_namespace |
5218 awk '/^agent_entries_repaired/ { print $2 }')
5219 [ $repaired -eq 1 ] ||
5220 error "(4) Fail to repair the lost agent entry: $repaired"
5222 echo "stopall to cleanup object cache"
5225 setupall > /dev/null
# Second pass: nothing should be left to repair.
5227 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
5228 wait_update_facet mds2 "$LCTL get_param -n \
5229 mdd.$(facet_svc mds2).lfsck_namespace |
5230 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5231 error "(6) MDS2 is not the expected 'completed'"
5233 repaired=$(do_facet mds2 $LCTL get_param -n \
5234 mdd.$(facet_svc mds2).lfsck_namespace |
5235 awk '/^agent_entries_repaired/ { print $2 }')
5236 [ $repaired -eq 0 ] ||
5237 error "(7) Unexpected repairing: $repaired"
5239 run_test 35 "LFSCK can rebuild the lost agent entry"
# Test 36a: three mirrored files each lose one mirror from their LOV EA
# (mirror split -d under fail_loc 0x1616). Layout LFSCK in orphan mode must
# rebuild the missing mirror of every file. Requires at least 3 OSTs.
5242 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5245 echo "The target MDT-object's LOV EA corrupted as to lose one of the "
5246 echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
5247 echo "with the PFID EA of related OST-object(s) belong to the mirror."
5250 check_mount_and_prep
# Diagnostics only: dump grant info now and register a dump of space/inode
# usage and grants via stack_trap (presumably run when the test exits).
5254 lctl get_param osc.*.*grant*
5255 stack_trap "lfs df $DIR; lfs df -i $DIR; lctl get_param osc.*.*grant*"
# Each file gets three mirrors (-N); every mirror has two components: one
# ending at 1M/2M/3M on two OSTs and one to EOF (-E -1) on the remaining OST,
# i.e. three OST objects per mirror.
5257 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5258 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5259 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5260 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5261 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
5262 error "(1) Fail to create mirror file $DIR/$tdir/f1"
5263 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5264 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
5265 error "(2) Fail to create mirror file $DIR/$tdir/f2"
# Write 4M so that both components of a mirror receive data, then resync so
# all mirrors are written.
5267 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5268 error "(3) Fail to write $DIR/$tdir/f0"
5269 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
5270 error "(4) Fail to write $DIR/$tdir/f1"
5271 dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
5272 error "(5) Fail to write $DIR/$tdir/f2"
5274 $LFS mirror resync $DIR/$tdir/f0 ||
5275 error "(6) Fail to resync $DIR/$tdir/f0"
5276 $LFS mirror resync $DIR/$tdir/f1 ||
5277 error "(7) Fail to resync $DIR/$tdir/f1"
5278 $LFS mirror resync $DIR/$tdir/f2 ||
5279 error "(8) Fail to resync $DIR/$tdir/f2"
5281 cancel_lru_locks mdc
5282 cancel_lru_locks osc
5284 $LFS getstripe $DIR/$tdir/f0 ||
5285 error "(9) Fail to getstripe for $DIR/$tdir/f0"
5286 $LFS getstripe $DIR/$tdir/f1 ||
5287 error "(10) Fail to getstripe for $DIR/$tdir/f1"
5288 $LFS getstripe $DIR/$tdir/f2 ||
5289 error "(11) Fail to getstripe for $DIR/$tdir/f2"
5291 echo "Inject failure, to simulate the case of missing one mirror in LOV"
# With the fail_loc set on mds1, a different mirror id is split off and
# destroyed (-d) from each file; presumably the OST objects survive as
# orphans -- TODO confirm.
5292 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5293 do_facet mds1 $LCTL set_param fail_loc=0x1616
5295 $LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
5296 error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
5297 $LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
5298 error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
5299 $LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
5300 error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"
5304 do_facet mds1 $LCTL set_param fail_loc=0
# Sanity check before LFSCK: the split mirror id must be gone from the layout
# (a grep match is the error case) and two mirrors must remain per file.
5306 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
5307 error "(15) The 1st of mirror is not destroyed"
5308 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
5309 error "(16) The 2nd of mirror is not destroyed"
5310 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
5311 error "(17) The 3rd of mirror is not destroyed"
5315 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5316 [ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
5317 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5318 [ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
5319 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5320 [ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"
5322 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5323 $START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"
5325 for k in $(seq $MDSCOUNT); do
5326 # The LFSCK status query interval is 30 seconds. For the case
5327 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5328 # time to guarantee the status sync up.
5329 wait_update_facet mds${k} "$LCTL get_param -n \
5330 mdd.$(facet_svc mds${k}).lfsck_layout |
5331 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5332 error "(22) MDS${k} is not the expected 'completed'"
# OSTs are checked once, without polling: they must already be "completed".
5335 for k in $(seq $OSTCOUNT); do
5336 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5337 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5338 awk '/^status/ { print $2 }')
5339 [ "$cur_status" == "completed" ] ||
5340 error "(23) OST${k} Expect 'completed', but got '$cur_status'"
# 9 is consistent with 3 files x 3 OST objects of the one mirror each lost.
5343 local repaired=$(do_facet mds1 $LCTL get_param -n \
5344 mdd.$(facet_svc mds1).lfsck_layout |
5345 awk '/^repaired_orphan/ { print $2 }')
5346 [ $repaired -eq 9 ] ||
5347 error "(24) Expect 9 fixed on mds1, but got: $repaired"
# After the repair every file must be back to three mirrors ...
5349 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5350 [ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
5351 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5352 [ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
5353 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5354 [ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"
# ... and the specific mirror id that was split off must be present again.
# The full layout is dumped before failing to aid debugging.
5356 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
5357 $LFS getstripe $DIR/$tdir/f0
5358 error "(28) The 1st of mirror is not recovered"
5361 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
5362 $LFS getstripe $DIR/$tdir/f1
5363 error "(29) The 2nd of mirror is not recovered"
5366 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
5367 $LFS getstripe $DIR/$tdir/f2
5368 error "(30) The 3rd of mirror is not recovered"
5371 run_test 36a "rebuild LOV EA for mirrored file (1)"
# Test 36b: a fully resynced three-mirror file loses its MDT object (rm under
# fail_loc 0x1616) while its OST objects remain. Layout LFSCK in orphan mode
# must re-create the file under .lustre/lost+found/MDT0000 with all three
# mirrors and six components. Skipped when FILESET is set; needs >= 3 OSTs.
5374 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5375 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5378 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5379 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5380 echo "with the PFID EA of related OST-object(s) belong to the file. "
5383 check_mount_and_prep
# Three mirrors (-N), two components each, three OST objects per mirror.
5385 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5386 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5387 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# Remember the FID: the recovered file is looked up by it further down.
5389 local fid=$($LFS path2fid $DIR/$tdir/f0)
5391 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5392 error "(1) Fail to write $DIR/$tdir/f0"
5393 $LFS mirror resync $DIR/$tdir/f0 ||
5394 error "(2) Fail to resync $DIR/$tdir/f0"
5396 cancel_lru_locks mdc
5397 cancel_lru_locks osc
5399 $LFS getstripe $DIR/$tdir/f0 ||
5400 error "(3) Fail to getstripe for $DIR/$tdir/f0"
5402 echo "Inject failure, to simulate the case of missing the MDT-object"
# The injection is active only while the file is being removed.
5403 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5404 do_facet mds1 $LCTL set_param fail_loc=0x1616
5405 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
5409 do_facet mds1 $LCTL set_param fail_loc=0
5411 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5412 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
5414 for k in $(seq $MDSCOUNT); do
5415 # The LFSCK status query interval is 30 seconds. For the case
5416 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5417 # time to guarantee the status sync up.
5418 wait_update_facet mds${k} "$LCTL get_param -n \
5419 mdd.$(facet_svc mds${k}).lfsck_layout |
5420 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5421 error "(6) MDS${k} is not the expected 'completed'"
# OSTs are checked once, without polling: they must already be "completed".
5424 for k in $(seq $OSTCOUNT); do
5425 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5426 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5427 awk '/^status/ { print $2 }')
5428 [ "$cur_status" == "completed" ] ||
5429 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
# 9 is consistent with 3 mirrors x 3 OST objects of the removed file.
5432 local count=$(do_facet mds1 $LCTL get_param -n \
5433 mdd.$(facet_svc mds1).lfsck_layout |
5434 awk '/^repaired_orphan/ { print $2 }')
5435 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# The recovered file is expected under lost+found as <fid>-R-0.
5437 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5438 count=$($LFS getstripe --mirror-count $name)
5439 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5441 count=$($LFS getstripe --component-count $name)
5442 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
# Each of the three mirror ids must be present; the layout is dumped before
# failing to aid debugging.
5444 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5445 $LFS getstripe $name
5446 error "(11) The 1st of mirror is not recovered"
5449 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5450 $LFS getstripe $name
5451 error "(12) The 2nd of mirror is not recovered"
5454 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5455 $LFS getstripe $name
5456 error "(13) The 3rd of mirror is not recovered"
5459 run_test 36b "rebuild LOV EA for mirrored file (2)"
# Test 36c: a mirrored file is written again after its resync (leaving one
# mirror stale), then loses its MDT object (rm under fail_loc 0x1616). Layout
# LFSCK in orphan mode must re-create it under .lustre/lost+found/MDT0000 with
# two mirrors, four components and the component flags seen before the loss.
# Skipped when FILESET is set; needs >= 3 OSTs.
5462 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5463 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5466 echo "The mirrored file has been modified, not resynced yet, then "
5467 echo "lost its MDT-object, but relatd OST-objects are still there. "
5468 echo "The layout LFSCK should rebuild the LOV EA and relatd status "
5469 echo "with the PFID EA of related OST-object(s) belong to the file. "
5472 check_mount_and_prep
5474 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5476 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# Remember the FID: the recovered file is looked up by it further down.
5478 local fid=$($LFS path2fid $DIR/$tdir/f0)
5480 # The 1st dd && resync makes all related OST-objects have been written
5481 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5482 error "(1.1) Fail to write $DIR/$tdir/f0"
5483 $LFS mirror resync $DIR/$tdir/f0 ||
5484 error "(1.2) Fail to resync $DIR/$tdir/f0"
5485 # The 2nd dd makes one mirror to be stale
5486 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5487 error "(1.3) Fail to write $DIR/$tdir/f0"
5489 cancel_lru_locks mdc
5490 cancel_lru_locks osc
5492 $LFS getstripe $DIR/$tdir/f0 ||
5493 error "(2) Fail to getstripe for $DIR/$tdir/f0"
# Capture the lcme_flags found in the first and in the last 10 lines of the
# layout dump; they are compared with the recovered file's flags below.
5495 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5496 awk '/lcme_flags/ { print $2 }')
5497 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5498 awk '/lcme_flags/ { print $2 }')
5500 echo "Inject failure, to simulate the case of missing the MDT-object"
# The injection is active only while the file is being removed.
5501 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5502 do_facet mds1 $LCTL set_param fail_loc=0x1616
5503 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
5507 do_facet mds1 $LCTL set_param fail_loc=0
5509 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5510 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
5512 for k in $(seq $MDSCOUNT); do
5513 # The LFSCK status query interval is 30 seconds. For the case
5514 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5515 # time to guarantee the status sync up.
5516 wait_update_facet mds${k} "$LCTL get_param -n \
5517 mdd.$(facet_svc mds${k}).lfsck_layout |
5518 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5519 error "(5) MDS${k} is not the expected 'completed'"
# OSTs are checked once, without polling: they must already be "completed".
5522 for k in $(seq $OSTCOUNT); do
5523 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5524 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5525 awk '/^status/ { print $2 }')
5526 [ "$cur_status" == "completed" ] ||
5527 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
# 6 orphans are expected here; this agrees with the 2 mirrors / 4 components
# asserted below, so the failure message reports 6 as well.
5530 local count=$(do_facet mds1 $LCTL get_param -n \
5531 mdd.$(facet_svc mds1).lfsck_layout |
5532 awk '/^repaired_orphan/ { print $2 }')
5533 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
# The recovered file is expected under lost+found as <fid>-R-0.
5535 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5536 count=$($LFS getstripe --mirror-count $name)
5537 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5539 count=$($LFS getstripe --component-count $name)
5540 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
# The flags at both ends of the layout dump must match the values saved
# before the MDT object was lost; the layout is dumped before failing.
5542 local flags=$($LFS getstripe $name | head -n 10 |
5543 awk '/lcme_flags/ { print $2 }')
5544 [ "$flags" == "$saved_flags1" ] || {
5545 $LFS getstripe $name
5546 error "(10) expect flags $saved_flags1, got $flags"
5549 flags=$($LFS getstripe $name | tail -n 10 |
5550 awk '/lcme_flags/ { print $2 }')
5551 [ "$flags" == "$saved_flags2" ] || {
5552 $LFS getstripe $name
5553 error "(11) expect flags $saved_flags2, got $flags"
5556 run_test 36c "rebuild LOV EA for mirrored file (3)"
# Test 37: namespace LFSCK is run while a directory on MDT index 0 is held by
# a paused background multiop; per the run_test description the scan must
# skip such an orphan and still reach "completed" on all targets.
5562 local t_dir="$DIR/$tdir/d0"
5563 check_mount_and_prep
5565 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
# NOTE(review): D_c is the multiop command string; presumably it opens the
# directory and pauses until signalled -- TODO confirm.
5566 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
# If LFSCK cannot be started, the paused multiop is signalled (USR1) after
# the failure is reported. $PID is not assigned in the lines shown here.
5570 $START_NAMESPACE -r -A || {
5571 error "(3) Fail to start LFSCK for namespace!"; kill -USR1 $PID; }
5573 wait_all_targets_blocked namespace completed 4
5578 run_test 37 "LFSCK must skip a ORPHAN"
5582 [[ $MDS1_VERSION -le $(version_code 2.12.51) ]] &&
5583 skip "Need MDS version newer than 2.12.51"
5585 test_mkdir $DIR/$tdir
5586 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5587 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5589 # create foreign file
5590 $LFS setstripe --foreign=daos --flags 0xda05 \
5591 -x "${uuid1}@${uuid2}" $DIR/$tdir/$tfile ||
5592 error "$DIR/$tdir/$tfile: create failed"
5594 $LFS getstripe -v $DIR/$tdir/$tfile |
5595 grep "lfm_magic:.*0x0BD70BD0" ||
5596 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5597 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5598 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5599 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5600 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*daos" ||
5601 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5602 $LFS getstripe -v $DIR/$tdir/$tfile |
5603 grep "lfm_flags:.*0x0000DA05" ||
5604 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5605 $LFS getstripe $DIR/$tdir/$tfile |
5606 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5607 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5609 # modify striping should fail
5610 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5611 error "$DIR/$tdir/$tfile: setstripe should fail"
5613 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5615 wait_all_targets_blocked namespace completed 1
5617 # check that "global" namespace_repaired == 0 !!!
5618 local repaired=$(do_facet mds1 \
5619 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5620 awk '/^namespace_repaired/ { print \\\$2 }'")
5621 [ $repaired -eq 0 ] ||
5622 error "(2) Expect no namespace repair, but got: $repaired"
5624 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5626 wait_all_targets_blocked layout completed 2
5628 # check that "global" layout_repaired == 0 !!!
5629 local repaired=$(do_facet mds1 \
5630 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5631 awk '/^layout_repaired/ { print \\\$2 }'")
5632 [ $repaired -eq 0 ] ||
5633 error "(2) Expect no layout repair, but got: $repaired"
5635 echo "post-lfsck checks of foreign file"
5637 $LFS getstripe -v $DIR/$tdir/$tfile |
5638 grep "lfm_magic:.*0x0BD70BD0" ||
5639 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
# lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
# - sizeof(lfm_type) - sizeof(lfm_flags)
5641 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5642 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5643 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*daos" ||
5644 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5645 $LFS getstripe -v $DIR/$tdir/$tfile |
5646 grep "lfm_flags:.*0x0000DA05" ||
5647 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5648 $LFS getstripe $DIR/$tdir/$tfile |
5649 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5650 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5652 # modify striping should fail
5653 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5654 error "$DIR/$tdir/$tfile: setstripe should fail"
# read/write of the foreign file should fail.
# Each check must call error() when the operation unexpectedly succeeds;
# a bare message string after && would only be executed as a (nonexistent)
# command and the test would carry on without being marked as failed.
cat $DIR/$tdir/$tfile &&
	error "$DIR/$tdir/$tfile: read should fail"
cat /etc/passwd > $DIR/$tdir/$tfile &&
	error "$DIR/$tdir/$tfile: write should fail"
5661 #remove foreign file
5662 rm $DIR/$tdir/$tfile ||
5663 error "$DIR/$tdir/$tfile: remove of foreign file has failed"
5665 run_test 38 "LFSCK does not break foreign file and reverse is also true"
# Test 39: create a "foreign" directory (type daos, flags 0xda05, value
# "<uuid1>@<uuid2>"), run namespace and layout LFSCK over it, and verify
# that:
#   - LFSCK reports zero namespace/layout repairs,
#   - the foreign LMV EA (magic, length, type, flags, value) is unchanged,
#   - file creation inside the foreign dir is still refused, while
#     chmod/chown/rmdir on it still work.
# Skipped when the MDS is not newer than 2.12.51.
[[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.12.51) ]] &&
	skip "Need MDS version newer than 2.12.51"

test_mkdir $DIR/$tdir
local uuid1=$(cat /proc/sys/kernel/random/uuid)
local uuid2=$(cat /proc/sys/kernel/random/uuid)
# the foreign directory under test
local fdir=$DIR/$tdir/${tdir}2
# repair counter reported by lfsck_query, reused for both LFSCK passes
local repaired

# create foreign dir
$LFS mkdir --foreign=daos --xattr="${uuid1}@${uuid2}" --flags=0xda05 \
	"$fdir" ||
	error "$fdir: create failed"

$LFS getdirstripe -v "$fdir" |
	grep "lfm_magic:.*0x0CD50CD0" ||
	error "$fdir: invalid LMV EA magic"
# lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
# - sizeof(lfm_type) - sizeof(lfm_flags), i.e. the length of the value
# "<uuid1>@<uuid2>": 36 + 1 + 36 = 73
$LFS getdirstripe -v "$fdir" | grep "lfm_length:.*73" ||
	error "$fdir: invalid LMV EA size"
$LFS getdirstripe -v "$fdir" | grep "lfm_type:.*daos" ||
	error "$fdir: invalid LMV EA type"
$LFS getdirstripe -v "$fdir" |
	grep "lfm_flags:.*0x0000DA05" ||
	error "$fdir: invalid LMV EA flags"
$LFS getdirstripe "$fdir" |
	grep "lfm_value.*${uuid1}@${uuid2}" ||
	error "$fdir: invalid LMV EA value"

# file create in dir should fail; error() is required here, otherwise an
# unexpected success would go unreported
touch "$fdir/$tfile" &&
	error "$fdir: file create should fail"

# chmod should work
chmod 777 "$fdir" ||
	error "$fdir: chmod failed"

# chown should work
chown $RUNAS_ID:$RUNAS_GID "$fdir" ||
	error "$fdir: chown failed"

$START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"

wait_all_targets_blocked namespace completed 1

# check that "global" namespace_repaired == 0 !!!
repaired=$(do_facet mds1 \
	   "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
	    awk '/^namespace_repaired/ { print \\\$2 }'")
# quoted so that an empty query result is reported as a failure too
[ "$repaired" -eq 0 ] ||
	error "(2) Expect nothing to be repaired, but got: $repaired"

$START_LAYOUT -A -r || error "Fail to start LFSCK for layout"

wait_all_targets_blocked layout completed 2

# check that "global" layout_repaired == 0 !!!
repaired=$(do_facet mds1 \
	   "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
	    awk '/^layout_repaired/ { print \\\$2 }'")
[ "$repaired" -eq 0 ] ||
	error "(2) Expect no layout repair, but got: $repaired"

echo "post-lfsck checks of foreign dir"

# the foreign LMV EA must be exactly what it was before LFSCK ran
$LFS getdirstripe -v "$fdir" |
	grep "lfm_magic:.*0x0CD50CD0" ||
	error "$fdir: invalid LMV EA magic"
# lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
# - sizeof(lfm_type) - sizeof(lfm_flags)
$LFS getdirstripe -v "$fdir" | grep "lfm_length:.*73" ||
	error "$fdir: invalid LMV EA size"
$LFS getdirstripe -v "$fdir" | grep "lfm_type:.*daos" ||
	error "$fdir: invalid LMV EA type"
$LFS getdirstripe -v "$fdir" |
	grep "lfm_flags:.*0x0000DA05" ||
	error "$fdir: invalid LMV EA flags"
$LFS getdirstripe "$fdir" |
	grep "lfm_value.*${uuid1}@${uuid2}" ||
	error "$fdir: invalid LMV EA value"

# file create in dir should still fail
touch "$fdir/$tfile" &&
	error "$fdir: file create should fail"

# chmod should still work
chmod 777 "$fdir" ||
	error "$fdir: chmod failed"

# chown should still work
chown $RUNAS_ID:$RUNAS_GID "$fdir" ||
	error "$fdir: chown failed"

# remove foreign dir
rmdir "$fdir" ||
	error "$fdir: remove of foreign dir has failed"
run_test 39 "LFSCK does not break foreign dir and reverse is also true"

# Put back the MDS/OST sizes and OST count that were saved at the top of
# this script before they were shrunk for the tests.
OSTCOUNT=$SAVED_OSTCOUNT
OSTSIZE=$SAVED_OSTSIZE
MDSSIZE=$SAVED_MDSSIZE

# Final cleanup of the system, with a reformat requested.
REFORMAT=yes cleanup_and_setup_lustre

check_and_cleanup_lustre