3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
# Default LUSTRE to the parent of this script's directory unless the caller
# has already set it, then source test-framework.sh from that tree.
11 LUSTRE=${LUSTRE:-$(dirname $0)/..}
12 . $LUSTRE/tests/test-framework.sh
16 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT "
18 # DNE does not support striped directory on zfs-based backend yet.
19 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
20 # Bug number for excepted test:
22 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# Version-gated exclusions: each check appends test IDs to ALWAYS_EXCEPT
# when the queried facet reports a version code below the stated threshold.
# MDS at or below 2.4.90: exclude test 2c.
24 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
25 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
# ost1 older than 2.5.55: exclude tests 11 through 21.
27 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
28 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
# MDS older than 2.6.50: exclude tests 2d, 2e, 3 and 22 through 31.
30 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
31 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
33 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
36 require_dsh_mds || exit 0
40 if ! check_versions; then
41 skip "It is NOT necessary to test lfsck under interoperation mode"
45 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
46 skip "Need MDS version at least 2.3.60" && exit 0
# Keep copies of the caller's sizing before it is overridden below.
# NOTE(review): where the SAVED_* values are consumed is not visible here.
50 SAVED_MDSSIZE=${MDSSIZE}
51 SAVED_OSTSIZE=${OSTSIZE}
52 SAVED_OSTCOUNT=${OSTCOUNT}
53 # Use small MDS and OST sizes to speed up formatting.
54 # Do not make MDSSIZE/OSTSIZE too small, since that affects the default journal size.
# ZFS-backed MDT: override MDSSIZE.
56 [ $(facet_fstype $SINGLEMDS) == zfs ] && MDSSIZE=300000
# ZFS-backed ost1: override OSTSIZE.
58 [ $(facet_fstype ost1) == zfs ] && OSTSIZE=300000
60 # Many OSTs are not needed; cap the count to reduce the format/start/stop overhead.
62 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
64 # Build up a clean test environment (REFORMAT=yes is set for this call only).
65 REFORMAT="yes" check_and_setup_lustre
# Shared target names and command prefixes used by the tests below.
# The START_*, STOP_* and SHOW_* strings are expanded unquoted as commands
# (e.g. "$START_NAMESPACE -r"), so callers can append extra options.

# Names of the first MDT and first OST, derived from FSNAME.
67 MDT_DEV="${FSNAME}-MDT0000"
68 OST_DEV="${FSNAME}-OST0000"
# mdsdevname is called with SINGLEMDS after every "mds" substring is removed.
69 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Start LFSCK on the MDT for the namespace and layout components.
70 START_NAMESPACE="do_facet $SINGLEMDS \
71 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
72 START_LAYOUT="do_facet $SINGLEMDS \
73 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
# Start layout LFSCK directly on ost1 against the first OST.
74 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# Stop LFSCK on the MDT.
75 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Read the LFSCK parameter files; tests parse fields such as "status"
# and "flags" from this output with awk.
76 SHOW_NAMESPACE="do_facet $SINGLEMDS \
77 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
78 SHOW_LAYOUT="do_facet $SINGLEMDS \
79 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
80 SHOW_LAYOUT_ON_OST="do_facet ost1 \
81 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount option sets passed to "start" for the MDS. The first two extend
# MDS_MOUNT_OPTS; MOUNT_OPTS_SKIP_LFSCK does not include MDS_MOUNT_OPTS.
82 MOUNT_OPTS_SCRUB="$MDS_MOUNT_OPTS -o user_xattr"
83 MOUNT_OPTS_NOSCRUB="$MDS_MOUNT_OPTS -o user_xattr,noscrub"
84 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
93 echo "preparing... $nfiles * $ndirs files will be created $(date)."
94 if [ ! -z $igif ]; then
95 #define OBD_FAIL_FID_IGIF 0x1504
96 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
99 cp $LUSTRE/tests/*.sh $DIR/$tdir/
100 if [ $ndirs -gt 0 ]; then
101 createmany -d $DIR/$tdir/d $ndirs
102 createmany -m $DIR/$tdir/f $ndirs
103 if [ $nfiles -gt 0 ]; then
104 for ((i = 0; i < $ndirs; i++)); do
105 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
106 /dev/null || error "createmany $nfiles"
109 createmany -d $DIR/$tdir/e $ndirs
112 if [ ! -z $igif ]; then
113 touch $DIR/$tdir/dummy
114 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
117 echo "prepared $(date)."
120 run_e2fsck_on_mdt0() {
121 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
123 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
124 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
126 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
127 error "(2) Detected inconsistency on MDT0"
129 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
130 error "(3) Fail to start MDT0"
133 wait_all_targets_blocked() {
138 local count=$(do_facet mds1 \
139 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
140 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
141 [[ $count -eq $MDSCOUNT ]] || {
142 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
143 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
152 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
153 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
154 "$MDSCOUNT" $LTIME || {
155 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
156 error "($err) some MDTs are not in ${status}"
163 #define OBD_FAIL_LFSCK_DELAY1 0x1600
164 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
165 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
167 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
169 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
170 [ "$STATUS" == "scanning-phase1" ] ||
171 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
173 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
175 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
176 [ "$STATUS" == "stopped" ] ||
177 error "(6) Expect 'stopped', but got '$STATUS'"
179 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
181 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
182 [ "$STATUS" == "scanning-phase1" ] ||
183 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
185 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
186 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
187 mdd.${MDT_DEV}.lfsck_namespace |
188 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
190 error "(9) unexpected status"
193 local repaired=$($SHOW_NAMESPACE |
194 awk '/^updated_phase1/ { print $2 }')
195 [ $repaired -eq 0 ] ||
196 error "(10) Expect nothing to be repaired, but got: $repaired"
198 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
199 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
200 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
201 mdd.${MDT_DEV}.lfsck_namespace |
202 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
204 error "(12) unexpected status"
207 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
208 [ $((scanned1 + 1)) -eq $scanned2 ] ||
209 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
211 echo "stopall, should NOT crash LU-3649"
212 stopall || error "(14) Fail to stopall"
214 run_test 0 "Control LFSCK manually"
219 #define OBD_FAIL_FID_INDIR 0x1501
220 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
221 touch $DIR/$tdir/dummy
223 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
225 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
226 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
227 mdd.${MDT_DEV}.lfsck_namespace |
228 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
230 error "(4) unexpected status"
233 local repaired=$($SHOW_NAMESPACE |
234 awk '/^dirent_repaired/ { print $2 }')
235 # for interop with old server
236 [ -z "$repaired" ] &&
237 repaired=$($SHOW_NAMESPACE |
238 awk '/^updated_phase1/ { print $2 }')
240 [ $repaired -eq 1 ] ||
241 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
245 mount_client $MOUNT || error "(6) Fail to start client!"
247 #define OBD_FAIL_FID_LOOKUP 0x1505
248 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
249 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
251 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
253 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
257 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
258 skip "OI Scrub not implemented for ZFS" && return
262 #define OBD_FAIL_FID_INLMA 0x1502
263 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
264 touch $DIR/$tdir/dummy
266 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
268 #define OBD_FAIL_FID_NOLMA 0x1506
269 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
270 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
271 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
272 mdd.${MDT_DEV}.lfsck_namespace |
273 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
275 error "(4) unexpected status"
278 local repaired=$($SHOW_NAMESPACE |
279 awk '/^dirent_repaired/ { print $2 }')
280 # for interop with old server
281 [ -z "$repaired" ] &&
282 repaired=$($SHOW_NAMESPACE |
283 awk '/^updated_phase1/ { print $2 }')
285 [ $repaired -eq 1 ] ||
286 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
288 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
291 mount_client $MOUNT || error "(6) Fail to start client!"
293 #define OBD_FAIL_FID_LOOKUP 0x1505
294 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
295 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
297 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
299 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
304 #define OBD_FAIL_FID_IGIF 0x1504
305 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
306 touch $DIR/$tdir/dummy
308 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
310 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
311 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
312 mdd.${MDT_DEV}.lfsck_namespace |
313 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
315 error "(4) unexpected status"
318 local repaired=$($SHOW_NAMESPACE |
319 awk '/^dirent_repaired/ { print $2 }')
320 # for interop with old server
321 [ -z "$repaired" ] &&
322 repaired=$($SHOW_NAMESPACE |
323 awk '/^updated_phase1/ { print $2 }')
325 [ $repaired -eq 1 ] ||
326 error "(5) Fail to repair lost FID-in-dirent: $repaired"
330 mount_client $MOUNT || error "(6) Fail to start client!"
332 #define OBD_FAIL_FID_LOOKUP 0x1505
333 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
334 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
336 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
338 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
343 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
344 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
345 touch $DIR/$tdir/dummy
347 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
349 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
350 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
351 mdd.${MDT_DEV}.lfsck_namespace |
352 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
354 error "(4) unexpected status"
357 local repaired=$($SHOW_NAMESPACE |
358 awk '/^linkea_repaired/ { print $2 }')
359 # for interop with old server
360 [ -z "$repaired" ] &&
361 repaired=$($SHOW_NAMESPACE |
362 awk '/^updated_phase2/ { print $2 }')
364 [ $repaired -eq 1 ] ||
365 error "(5) Fail to repair crashed linkEA: $repaired"
369 mount_client $MOUNT || error "(6) Fail to start client!"
371 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
372 error "(7) Fail to stat $DIR/$tdir/dummy"
374 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
375 local dummyname=$($LFS fid2path $DIR $dummyfid)
376 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
377 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
379 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
385 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
386 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
387 touch $DIR/$tdir/dummy
389 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
391 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
392 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
393 mdd.${MDT_DEV}.lfsck_namespace |
394 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
396 error "(4) unexpected status"
399 local repaired=$($SHOW_NAMESPACE |
400 awk '/^updated_phase2/ { print $2 }')
401 [ $repaired -eq 1 ] ||
402 error "(5) Fail to repair crashed linkEA: $repaired"
406 mount_client $MOUNT || error "(6) Fail to start client!"
408 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
409 error "(7) Fail to stat $DIR/$tdir/dummy"
411 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
412 local dummyname=$($LFS fid2path $DIR $dummyfid)
413 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
414 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
416 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
422 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
423 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
424 touch $DIR/$tdir/dummy
426 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
428 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
429 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
430 mdd.${MDT_DEV}.lfsck_namespace |
431 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
433 error "(4) unexpected status"
436 local repaired=$($SHOW_NAMESPACE |
437 awk '/^updated_phase2/ { print $2 }')
438 [ $repaired -eq 1 ] ||
439 error "(5) Fail to repair crashed linkEA: $repaired"
443 mount_client $MOUNT || error "(6) Fail to start client!"
445 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
446 error "(7) Fail to stat $DIR/$tdir/dummy"
448 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
449 local dummyname=$($LFS fid2path $DIR $dummyfid)
450 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
451 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
453 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
459 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
460 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
461 touch $DIR/$tdir/dummy
463 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
465 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
466 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
467 mdd.${MDT_DEV}.lfsck_namespace |
468 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
470 error "(4) unexpected status"
473 local repaired=$($SHOW_NAMESPACE |
474 awk '/^linkea_repaired/ { print $2 }')
475 [ $repaired -eq 1 ] ||
476 error "(5) Fail to repair crashed linkEA: $repaired"
480 mount_client $MOUNT || error "(6) Fail to start client!"
482 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
483 error "(7) Fail to stat $DIR/$tdir/dummy"
485 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
486 local dummyname=$($LFS fid2path $DIR $dummyfid)
487 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
488 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
490 run_test 2d "LFSCK can recover the missing linkEA entry"
494 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
498 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
500 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
501 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
502 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
503 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
505 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
507 wait_all_targets_blocked namespace completed 4
509 local repaired=$($SHOW_NAMESPACE |
510 awk '/^linkea_repaired/ { print $2 }')
511 [ $repaired -eq 1 ] ||
512 error "(5) Fail to repair crashed linkEA: $repaired"
514 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
515 local name=$($LFS fid2path $DIR $fid)
516 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
517 error "(6) Fail to repair linkEA: $fid $name"
519 run_test 2e "namespace LFSCK can verify remote object linkEA"
525 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
526 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
527 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
529 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
530 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
531 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
533 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
534 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
535 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
537 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
538 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
539 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
541 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
543 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
544 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
545 mdd.${MDT_DEV}.lfsck_namespace |
546 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
548 error "(10) unexpected status"
551 local checked=$($SHOW_NAMESPACE |
552 awk '/^checked_phase2/ { print $2 }')
553 [ $checked -ge 4 ] ||
554 error "(11) Fail to check multiple-linked object: $checked"
556 local repaired=$($SHOW_NAMESPACE |
557 awk '/^multiple_linked_repaired/ { print $2 }')
558 [ $repaired -ge 2 ] ||
559 error "(12) Fail to repair multiple-linked object: $repaired"
561 run_test 3 "LFSCK can verify multiple-linked objects"
565 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
566 skip "OI Scrub not implemented for ZFS" && return
569 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
570 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
572 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
573 echo "start $SINGLEMDS with disabling OI scrub"
574 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
575 error "(2) Fail to start MDS!"
577 #define OBD_FAIL_LFSCK_DELAY2 0x1601
578 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
579 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
580 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
581 mdd.${MDT_DEV}.lfsck_namespace |
582 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
584 error "(5) unexpected status"
587 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
588 [ "$STATUS" == "scanning-phase1" ] ||
589 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
591 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
592 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
593 mdd.${MDT_DEV}.lfsck_namespace |
594 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
596 error "(7) unexpected status"
599 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
600 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
602 local repaired=$($SHOW_NAMESPACE |
603 awk '/^dirent_repaired/ { print $2 }')
604 # for interop with old server
605 [ -z "$repaired" ] &&
606 repaired=$($SHOW_NAMESPACE |
607 awk '/^updated_phase1/ { print $2 }')
609 [ $repaired -ge 9 ] ||
610 error "(9) Fail to re-generate FID-in-dirent: $repaired"
614 mount_client $MOUNT || error "(10) Fail to start client!"
616 #define OBD_FAIL_FID_LOOKUP 0x1505
617 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
618 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
619 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
621 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
625 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
626 skip "OI Scrub not implemented for ZFS" && return
629 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
630 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
632 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
633 echo "start $SINGLEMDS with disabling OI scrub"
634 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
635 error "(2) Fail to start MDS!"
637 #define OBD_FAIL_LFSCK_DELAY2 0x1601
638 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
639 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
640 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
641 mdd.${MDT_DEV}.lfsck_namespace |
642 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
644 error "(5) unexpected status"
647 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
648 [ "$STATUS" == "scanning-phase1" ] ||
649 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
651 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
652 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
653 mdd.${MDT_DEV}.lfsck_namespace |
654 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
656 error "(7) unexpected status"
659 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
660 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
662 local repaired=$($SHOW_NAMESPACE |
663 awk '/^dirent_repaired/ { print $2 }')
664 # for interop with old server
665 [ -z "$repaired" ] &&
666 repaired=$($SHOW_NAMESPACE |
667 awk '/^updated_phase1/ { print $2 }')
669 [ $repaired -ge 2 ] ||
670 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
674 mount_client $MOUNT || error "(10) Fail to start client!"
676 #define OBD_FAIL_FID_LOOKUP 0x1505
677 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
678 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
680 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
682 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
683 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
684 local dummyname=$($LFS fid2path $DIR $dummyfid)
685 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
686 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
688 run_test 5 "LFSCK can handle IGIF object upgrading"
693 #define OBD_FAIL_LFSCK_DELAY1 0x1600
694 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
695 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
697 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
698 [ "$STATUS" == "scanning-phase1" ] ||
699 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
701 # Sleep 3 sec to guarantee at least one object is processed by LFSCK
703 # Fail the LFSCK to guarantee there is at least one checkpoint
704 #define OBD_FAIL_LFSCK_FATAL1 0x1608
705 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
706 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
707 mdd.${MDT_DEV}.lfsck_namespace |
708 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
710 error "(4) unexpected status"
713 local POS0=$($SHOW_NAMESPACE |
714 awk '/^last_checkpoint_position/ { print $2 }' |
717 #define OBD_FAIL_LFSCK_DELAY1 0x1600
718 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
719 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
721 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
722 [ "$STATUS" == "scanning-phase1" ] ||
723 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
725 local POS1=$($SHOW_NAMESPACE |
726 awk '/^latest_start_position/ { print $2 }' |
728 [[ $POS0 -lt $POS1 ]] ||
729 error "(7) Expect larger than: $POS0, but got $POS1"
731 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
732 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
733 mdd.${MDT_DEV}.lfsck_namespace |
734 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
736 error "(8) unexpected status"
739 run_test 6a "LFSCK resumes from last checkpoint (1)"
744 #define OBD_FAIL_LFSCK_DELAY2 0x1601
745 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
746 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
748 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
749 [ "$STATUS" == "scanning-phase1" ] ||
750 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
752 # Sleep 5 sec to guarantee that we are in the directory scanning
754 # Fail the LFSCK to guarantee there is at least one checkpoint
755 #define OBD_FAIL_LFSCK_FATAL2 0x1609
756 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
757 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
758 mdd.${MDT_DEV}.lfsck_namespace |
759 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
761 error "(4) unexpected status"
764 local O_POS0=$($SHOW_NAMESPACE |
765 awk '/^last_checkpoint_position/ { print $2 }' |
768 local D_POS0=$($SHOW_NAMESPACE |
769 awk '/^last_checkpoint_position/ { print $4 }')
771 #define OBD_FAIL_LFSCK_DELAY2 0x1601
772 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
773 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
775 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
776 [ "$STATUS" == "scanning-phase1" ] ||
777 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
779 local O_POS1=$($SHOW_NAMESPACE |
780 awk '/^latest_start_position/ { print $2 }' |
782 local D_POS1=$($SHOW_NAMESPACE |
783 awk '/^latest_start_position/ { print $4 }')
785 echo "Additional debug for 6b"
787 if [ "$D_POS0" == "N/A" -o "$D_POS0" == "0x0" \
788 -o "$D_POS1" == "0x0" -o "$D_POS1" == "N/A" ]; then
789 [[ $O_POS0 -lt $O_POS1 ]] ||
790 error "(7.1) $O_POS1 is not larger than $O_POS0"
792 [[ $D_POS0 -lt $D_POS1 ]] ||
793 error "(7.2) $D_POS1 is not larger than $D_POS0"
796 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
797 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
798 mdd.${MDT_DEV}.lfsck_namespace |
799 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
801 error "(8) unexpected status"
804 run_test 6b "LFSCK resumes from last checkpoint (2)"
811 #define OBD_FAIL_LFSCK_DELAY2 0x1601
812 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
813 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
815 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
816 [ "$STATUS" == "scanning-phase1" ] ||
817 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
819 # Sleep 3 sec to guarantee at least one object is processed by LFSCK
821 echo "stop $SINGLEMDS"
822 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
824 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
825 echo "start $SINGLEMDS"
826 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
827 error "(5) Fail to start MDS!"
829 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
830 mdd.${MDT_DEV}.lfsck_namespace |
831 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
833 error "(6) unexpected status"
836 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
842 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
843 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
844 for ((i = 0; i < 20; i++)); do
845 touch $DIR/$tdir/dummy${i}
848 #define OBD_FAIL_LFSCK_DELAY3 0x1602
849 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
850 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
851 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
852 mdd.${MDT_DEV}.lfsck_namespace |
853 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
855 error "(4) unexpected status"
859 echo "stop $SINGLEMDS"
860 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
862 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
863 echo "start $SINGLEMDS"
864 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
865 error "(6) Fail to start MDS!"
867 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
868 mdd.${MDT_DEV}.lfsck_namespace |
869 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
871 error "(7) unexpected status"
874 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
879 formatall > /dev/null
885 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
886 [ "$STATUS" == "init" ] ||
887 error "(2) Expect 'init', but got '$STATUS'"
889 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
890 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
891 mkdir $DIR/$tdir/crashed
893 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
894 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
895 for ((i = 0; i < 5; i++)); do
896 touch $DIR/$tdir/dummy${i}
899 umount_client $MOUNT || error "(3) Fail to stop client!"
901 #define OBD_FAIL_LFSCK_DELAY2 0x1601
902 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
903 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
905 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
906 [ "$STATUS" == "scanning-phase1" ] ||
907 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
909 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
911 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
912 [ "$STATUS" == "stopped" ] ||
913 error "(7) Expect 'stopped', but got '$STATUS'"
915 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
917 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
918 [ "$STATUS" == "scanning-phase1" ] ||
919 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
921 #define OBD_FAIL_LFSCK_FATAL2 0x1609
922 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
923 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
924 mdd.${MDT_DEV}.lfsck_namespace |
925 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
927 error "(10) unexpected status"
930 #define OBD_FAIL_LFSCK_DELAY1 0x1600
931 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
932 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
934 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
935 [ "$STATUS" == "scanning-phase1" ] ||
936 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
938 #define OBD_FAIL_LFSCK_CRASH 0x160a
939 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
942 echo "stop $SINGLEMDS"
943 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
945 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
946 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
948 echo "start $SINGLEMDS"
949 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
950 error "(14) Fail to start MDS!"
952 local timeout=$(max_recovery_time)
955 while [ $timer -lt $timeout ]; do
956 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
957 mdt.${MDT_DEV}.recovery_status |
958 awk '/^status/ { print \\\$2 }'")
959 [ "$STATUS" != "RECOVERING" ] && break;
964 [ $timer != $timeout ] ||
965 error "(14.1) recovery timeout"
967 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
968 [ "$STATUS" == "crashed" ] ||
969 error "(15) Expect 'crashed', but got '$STATUS'"
971 #define OBD_FAIL_LFSCK_DELAY2 0x1601
972 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
973 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
975 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
976 [ "$STATUS" == "scanning-phase1" ] ||
977 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
979 echo "stop $SINGLEMDS"
980 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
982 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
983 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
985 echo "start $SINGLEMDS"
986 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
987 error "(19) Fail to start MDS!"
990 while [ $timer -lt $timeout ]; do
991 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
992 mdt.${MDT_DEV}.recovery_status |
993 awk '/^status/ { print \\\$2 }'")
994 [ "$STATUS" != "RECOVERING" ] && break;
999 [ $timer != $timeout ] ||
1000 error "(19.1) recovery timeout"
1002 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1003 [ "$STATUS" == "paused" ] ||
1004 error "(20) Expect 'paused', but got '$STATUS'"
1006 echo "stop $SINGLEMDS"
1007 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1009 echo "start $SINGLEMDS without resume LFSCK"
1010 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
1011 error "(20.2) Fail to start MDS!"
1014 while [ $timer -lt $timeout ]; do
1015 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1016 mdt.${MDT_DEV}.recovery_status |
1017 awk '/^status/ { print \\\$2 }'")
1018 [ "$STATUS" != "RECOVERING" ] && break;
1020 timer=$((timer + 1))
1023 [ $timer != $timeout ] ||
1024 error "(20.3) recovery timeout"
1026 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1027 [ "$STATUS" == "paused" ] ||
1028 error "(20.4) Expect 'paused', but got '$STATUS'"
1030 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1031 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1033 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
1034 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1035 mdd.${MDT_DEV}.lfsck_namespace |
1036 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1038 error "(22) unexpected status"
1041 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1042 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1043 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1045 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1046 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1047 mdd.${MDT_DEV}.lfsck_namespace |
1048 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1050 error "(24) unexpected status"
1053 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1054 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1056 run_test 8 "LFSCK state machine"
# Body of the test registered below as "run_test 9a" (layout LFSCK speed
# control). Visible steps: create 5000 files, start the layout LFSCK with
# "-s $BASE_SPEED1", check average_speed_phase1 against an upper bound, raise
# lfsck_speed_limit to BASE_SPEED2, re-check against a lower and an upper
# bound, then set the limits to 0 and wait for status 'completed'.
# NOTE(review): the embedded line numbers have gaps; the function header,
# block closers and the definitions of RUN_TIME1, RUN_TIME2 and TIME_DIFF are
# not visible in this listing -- confirm against the full file.
# Skip when /proc/cpuinfo has no line matching "processor.*: 1".
1059 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1060 skip "Testing on UP system, the speed may be inaccurate."
# Populate a directory created with "lfs mkdir -i 0" with 5000 files.
1064 check_mount_and_prep
1065 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1066 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1067 createmany -o $DIR/$tdir/lfsck/f 5000
# First phase of the test: run with the lower speed setting.
1069 local BASE_SPEED1=100
1071 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# The scan is expected to still be in phase 1 when it is sampled.
1074 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1075 [ "$STATUS" == "scanning-phase1" ] ||
1076 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1078 local SPEED=$($SHOW_LAYOUT |
1079 awk '/^average_speed_phase1/ { print $2 }')
# Upper bound below: BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 * 13 / 10,
# evaluated in shell integer arithmetic.
1081 # There may be time error, normally it should be less than 2 seconds.
1082 # We allow another 20% schedule error.
1084 # MAX_MARGIN = 1.3 = 13 / 10
1085 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1086 RUN_TIME1 * 13 / 10))
1087 [ $SPEED -lt $MAX_SPEED ] || {
1089 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1090 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1093 # adjust speed limit
1094 local BASE_SPEED2=300
1096 do_facet $SINGLEMDS \
1097 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Lower bound: time-weighted mix of both speed settings, each run time reduced
# by TIME_DIFF, scaled by 7 / 10.
1100 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1101 # MIN_MARGIN = 0.7 = 7 / 10
1102 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1103 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1104 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# On a non-ldiskfs MDS the too-slow case is reported via error_ignore LU-5624.
# NOTE(review): the lines between (5.1) and (5.2) are elided; (5.2) is
# presumably the argument of an "error" call in the else branch -- confirm.
1105 [ $SPEED -gt $MIN_SPEED ] || {
1106 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1107 error_ignore LU-5624 \
1108 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1111 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
# Upper bound for the combined run: same mix with each run time increased by
# TIME_DIFF, scaled by 13 / 10.
1115 # MAX_MARGIN = 1.3 = 13 / 10
1116 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1117 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1118 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1119 [ $SPEED -lt $MAX_SPEED ] || {
1121 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1122 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1123 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT and OST node (presumably "no limit"
# -- TODO confirm) so the scan can finish, then wait for 'completed'.
1126 do_nodes $(comma_list $(mdts_nodes)) \
1127 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1128 do_nodes $(comma_list $(osts_nodes)) \
1129 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1131 wait_update_facet $SINGLEMDS \
1132 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1133 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1134 error "(7) Failed to get expected 'completed'"
# Register the test body above under id 9a.
1136 run_test 9a "LFSCK speed control (1)"
# Body of the test registered below as "run_test 9b" (namespace LFSCK speed
# control). Visible steps: create 50 directories with 50 files each while
# fail_loc=0x1604 is set, run a namespace LFSCK with fail_loc=0x160c and wait
# for status 'stopped', then restart it with "-s $BASE_SPEED1" and check
# average_speed_phase2 before and after raising the limit to BASE_SPEED2.
# NOTE(review): the embedded line numbers have gaps; the function header,
# block closers and the definitions of RUN_TIME1, RUN_TIME2 and TIME_DIFF are
# not visible in this listing -- confirm against the full file.
# Skip when /proc/cpuinfo has no line matching "processor.*: 1".
1139 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1140 skip "Testing on UP system, the speed may be inaccurate."
# Create the files with the LINKEA_MORE fail_loc active on the MDS.
1146 echo "Preparing another 50 * 50 files (with error) at $(date)."
1147 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1148 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1149 createmany -d $DIR/$tdir/d 50
1150 createmany -m $DIR/$tdir/f 50
1151 for ((i = 0; i < 50; i++)); do
1152 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# First run: with the NO_DOUBLESCAN fail_loc set, the scan is expected to end
# in status 'stopped'.
1155 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1156 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1157 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1158 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1159 mdd.${MDT_DEV}.lfsck_namespace |
1160 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1162 error "(5) unexpected status"
1165 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1166 echo "Prepared at $(date)."
# Second run (no -r): started with the lower speed setting and expected to be
# in phase 2 when sampled.
1168 local BASE_SPEED1=50
1170 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1173 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1174 [ "$STATUS" == "scanning-phase2" ] ||
1175 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1177 local SPEED=$($SHOW_NAMESPACE |
1178 awk '/^average_speed_phase2/ { print $2 }')
# Upper bound below: BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 * 13 / 10,
# evaluated in shell integer arithmetic.
1179 # There may be time error, normally it should be less than 2 seconds.
1180 # We allow another 20% schedule error.
1182 # MAX_MARGIN = 1.3 = 13 / 10
1183 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1184 RUN_TIME1 * 13 / 10))
1185 [ $SPEED -lt $MAX_SPEED ] || {
1187 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1188 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1191 # adjust speed limit
1192 local BASE_SPEED2=150
1194 do_facet $SINGLEMDS \
1195 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Lower bound: time-weighted mix of both speed settings, each run time reduced
# by TIME_DIFF, scaled by 7 / 10.
1198 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1199 # MIN_MARGIN = 0.7 = 7 / 10
1200 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1201 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1202 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# On a non-ldiskfs MDS the too-slow case is reported via error_ignore LU-5624.
# NOTE(review): the lines between (9.1) and (9.2) are elided; (9.2) is
# presumably the argument of an "error" call in the else branch -- confirm.
1203 [ $SPEED -gt $MIN_SPEED ] || {
1204 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1205 error_ignore LU-5624 \
1206 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1209 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
# Upper bound for the combined run: same mix with each run time increased by
# TIME_DIFF, scaled by 13 / 10.
1213 # MAX_MARGIN = 1.3 = 13 / 10
1214 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1215 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1216 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1217 [ $SPEED -lt $MAX_SPEED ] || {
1219 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1220 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1221 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT and OST node (presumably "no limit"
# -- TODO confirm), then wait for the namespace LFSCK to report 'completed'.
1224 do_nodes $(comma_list $(mdts_nodes)) \
1225 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1226 do_nodes $(comma_list $(osts_nodes)) \
1227 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1229 mdd.${MDT_DEV}.lfsck_namespace |
1230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1232 error "(11) unexpected status"
# Register the test body above under id 9b.
1235 run_test 9b "LFSCK speed control (2)"
# Body of the test registered below as "run_test 10". Visible steps: create
# 1000 directory/file pairs (even indices with fail_loc=0x1603, odd indices
# with fail_loc=0x1604), remount the client, start a namespace LFSCK with
# "-r -s 100", run ordinary file operations while the status stays
# 'scanning-phase1', then set the speed limits to 0 and wait for 'completed'.
# NOTE(review): the embedded line numbers have gaps; the function header and
# loop/block closers are not visible in this listing.
# Only runs when the MDS backend is ldiskfs.
1239 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1240 skip "lookup(..)/linkea on ZFS issue" && return
1244 echo "Preparing more files with error at $(date)."
1245 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1246 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
# Even indices 0, 2, ..., 998: one directory, one file and five files inside
# the directory per index.
1248 for ((i = 0; i < 1000; i = $((i+2)))); do
1249 mkdir -p $DIR/$tdir/d${i}
1250 touch $DIR/$tdir/f${i}
1251 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1254 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1255 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
# Odd indices 1, 3, ..., 999: same layout, created under the other fail_loc.
1257 for ((i = 1; i < 1000; i = $((i+2)))); do
1258 mkdir -p $DIR/$tdir/d${i}
1259 touch $DIR/$tdir/f${i}
1260 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1263 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1264 echo "Prepared at $(date)."
# Extra hard link to f200; f200 itself is unlinked further down.
1266 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1268 umount_client $MOUNT
1269 mount_client $MOUNT || error "(3) Fail to start client!"
1271 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1274 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1275 [ "$STATUS" == "scanning-phase1" ] ||
1276 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# File-system operations issued while the scan is running: recursive listing,
# create, mkdir, unlink, recursive remove, rename, hard link and symlink.
1278 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1280 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1282 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1284 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1286 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1288 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1290 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1292 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1293 error "(14) Fail to softlink!"
# The scan is expected to still be in phase 1 after those operations.
1295 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1296 [ "$STATUS" == "scanning-phase1" ] ||
1297 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 on all MDT/OST nodes and wait for completion.
1299 do_nodes $(comma_list $(mdts_nodes)) \
1300 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1301 do_nodes $(comma_list $(osts_nodes)) \
1302 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1303 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1304 mdd.${MDT_DEV}.lfsck_namespace |
1305 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1307 error "(16) unexpected status"
# Register the test body above under id 10.
1310 run_test 10 "System is available during LFSCK scanning"
# Remove the LAST_ID file (and the d0/0 entry) under O/<idx> on the backing
# file system of OST facet ost<ost>.
# Returns 1 if mount_fstype fails and 2 if unmount_fstype fails.
# NOTE(review): the lines assigning "ost" and "idx" are not visible in this
# listing; given the call "ost_remove_lastid 1 0" they are presumably the
# first and second positional arguments -- confirm against the full file.
1313 ost_remove_lastid() {
# Command prefix used to run the removal on the OST facet.
1316 local rcmd="do_facet ost${ost}"
1318 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1320 # step 1: local mount
1321 mount_fstype ost${ost} || return 1
1322 # step 2: remove the specified LAST_ID
1323 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
# Unmount the local mount made in step 1.
1325 unmount_fstype ost${ost} || return 2
# Body of the test registered below as "run_test 11a". Visible steps: create
# 64 files striped on OST index 0, remove LAST_ID on ost1 with
# ost_remove_lastid, start ost1 again, run a layout LFSCK on the OST with
# fail_loc=0x160e and wait for the 'crashed_lastid' flag, clear the fail_loc,
# wait for 'completed' and verify that the flags field is empty.
# NOTE(review): the embedded line numbers have gaps (1331 -> 1336); ost1 is
# presumably stopped in the elided lines before LAST_ID is removed -- confirm.
1329 check_mount_and_prep
1330 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1331 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Arguments: OST facet number 1, index directory 0.
1336 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1338 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1339 error "(2) Fail to start ost1"
1341 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1342 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1344 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1345 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# While the fail_loc is set, the OST-side flags should read 'crashed_lastid'.
1347 wait_update_facet ost1 "$LCTL get_param -n \
1348 obdfilter.${OST_DEV}.lfsck_layout |
1349 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1351 error "(5) unexpected status"
# Clear the fail_loc/fail_val and wait for the scan to complete.
1354 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1356 wait_update_facet ost1 "$LCTL get_param -n \
1357 obdfilter.${OST_DEV}.lfsck_layout |
1358 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1360 error "(6) unexpected status"
1363 echo "the LAST_ID(s) should have been rebuilt"
1364 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1365 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
# Register the test body above under id 11a.
1367 run_test 11a "LFSCK can rebuild lost last_id"
# Body of the test registered below as "run_test 11b". Visible steps: with
# fail_loc=0x160d set on ost1, create (precreated count + 32) files and record
# the in-memory last_id for the MDT's sequence (lastid1); restart ost1 with
# fail_loc=0x215 and read last_id again (lastid2), expecting it to be smaller;
# run a layout LFSCK on the OST, restart ost1 once more and wait until last_id
# equals lastid1.
# NOTE(review): the embedded line numbers have gaps; the function header and
# loop/block closers are not visible in this listing.
1370 check_mount_and_prep
1371 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1373 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1374 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1375 do_facet ost1 $LCTL set_param fail_loc=0x160d
1377 local count=$(precreated_ost_obj_count 0 0)
# Create 32 more files than the precreated object count reported above.
1379 createmany -o $DIR/$tdir/f $((count + 32))
# seq: value of prealloc_last_seq for the OST0000 osp device on mds1.
# lastid1: second ':'-separated field of the last_id line that matches seq.
1381 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1382 local seq=$(do_facet mds1 $LCTL get_param -n \
1383 osp.${proc_path}.prealloc_last_seq)
1384 local lastid1=$(do_facet ost1 "lctl get_param -n \
1385 obdfilter.${ost1_svc}.last_id" | grep $seq |
1386 awk -F: '{ print $2 }')
1388 umount_client $MOUNT
1389 stop ost1 || error "(1) Fail to stop ost1"
1391 #define OBD_FAIL_OST_ENOSPC 0x215
1392 do_facet ost1 $LCTL set_param fail_loc=0x215
1394 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1395 error "(2) Fail to start ost1"
# Poll up to 60 iterations until last_id for seq is readable after the restart.
# NOTE(review): no "local" declaration for lastid2 is visible, and $lastid2 is
# unquoted in the test below; with an empty value "[ ! -z ]" evaluates false,
# so the loop keeps polling.
1397 for ((i = 0; i < 60; i++)); do
1398 lastid2=$(do_facet ost1 "lctl get_param -n \
1399 obdfilter.${ost1_svc}.last_id" | grep $seq |
1400 awk -F: '{ print $2 }')
1401 [ ! -z $lastid2 ] && break;
1405 echo "the on-disk LAST_ID should be smaller than the expected one"
1406 [ $lastid1 -gt $lastid2 ] ||
1407 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1409 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1410 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1412 wait_update_facet ost1 "$LCTL get_param -n \
1413 obdfilter.${OST_DEV}.lfsck_layout |
1414 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1416 error "(6) unexpected status"
# Restart ost1 so that last_id is read again after the LFSCK run.
1419 stop ost1 || error "(7) Fail to stop ost1"
1421 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1422 error "(8) Fail to start ost1"
# On mismatch, print the raw last_id parameter before reporting the error.
1424 echo "the on-disk LAST_ID should have been rebuilt"
1425 wait_update_facet ost1 "$LCTL get_param -n \
1426 obdfilter.${ost1_svc}.last_id | grep $seq |
1427 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1428 do_facet ost1 $LCTL get_param -n \
1429 obdfilter.${ost1_svc}.last_id
1430 error "(9) expect lastid1 $seq:$lastid1"
1433 do_facet ost1 $LCTL set_param fail_loc=0
1434 stopall || error "(10) Fail to stopall"
# Register the test body above under id 11b.
1436 run_test 11b "LFSCK can rebuild crashed last_id"
# Body of the test registered below as "run_test 12a". For the namespace and
# then the layout component it starts LFSCK on all targets with one
# "lfsck_start -A" command at "-s 1", checks 'scanning-phase1', stops all with
# one "lfsck_stop -A", checks 'stopped', restarts at "-s 0" and checks
# 'completed'. Requires at least two MDTs.
# NOTE(review): the embedded line numbers have gaps; the function header and
# loop closers are not visible in this listing.
1439 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# One directory per MDT (created with MDT index k - 1), each with 100 files.
1441 check_mount_and_prep
1442 for k in $(seq $MDSCOUNT); do
1443 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1444 createmany -o $DIR/$tdir/${k}/f 100 ||
1445 error "(0) Fail to create 100 files."
# --- namespace component ---
# The trailing number passed to the wait_all_targets* helpers matches the
# sequence of "(N)" error tags used in this test.
1448 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1449 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1450 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1452 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1453 wait_all_targets namespace scanning-phase1 3
1455 echo "Stop namespace LFSCK on all targets by single lctl command."
1456 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1457 error "(4) Fail to stop LFSCK on all devices!"
1459 echo "All the LFSCK targets should be in 'stopped' status."
1460 wait_all_targets_blocked namespace stopped 5
1462 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1463 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1464 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1466 echo "All the LFSCK targets should be in 'completed' status."
1467 wait_all_targets_blocked namespace completed 7
# --- layout component (run with full debug logging enabled) ---
1469 start_full_debug_logging
1471 echo "Start layout LFSCK on all targets by single command (-s 1)."
1472 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1473 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1475 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1476 wait_all_targets layout scanning-phase1 9
1478 echo "Stop layout LFSCK on all targets by single lctl command."
1479 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1480 error "(10) Fail to stop LFSCK on all devices!"
1482 echo "All the LFSCK targets should be in 'stopped' status."
1483 wait_all_targets_blocked layout stopped 11
# Additionally check the layout LFSCK status on every OST directly.
1485 for k in $(seq $OSTCOUNT); do
1486 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1487 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1488 awk '/^status/ { print $2 }')
1489 [ "$STATUS" == "stopped" ] ||
1490 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1493 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1494 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1495 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1497 echo "All the LFSCK targets should be in 'completed' status."
1498 wait_all_targets_blocked layout completed 14
1500 stop_full_debug_logging
# Register the test body above under id 12a.
1502 run_test 12a "single command to trigger LFSCK on all devices"
# Body of the test registered below as "run_test 12b". Starts LFSCK on mds1
# with "-A -r" but without "-M" and waits for both components to complete.
# If mds1 lists more than one device whose third "lctl dl" column matches
# "mdt", it also checks that "lfsck_start -t layout -r" without -M/-A fails.
# NOTE(review): the function header and the closing "fi" are not visible in
# this listing.
1505 check_mount_and_prep
1507 echo "Start LFSCK without '-M' specified."
1508 do_facet mds1 $LCTL lfsck_start -A -r ||
1509 error "(0) Fail to start LFSCK without '-M'"
1511 wait_all_targets_blocked namespace completed 1
1512 wait_all_targets_blocked layout completed 2
# Count the "lctl dl" entries on mds1 whose third column contains "mdt".
1514 local count=$(do_facet mds1 $LCTL dl |
1515 awk '{ print $3 }' | grep mdt | wc -l)
1516 if [ $count -gt 1 ]; then
1518 echo "Start layout LFSCK on the node with multipe targets,"
1519 echo "but not specify '-M'/'-A' option. Should get failure."
# A successful start is reported as an error; "|| true" keeps the statement's
# exit status zero when the start fails as expected.
1521 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1522 error "(3) Start layout LFSCK should fail" || true
# Register the test body above under id 12b.
1525 run_test 12b "auto detect Lustre device"
# Body of the test registered below as "run_test 13". With fail_loc=0x160f
# set on the MDS it creates one plain file and one PFL (composite-layout)
# file, then runs a layout LFSCK and expects repaired_others to equal 2.
# NOTE(review): the function header and block closers are not visible in this
# listing.
1529 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1530 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1531 echo "MDT-object FID."
1534 check_mount_and_prep
1536 echo "Inject failure stub to simulate bad lmm_oi"
1537 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1538 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
# Two files are created while the fail_loc is active: f0 (via createmany) and
# f1 (via setstripe with two -E components).
1539 createmany -o $DIR/$tdir/f 1
1540 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1541 error "(0) Fail to create PFL $DIR/$tdir/f1"
1542 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1544 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1545 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1547 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1548 mdd.${MDT_DEV}.lfsck_layout |
1549 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1551 error "(2) unexpected status"
# Exactly two repairs are expected, matching the two files created above.
1554 local repaired=$($SHOW_LAYOUT |
1555 awk '/^repaired_others/ { print $2 }')
1556 [ $repaired -eq 2 ] ||
1557 error "(3) Fail to repair crashed lmm_oi: $repaired"
# Register the test body above under id 13.
1559 run_test 13 "LFSCK can repair crashed lmm_oi"
# Body of the test registered below as "run_test 14a". With fail_loc=0x1610
# set on ost1 it creates plain and composite-layout files plus two guard
# files, then: (a) runs layout LFSCK with -r and expects >= 32 dangling
# references counted while stat on the guards still fails; (b) runs it again
# with "-r -c" and expects the guards to become stat-able with size 0.
# NOTE(review): the embedded line numbers have gaps; the function header,
# loop/block closers and the lines between 1655 and 1658 are not visible.
1563 echo "The OST-object referenced by the MDT-object should be there;"
1564 echo "otherwise, the LFSCK should re-create the missing OST-object."
1565 echo "without '--delay-create-ostobj' option."
1568 check_mount_and_prep
1569 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1571 echo "Inject failure stub to simulate dangling referenced MDT-object"
1572 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1573 do_facet ost1 $LCTL set_param fail_loc=0x1610
1574 local count=$(precreated_ost_obj_count 0 0)
# Create 16 more files than the precreated object count, then guard0.
1576 createmany -o $DIR/$tdir/f $((count + 16)) ||
1577 error "(0.1) Fail to create $DIR/$tdir/fx"
1578 touch $DIR/$tdir/guard0
# Sixteen composite-layout files, followed by guard1.
1580 for ((i = 0; i < 16; i++)); do
1581 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1582 $DIR/$tdir/f_comp${i} ||
1583 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1585 touch $DIR/$tdir/guard1
1587 do_facet ost1 $LCTL set_param fail_loc=0
1589 start_full_debug_logging
1591 # exhaust other pre-created dangling cases
1592 count=$(precreated_ost_obj_count 0 0)
1593 createmany -o $DIR/$tdir/a $count ||
1594 error "(0.5) Fail to create $count files."
1596 echo "'ls' should fail because of dangling referenced MDT-object"
1597 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: started with -r only.
1599 echo "Trigger layout LFSCK to find out dangling reference"
1600 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1602 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1603 mdd.${MDT_DEV}.lfsck_layout |
1604 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1606 error "(3) unexpected status"
1609 local repaired=$($SHOW_LAYOUT |
1610 awk '/^repaired_dangling/ { print $2 }')
1611 [ $repaired -ge 32 ] ||
1612 error "(4) Fail to repair dangling reference: $repaired"
1614 echo "'stat' should fail because of not repair dangling by default"
1615 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1616 error "(5.1) stat should fail"
1617 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1618 error "(5.2) stat should fail"
# Second pass: started with the additional -c option.
1620 echo "Trigger layout LFSCK to repair dangling reference"
1621 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1623 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1624 mdd.${MDT_DEV}.lfsck_layout |
1625 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1627 error "(7) unexpected status"
1630 # There may be some async LFSCK updates in processing, wait for
1631 # a while until the target reparation has been done. LU-4970.
# Poll stat on each guard file until the reported Size is 0; on failure print
# the stat output before raising the error.
1633 echo "'stat' should success after layout LFSCK repairing"
1634 wait_update_facet client "stat $DIR/$tdir/guard0 |
1635 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1636 stat $DIR/$tdir/guard0
1638 error "(8.1) unexpected size"
1641 wait_update_facet client "stat $DIR/$tdir/guard1 |
1642 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1643 stat $DIR/$tdir/guard1
1645 error "(8.2) unexpected size"
1648 repaired=$($SHOW_LAYOUT |
1649 awk '/^repaired_dangling/ { print $2 }')
1650 [ $repaired -ge 32 ] ||
1651 error "(9) Fail to repair dangling reference: $repaired"
1653 stop_full_debug_logging
# NOTE(review): the stop step announced here falls in the elided lines; only
# the subsequent setupall is visible.
1655 echo "stopall to cleanup object cache"
1658 setupall > /dev/null
# Register the test body above under id 14a.
1660 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Body of the test registered below as "run_test 14b". Same scenario as the
# dangling-reference test registered as 14a, but LFSCK is started with the
# extra "-o -d" options and completion is awaited via
# wait_all_targets_blocked; a single guard file is used.
# NOTE(review): the embedded line numbers have gaps; the function header,
# block closers and the lines between 1727 and 1730 are not visible.
1664 echo "The OST-object referenced by the MDT-object should be there;"
1665 echo "otherwise, the LFSCK should re-create the missing OST-object."
1666 echo "with '--delay-create-ostobj' option."
1669 check_mount_and_prep
1670 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1672 echo "Inject failure stub to simulate dangling referenced MDT-object"
1673 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1674 do_facet ost1 $LCTL set_param fail_loc=0x1610
1675 local count=$(precreated_ost_obj_count 0 0)
# Create 31 more files than the precreated object count, then the guard file.
1677 createmany -o $DIR/$tdir/f $((count + 31))
1678 touch $DIR/$tdir/guard
1679 do_facet ost1 $LCTL set_param fail_loc=0
1681 start_full_debug_logging
1683 # exhaust other pre-created dangling cases
1684 count=$(precreated_ost_obj_count 0 0)
1685 createmany -o $DIR/$tdir/a $count ||
1686 error "(0) Fail to create $count files."
1688 echo "'ls' should fail because of dangling referenced MDT-object"
1689 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: "-r -o -d".
1691 echo "Trigger layout LFSCK to find out dangling reference"
1692 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1694 wait_all_targets_blocked layout completed 3
1696 local repaired=$($SHOW_LAYOUT |
1697 awk '/^repaired_dangling/ { print $2 }')
1698 [ $repaired -ge 32 ] ||
1699 error "(4) Fail to repair dangling reference: $repaired"
1701 echo "'stat' should fail because of not repair dangling by default"
1702 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second pass: same options plus -c.
1704 echo "Trigger layout LFSCK to repair dangling reference"
1705 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1707 wait_all_targets_blocked layout completed 7
1709 # There may be some async LFSCK updates in processing, wait for
1710 # a while until the target reparation has been done. LU-4970.
# Poll stat on the guard file until the reported Size is 0; on failure print
# the stat output before raising the error.
1712 echo "'stat' should success after layout LFSCK repairing"
1713 wait_update_facet client "stat $DIR/$tdir/guard |
1714 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1715 stat $DIR/$tdir/guard
1717 error "(8) unexpected size"
1720 repaired=$($SHOW_LAYOUT |
1721 awk '/^repaired_dangling/ { print $2 }')
1722 [ $repaired -ge 32 ] ||
1723 error "(9) Fail to repair dangling reference: $repaired"
1725 stop_full_debug_logging
# NOTE(review): the stop step announced here falls in the elided lines; only
# the subsequent setupall is visible.
1727 echo "stopall to cleanup object cache"
1730 setupall > /dev/null
# Register the test body above under id 14b.
1732 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Body of the test registered below as "run_test 15a". With fail_loc=0x1611
# set on all OST nodes it writes a plain file (f0) and a composite-layout
# file (f1), then runs a layout LFSCK and expects repaired_unmatched_pair to
# be at least 3.
# NOTE(review): the embedded line numbers have gaps; the function header and
# block closers are not visible in this listing.
1736 echo "If the OST-object referenced by the MDT-object back points"
1737 echo "to some non-exist MDT-object, then the LFSCK should repair"
1738 echo "the OST-object to back point to the right MDT-object."
1741 check_mount_and_prep
1742 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1744 echo "Inject failure stub to make the OST-object to back point to"
1745 echo "non-exist MDT-object."
1746 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1748 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1749 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
# NOTE(review): the continuation line numbered 1751 is elided; it presumably
# carries the target path ($DIR/$tdir/f1, per the error text) and the "||"
# that precedes the error call -- confirm against the full file.
1750 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1752 error "(0) Fail to create PFL $DIR/$tdir/f1"
1753 # 'dd' will trigger punch RPC firstly on every OST-objects.
1754 # So even though some OST-object will not be write by 'dd',
1755 # as long as it is allocated (may be NOT allocated in pfl_3b)
1756 # its layout information will be set also.
# 257 blocks of 4K = 1M + 4K, i.e. just past the 1M end of the first component.
1757 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1758 cancel_lru_locks osc
1759 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1761 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1762 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1764 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1765 mdd.${MDT_DEV}.lfsck_layout |
1766 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1768 error "(2) unexpected status"
1771 local repaired=$($SHOW_LAYOUT |
1772 awk '/^repaired_unmatched_pair/ { print $2 }')
1773 [ $repaired -ge 3 ] ||
1774 error "(3) Fail to repair unmatched pair: $repaired"
# Register the test body above under id 15a.
1776 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Body of the test registered below as "run_test 15b". Writes a guard file
# without fault injection, then with fail_loc=0x1612 set on all OST nodes
# writes a plain file and a composite-layout file, runs a layout LFSCK and
# expects repaired_unmatched_pair to equal 4.
# NOTE(review): the embedded line numbers have gaps; the function header,
# block closers and the initial assignment of "stripes" are not visible.
1780 echo "If the OST-object referenced by the MDT-object back points"
1781 echo "to other MDT-object that doesn't recognize the OST-object,"
1782 echo "then the LFSCK should repair it to back point to the right"
1783 echo "MDT-object (the first one)."
1786 check_mount_and_prep
1787 mkdir -p $DIR/$tdir/0
1788 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1789 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1790 cancel_lru_locks osc
1792 echo "Inject failure stub to make the OST-object to back point to"
1793 echo "other MDT-object"
# Use two stripes for the first component when at least two OSTs exist.
1796 [ $OSTCOUNT -ge 2 ] && stripes=2
1798 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1799 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1800 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
# NOTE(review): the continuation line numbered 1802 is elided; it presumably
# carries the target path ($DIR/$tdir/f1, per the error text) and the "||"
# that precedes the error call -- confirm against the full file.
1801 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1803 error "(0) Fail to create PFL $DIR/$tdir/f1"
1804 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1805 cancel_lru_locks osc
1806 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1808 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1809 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1811 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1812 mdd.${MDT_DEV}.lfsck_layout |
1813 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1815 error "(2) unexpected status"
1818 local repaired=$($SHOW_LAYOUT |
1819 awk '/^repaired_unmatched_pair/ { print $2 }')
1820 [ $repaired -eq 4 ] ||
1821 error "(3) Fail to repair unmatched pair: $repaired"
# Register the test body above under id 15b.
1823 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Body of the test registered below as "run_test 15c". Starts a delayed
# directory migration in the background (fail_loc=0x1803 with fail_val=5 on
# mds2) and races a layout LFSCK on all targets against it; afterwards it
# expects repaired_unmatched_pair == 1 and repaired_multiple_referenced == 0.
# Skipped with fewer than two MDTs, and when the MDS version is >= 2.7.55.
# NOTE(review): the embedded line numbers have gaps; the function header and
# any wait/sleep statements around the background migrate are not visible.
1826 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1828 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1829 skip "Skip the test after 2.7.55 see LU-6437" && return
1832 echo "According to current metadata migration implementation,"
1833 echo "before the old MDT-object is removed, both the new MDT-object"
1834 echo "and old MDT-object will reference the same LOV layout. Then if"
1835 echo "the layout LFSCK finds the new MDT-object by race, it will"
1836 echo "regard related OST-object(s) as multiple referenced case, and"
1837 echo "will try to create new OST-object(s) for the new MDT-object."
1838 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1839 echo "MDT-object before confirm the multiple referenced case."
# Directory a1 is created with MDT index 1 and holds one 1M file.
1842 check_mount_and_prep
1843 $LFS mkdir -i 1 $DIR/$tdir/a1
1844 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1845 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1846 cancel_lru_locks osc
1848 echo "Inject failure stub on MDT1 to delay the migration"
1850 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1851 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# The migrate command runs in the background so LFSCK can start meanwhile.
1852 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1853 $LFS migrate -m 0 $DIR/$tdir/a1 &
1856 echo "Trigger layout LFSCK to race with the migration"
1857 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1859 wait_all_targets_blocked layout completed 2
1861 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1862 local repaired=$($SHOW_LAYOUT |
1863 awk '/^repaired_unmatched_pair/ { print $2 }')
1864 [ $repaired -eq 1 ] ||
1865 error "(3) Fail to repair unmatched pair: $repaired"
# No multiple-referenced repair may have been performed.
1867 repaired=$($SHOW_LAYOUT |
1868 awk '/^repaired_multiple_referenced/ { print $2 }')
1869 [ $repaired -eq 0 ] ||
1870 error "(4) Unexpectedly repaird multiple references: $repaired"
# Register the test body above under id 15c.
1872 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Body of the test registered below as "run_test 16". Changes the owner of f0
# while fail_loc=0x1613 is set on the MDS, then runs a layout LFSCK and
# expects repaired_inconsistent_owner to equal 1.
# NOTE(review): the embedded line numbers have gaps; the function header,
# block closers and the creation of $DIR/$tdir/d1 are not visible.
1876 echo "If the OST-object's owner information does not match the owner"
1877 echo "information stored in the MDT-object, then the LFSCK trust the"
1878 echo "MDT-object and update the OST-object's owner information."
1881 check_mount_and_prep
1882 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1883 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1884 cancel_lru_locks osc
# 100 files created as the RUNAS user and never written to afterwards.
1886 # created but no setattr or write to the file.
1888 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
1889 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
1891 echo "Inject failure stub to skip OST-object owner changing"
1892 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1893 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
# This chown is issued while the fail_loc is active.
1894 chown 1.1 $DIR/$tdir/f0
1895 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1897 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1900 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1902 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1903 mdd.${MDT_DEV}.lfsck_layout |
1904 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1906 error "(2) unexpected status"
# Exactly one repair is expected.
1909 local repaired=$($SHOW_LAYOUT |
1910 awk '/^repaired_inconsistent_owner/ { print $2 }')
1911 [ $repaired -eq 1 ] ||
1912 error "(3) Fail to repair inconsistent owner: $repaired"
# Register the test body above under id 16.
1914 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Body of the test registered below as "run_test 17". With fail_loc=0x1614
# set on the MDS it writes a 1M guard file and creates f0 and f1, which then
# report the guard's size (1048576). After a layout LFSCK with
# repaired_multiple_referenced == 2, writing 2M to f0 and f1 must leave the
# guard's size unchanged.
# NOTE(review): the embedded line numbers have gaps; the function header,
# block closers and some message lines are not visible in this listing.
1918 echo "If more than one MDT-objects reference the same OST-object,"
1919 echo "and the OST-object only recognizes one MDT-object, then the"
1920 echo "LFSCK should create new OST-objects for such non-recognized"
1924 check_mount_and_prep
1925 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1927 echo "Inject failure stub to make two MDT-objects to refernce"
1928 echo "the OST-object"
1930 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1931 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1932 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1933 cancel_lru_locks mdc
1934 cancel_lru_locks osc
1936 createmany -o $DIR/$tdir/f 1
1937 cancel_lru_locks mdc
1938 cancel_lru_locks osc
# NOTE(review): the continuation line numbered 1941 is elided; it presumably
# carries the target path ($DIR/$tdir/f1, per the error text) and the "||"
# that precedes the error call -- confirm against the full file.
1940 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
1942 error "(0) Fail to create PFL $DIR/$tdir/f1"
1943 cancel_lru_locks mdc
1944 cancel_lru_locks osc
1945 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Before repair both f0 and f1 are expected to report the guard's 1M size,
# even though nothing was written to them here.
1947 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1948 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
1949 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1950 [ $size -eq 1048576 ] ||
1951 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
1953 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
1954 [ $size -eq 1048576 ] ||
1955 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
1957 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1960 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1962 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1963 mdd.${MDT_DEV}.lfsck_layout |
1964 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1966 error "(3) unexpected status"
# Two repairs are expected, one each for f0 and f1.
1969 local repaired=$($SHOW_LAYOUT |
1970 awk '/^repaired_multiple_referenced/ { print $2 }')
1971 [ $repaired -eq 2 ] ||
1972 error "(4) Fail to repair multiple references: $repaired"
# After repair, growing f0 to 2M must not change the guard's size.
1974 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1975 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1976 error "(5) Fail to write f0."
1977 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1978 [ $size -eq 1048576 ] ||
1979 error "(6) guard size should be 1048576, but got $size"
# Same check for f1.
1981 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
1982 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
1983 error "(7) Fail to write f1."
1984 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1985 [ $size -eq 1048576 ] ||
1986 error "(8) guard size should be 1048576, but got $size"
# Register the test body above under id 17.
1988 run_test 17 "LFSCK can repair multiple references"
# Top-level statement between tests: sets the "debug" parameter to "+cache"
# (presumably adding the cache flag to the current debug mask -- TODO confirm)
# and discards the command's standard output.
1990 $LCTL set_param debug=+cache > /dev/null
# Test 18a: the MDT-objects are kept but their layout EA is lost or partly
# lost (fail_loc 0x1615 armed while chown-ing the files). A layout LFSCK
# started with "-r -o" must report repaired orphans and the file sizes must
# match the values saved before the damage.
# NOTE(review): the embedded line numbers skip values and no function
# header, "fi", "done" or closing brace is visible in this span; those
# lines are presumably missing from this listing -- confirm against the
# original script before running.
1994 echo "The target MDT-object is there, but related stripe information"
1995 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1996 echo "layout EA entries."
# Set-up: a1/ is created with "mkdir -i 0" and striping "-c 1 -i 0"; f1 is
# written with 2 x 1M blocks.
1999 check_mount_and_prep
2000 $LFS mkdir -i 0 $DIR/$tdir/a1
2001 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2002 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# "ls -il" prints the inode number first, so the size is column 6.
2004 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
# Print FID and layout for the test log.
2006 $LFS path2fid $DIR/$tdir/a1/f1
2007 $LFS getstripe $DIR/$tdir/a1/f1
# With at least two MDTs, also create a2/f2 ("mkdir -i 1", two stripes).
2009 if [ $MDSCOUNT -ge 2 ]; then
2010 $LFS mkdir -i 1 $DIR/$tdir/a2
2011 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2012 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2013 $LFS path2fid $DIR/$tdir/a2/f2
2014 $LFS getstripe $DIR/$tdir/a2/f2
# f3 uses a two-component PFL layout ([0,1M) and [1M,EOF)).
2017 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2018 error "(0) Fail to create PFL $DIR/$tdir/f3"
2020 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2022 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2024 $LFS path2fid $DIR/$tdir/f3
2025 $LFS getstripe $DIR/$tdir/f3
2027 cancel_lru_locks osc
# Fault injection: arm fail_loc 0x1615 on the MDS, then chown each file.
2029 echo "Inject failure, to make the MDT-object lost its layout EA"
2030 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2031 do_facet mds1 $LCTL set_param fail_loc=0x1615
2032 chown 1.1 $DIR/$tdir/a1/f1
2034 if [ $MDSCOUNT -ge 2 ]; then
2035 do_facet mds2 $LCTL set_param fail_loc=0x1615
2036 chown 1.1 $DIR/$tdir/a2/f2
2039 chown 1.1 $DIR/$tdir/f3
# Disarm the fault injection and drop cached locks before re-reading sizes.
2044 do_facet mds1 $LCTL set_param fail_loc=0
2045 if [ $MDSCOUNT -ge 2 ]; then
2046 do_facet mds2 $LCTL set_param fail_loc=0
2049 cancel_lru_locks mdc
2050 cancel_lru_locks osc
# Sanity check of the injection itself: sizes must now differ from the
# saved ones.
2052 echo "The file size should be incorrect since layout EA is lost"
2053 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2054 [ "$cur_size" != "$saved_size1" ] ||
2055 error "(1) Expect incorrect file1 size"
# f2 is compared with saved_size1: f1 and f2 were written by identical dd
# commands (bs=1M count=2).
2057 if [ $MDSCOUNT -ge 2 ]; then
2058 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2059 [ "$cur_size" != "$saved_size1" ] ||
2060 error "(2) Expect incorrect file2 size"
2063 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2064 [ "$cur_size" != "$saved_size2" ] ||
2065 error "(1.2) Expect incorrect file3 size"
2067 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2068 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
# Wait (up to $LTIME) for every MDT to report status "completed".
# presumably the "\\\$2" escaping keeps "$2" literal until the remote awk
# runs -- TODO confirm against wait_update_facet.
2070 for k in $(seq $MDSCOUNT); do
2071 # The LFSCK status query internal is 30 seconds. For the case
2072 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2073 # time to guarantee the status sync up.
2074 wait_update_facet mds${k} "$LCTL get_param -n \
2075 mdd.$(facet_svc mds${k}).lfsck_layout |
2076 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2077 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must already report "completed" (single query, no waiting).
2080 for k in $(seq $OSTCOUNT); do
2081 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2082 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2083 awk '/^status/ { print $2 }')
2084 [ "$cur_status" == "completed" ] ||
2085 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Check the repaired_orphan counters: 3 on mds1, 2 on mds2.
2088 local repaired=$(do_facet mds1 $LCTL get_param -n \
2089 mdd.$(facet_svc mds1).lfsck_layout |
2090 awk '/^repaired_orphan/ { print $2 }')
2091 [ $repaired -eq 3 ] ||
2092 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2094 if [ $MDSCOUNT -ge 2 ]; then
2095 repaired=$(do_facet mds2 $LCTL get_param -n \
2096 mdd.$(facet_svc mds2).lfsck_layout |
2097 awk '/^repaired_orphan/ { print $2 }')
2098 [ $repaired -eq 2 ] ||
2099 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print FID and layout again after the repair, for the test log.
2102 $LFS path2fid $DIR/$tdir/a1/f1
2103 $LFS getstripe $DIR/$tdir/a1/f1
2105 if [ $MDSCOUNT -ge 2 ]; then
2106 $LFS path2fid $DIR/$tdir/a2/f2
2107 $LFS getstripe $DIR/$tdir/a2/f2
2110 $LFS path2fid $DIR/$tdir/f3
2111 $LFS getstripe $DIR/$tdir/f3
# Final verification: sizes must be back to the saved values.
2113 echo "The file size should be correct after layout LFSCK scanning"
2114 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2115 [ "$cur_size" == "$saved_size1" ] ||
2116 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2118 if [ $MDSCOUNT -ge 2 ]; then
2119 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2120 [ "$cur_size" == "$saved_size1" ] ||
2121 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is on f3, so the message names file3.
2124 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2125 [ "$cur_size" == "$saved_size2" ] ||
2126 error "(9) Expect file3 size $saved_size2, but got $cur_size"
# Register the test with the framework.
2128 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test 18b: the MDT-objects are removed under fail_loc 0x1616 so their
# OST-objects become orphans. A dry-run layout LFSCK must only count the
# orphans; a real run must re-create the files under
# .lustre/lost+found/MDTxxxx as "<fid>-R-0", from where they are moved back
# and their sizes verified.
# NOTE(review): the embedded line numbers skip values and no function
# header, "fi", "done" or closing brace is visible in this span; those
# lines are presumably missing from this listing -- confirm against the
# original script before running.
2131 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2134 echo "The target MDT-object is lost. The LFSCK should re-create the"
2135 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2136 echo "can move it back to normal namespace manually."
# Set-up: a1/f1 (single stripe); FID and size are saved for later checks.
2139 check_mount_and_prep
2140 $LFS mkdir -i 0 $DIR/$tdir/a1
2141 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2142 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# "ls -il" prints the inode number first, so the size is column 6.
2143 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2144 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2146 $LFS getstripe $DIR/$tdir/a1/f1
2148 if [ $MDSCOUNT -ge 2 ]; then
2149 $LFS mkdir -i 1 $DIR/$tdir/a2
2150 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2151 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
# Declared local so the FID does not leak into the sourcing shell.
2152 local fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2154 $LFS getstripe $DIR/$tdir/a2/f2
# f3 uses a two-component PFL layout.
2157 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2158 error "(0) Fail to create PFL $DIR/$tdir/f3"
2160 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2162 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2163 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2165 $LFS getstripe $DIR/$tdir/f3
2167 cancel_lru_locks osc
# Fault injection: arm fail_loc 0x1616 on the MDS, then unlink the files.
# NOTE(review): no removal of f3 is visible in this span although f3 is
# later expected under lost+found -- presumably on an omitted line.
2169 echo "Inject failure, to simulate the case of missing the MDT-object"
2170 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2171 do_facet mds1 $LCTL set_param fail_loc=0x1616
2172 rm -f $DIR/$tdir/a1/f1
2174 if [ $MDSCOUNT -ge 2 ]; then
2175 do_facet mds2 $LCTL set_param fail_loc=0x1616
2176 rm -f $DIR/$tdir/a2/f2
# Disarm the fault injection and drop cached locks.
2184 do_facet mds1 $LCTL set_param fail_loc=0
2185 if [ $MDSCOUNT -ge 2 ]; then
2186 do_facet mds2 $LCTL set_param fail_loc=0
2189 cancel_lru_locks mdc
2190 cancel_lru_locks osc
2192 # dryrun mode only checks for orphans; it does not repair them
2193 echo "Trigger layout LFSCK --dryrun to find out orphan OST-object"
2194 $START_LAYOUT --dryrun -o -r ||
2195 error "Fail to start layout LFSCK in dryrun mode"
2196 wait_all_targets_blocked layout completed 2
# The reported "param" line must list exactly the options passed above.
2198 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
2199 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
2200 error "Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
2202 local orphans=$(do_facet mds1 $LCTL get_param -n \
2203 mdd.$(facet_svc mds1).lfsck_layout |
2204 awk '/^inconsistent_orphan/ { print $2 }')
2205 [ $orphans -eq 3 ] ||
2206 error "Expect 3 found on mds1, but got: $orphans"
2208 # orphan parents should not be created
# "ls -A" prints nothing for an empty directory, so the test passes only
# when every lost+found sub-directory is empty.
2210 for subdir in $MOUNT/.lustre/lost+found/*; do
2211 [ ! "$(ls -A $subdir)" ] || error "$subdir not empty"
# Real (non-dry-run) pass.
2214 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2215 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait (up to $LTIME) for every MDT to report status "completed".
2217 for k in $(seq $MDSCOUNT); do
2218 # The LFSCK status query internal is 30 seconds. For the case
2219 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2220 # time to guarantee the status sync up.
2221 wait_update_facet mds${k} "$LCTL get_param -n \
2222 mdd.$(facet_svc mds${k}).lfsck_layout |
2223 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2224 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must already report "completed" (single query, no waiting).
2227 for k in $(seq $OSTCOUNT); do
2228 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2229 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2230 awk '/^status/ { print $2 }')
2231 [ "$cur_status" == "completed" ] ||
2232 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Check the repaired_orphan counters: 3 on mds1, 2 on mds2.
2235 local repaired=$(do_facet mds1 $LCTL get_param -n \
2236 mdd.$(facet_svc mds1).lfsck_layout |
2237 awk '/^repaired_orphan/ { print $2 }')
2238 [ $repaired -eq 3 ] ||
2239 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2241 if [ $MDSCOUNT -ge 2 ]; then
2242 repaired=$(do_facet mds2 $LCTL get_param -n \
2243 mdd.$(facet_svc mds2).lfsck_layout |
2244 awk '/^repaired_orphan/ { print $2 }')
2245 [ $repaired -eq 2 ] ||
2246 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# Move the re-created "<fid>-R-0" entries back to their original paths.
2249 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2250 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2251 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2253 if [ $MDSCOUNT -ge 2 ]; then
2254 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2255 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Distinct tag so this failure is not confused with the f1 move above.
2258 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2259 error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
# Print FID and layout after the repair, for the test log.
2261 $LFS path2fid $DIR/$tdir/a1/f1
2262 $LFS getstripe $DIR/$tdir/a1/f1
2264 if [ $MDSCOUNT -ge 2 ]; then
2265 $LFS path2fid $DIR/$tdir/a2/f2
2266 $LFS getstripe $DIR/$tdir/a2/f2
2269 $LFS path2fid $DIR/$tdir/f3
2270 $LFS getstripe $DIR/$tdir/f3
# Final verification: sizes must equal the values saved before removal.
2272 echo "The file size should be correct after layout LFSCK scanning"
2273 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2274 [ "$cur_size" == "$saved_size1" ] ||
2275 error "(7) Expect file1 size $saved_size1, but got $cur_size"
# f2 is compared with saved_size1: f1 and f2 were written by identical dd
# commands (bs=1M count=2).
2277 if [ $MDSCOUNT -ge 2 ]; then
2278 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2279 [ "$cur_size" == "$saved_size1" ] ||
2280 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is on f3, so the message names file3.
2283 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2284 [ "$cur_size" == "$saved_size2" ] ||
2285 error "(9) Expect file3 size $saved_size2, but got $cur_size"
# Register the test with the framework.
2287 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test 18c: files are written while fail_loc 0x1617 is armed on the OST
# nodes (missing parent FID), then their MDT-objects are removed under
# fail_loc 0x1616. Layout LFSCK must re-create MDT-objects under
# .lustre/lost+found/MDT0000 as "*-N-*" entries and leave MDT0001 without
# such entries.
# NOTE(review): the embedded line numbers skip values and no function
# header, "fi", "done" or closing brace is visible in this span; those
# lines are presumably missing from this listing -- confirm against the
# original script before running.
2290 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2293 echo "The target MDT-object is lost, and the OST-object FID is missing."
2294 echo "The LFSCK should re-create the MDT-object with new FID under the "
2295 echo "directory .lustre/lost+found/MDTxxxx."
2298 check_mount_and_prep
2299 $LFS mkdir -i 0 $DIR/$tdir/a1
2300 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
# First injection is armed on every OST node BEFORE the data is written.
2302 echo "Inject failure, to simulate the case of missing parent FID"
2303 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2304 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2306 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2307 $LFS getstripe $DIR/$tdir/a1/f1
# a2 is created with "mkdir -i 1" but its striping still uses "-i 0".
2309 if [ $MDSCOUNT -ge 2 ]; then
2310 $LFS mkdir -i 1 $DIR/$tdir/a2
2311 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2312 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2313 $LFS getstripe $DIR/$tdir/a2/f2
# f3 uses a two-component PFL layout.
2316 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2317 error "(0) Fail to create PFL $DIR/$tdir/f3"
2319 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2320 $LFS getstripe $DIR/$tdir/f3
2322 cancel_lru_locks osc
2323 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Second injection: arm fail_loc 0x1616 on the MDS, then unlink the files.
2325 echo "Inject failure, to simulate the case of missing the MDT-object"
2326 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2327 do_facet mds1 $LCTL set_param fail_loc=0x1616
2328 rm -f $DIR/$tdir/a1/f1
2330 if [ $MDSCOUNT -ge 2 ]; then
2331 do_facet mds2 $LCTL set_param fail_loc=0x1616
2332 rm -f $DIR/$tdir/a2/f2
# Disarm the fault injection and drop cached locks.
2340 do_facet mds1 $LCTL set_param fail_loc=0
2341 if [ $MDSCOUNT -ge 2 ]; then
2342 do_facet mds2 $LCTL set_param fail_loc=0
2345 cancel_lru_locks mdc
2346 cancel_lru_locks osc
2348 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2349 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait (up to $LTIME) for every MDT to report status "completed".
2351 for k in $(seq $MDSCOUNT); do
2352 # The LFSCK status query internal is 30 seconds. For the case
2353 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2354 # time to guarantee the status sync up.
2355 wait_update_facet mds${k} "$LCTL get_param -n \
2356 mdd.$(facet_svc mds${k}).lfsck_layout |
2357 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2358 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must already report "completed" (single query, no waiting).
2361 for k in $(seq $OSTCOUNT); do
2362 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2363 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2364 awk '/^status/ { print $2 }')
2365 [ "$cur_status" == "completed" ] ||
2366 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): the assignment of $expected is not visible in this span;
# it presumably sits in the omitted body of the "if" below -- confirm.
2369 if [ $MDSCOUNT -ge 2 ]; then
2375 local repaired=$(do_facet mds1 $LCTL get_param -n \
2376 mdd.$(facet_svc mds1).lfsck_layout |
2377 awk '/^repaired_orphan/ { print $2 }')
2378 [ $repaired -eq $expected ] ||
2379 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2381 if [ $MDSCOUNT -ge 2 ]; then
2382 repaired=$(do_facet mds2 $LCTL get_param -n \
2383 mdd.$(facet_svc mds2).lfsck_layout |
2384 awk '/^repaired_orphan/ { print $2 }')
2385 [ $repaired -eq 0 ] ||
2386 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2389 ls -ail $MOUNT/.lustre/lost+found/
# The -name pattern is quoted so the shell hands it to find unexpanded;
# unquoted, a matching file name in the current directory would replace it.
# cname is declared local so it does not leak into the sourcing shell.
# NOTE(review): the condition guarding error "(6)" is not visible in this
# span -- presumably on an omitted line.
2391 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2392 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2393 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
2395 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2398 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2399 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2400 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Re-declared local here because the declaration above only runs when the
# MDT0001 directory exists.
2402 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
2403 [ ! -z "$cname" ] ||
2404 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
# Register the test with the framework.
2406 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d: under fail_loc 0x1618, chown makes f2/f4 reference the
# OST-objects of f1/f3, which are then removed; f2/f4 become dangling while
# their original OST-objects survive as orphans. Layout LFSCK
# ("-r -o -c -d") must report 2 repaired orphans and 0 repaired dangling,
# and f2/f4 must have their saved sizes again.
# NOTE(review): the embedded line numbers skip values and no function
# header, "done" or closing brace is visible in this span; those lines are
# presumably missing from this listing -- confirm against the original
# script.
2410 echo "The target MDT-object layout EA is corrupted, but the right"
2411 echo "OST-object is still alive as orphan. The layout LFSCK will"
2412 echo "not create new OST-object to occupy such slot."
2415 check_mount_and_prep
# NOTE(review): no command creating $DIR/$tdir/a1 is visible before this
# setstripe -- presumably on an omitted line.
2417 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2418 echo "guard" > $DIR/$tdir/a1/f1
2419 echo "foo" > $DIR/$tdir/a1/f2
# f3 is a plain file; f4 uses a two-component PFL layout.
2421 echo "guard" > $DIR/$tdir/a1/f3
2422 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2423 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2424 echo "foo" > $DIR/$tdir/a1/f4
# "ls -il" prints the inode number first, so the size is column 6.
2426 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2427 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Print FID and layout of all four files for the test log.
2428 $LFS path2fid $DIR/$tdir/a1/f1
2429 $LFS getstripe $DIR/$tdir/a1/f1
2430 $LFS path2fid $DIR/$tdir/a1/f2
2431 $LFS getstripe $DIR/$tdir/a1/f2
2432 $LFS path2fid $DIR/$tdir/a1/f3
2433 $LFS getstripe $DIR/$tdir/a1/f3
2434 $LFS path2fid $DIR/$tdir/a1/f4
2435 $LFS getstripe $DIR/$tdir/a1/f4
2436 cancel_lru_locks osc
2438 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2439 echo "to reference the same OST-object (which is f1's OST-obejct)."
2440 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2441 echo "dangling reference case, but f2's old OST-object is there."
2443 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2444 echo "to reference the same OST-object (which is f3's OST-obejct)."
2445 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2446 echo "dangling reference case, but f4's old OST-object is there."
# Fault injection: chown f2/f4 with fail_loc 0x1618 armed, then remove the
# guard files f1/f3.
2449 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2450 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2451 chown 1.1 $DIR/$tdir/a1/f2
2452 chown 1.1 $DIR/$tdir/a1/f4
2453 rm -f $DIR/$tdir/a1/f1
2454 rm -f $DIR/$tdir/a1/f3
2457 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): the message announces a "stopall", but only setupall is
# visible in this span -- the stop is presumably on an omitted line.
2459 echo "stopall to cleanup object cache"
2462 setupall > /dev/null
2464 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2465 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
# Wait (up to $LTIME) for every MDT to report status "completed".
2467 for k in $(seq $MDSCOUNT); do
2468 # The LFSCK status query internal is 30 seconds. For the case
2469 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2470 # time to guarantee the status sync up.
2471 wait_update_facet mds${k} "$LCTL get_param -n \
2472 mdd.$(facet_svc mds${k}).lfsck_layout |
2473 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2474 error "(3) MDS${k} is not the expected 'completed'"
# Every OST must already report "completed" (single query, no waiting).
2477 for k in $(seq $OSTCOUNT); do
2478 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2479 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2480 awk '/^status/ { print $2 }')
2481 [ "$cur_status" == "completed" ] ||
2482 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Two orphans repaired, and no dangling reference "repaired".
2485 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2486 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2487 awk '/^repaired_orphan/ { print $2 }')
2488 [ $repaired -eq 2 ] ||
2489 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
2491 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2492 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2493 awk '/^repaired_dangling/ { print $2 }')
2494 [ $repaired -eq 0 ] ||
2495 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
# Final verification: f2/f4 sizes must equal the values saved at set-up.
2497 echo "The file size should be correct after layout LFSCK scanning"
2498 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2499 [ "$cur_size" == "$saved_size1" ] ||
2500 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2502 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2503 [ "$cur_size" == "$saved_size2" ] ||
2504 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# Dump contents, FID and layout for the test log (output is not checked).
2506 echo "The LFSCK should find back the original data."
2507 cat $DIR/$tdir/a1/f2
2508 $LFS path2fid $DIR/$tdir/a1/f2
2509 $LFS getstripe $DIR/$tdir/a1/f2
2510 cat $DIR/$tdir/a1/f4
2511 $LFS path2fid $DIR/$tdir/a1/f4
2512 $LFS getstripe $DIR/$tdir/a1/f4
# Register the test with the framework.
2514 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e: same corruption as 18d (fail_loc 0x1618), but LFSCK is delayed
# with fail_loc 0x1602 / fail_val=10 and new data is appended to f2/f4
# once mds1 reports "scanning-phase2". LFSCK must then keep the new data in
# f2/f4 and expose the old orphan OST-objects as two "*-C-*" stub files
# under .lustre/lost+found/MDT0000.
# NOTE(review): the embedded line numbers skip values and no function
# header, "fi", "done" or closing brace is visible in this span; those
# lines are presumably missing from this listing -- confirm against the
# original script before running.
2517 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2520 echo "The target MDT-object layout EA slot is occpuied by some new"
2521 echo "created OST-object when repair dangling reference case. Such"
2522 echo "conflict OST-object has been modified by others. To keep the"
2523 echo "new data, the LFSCK will create a new file to refernece this"
2524 echo "old orphan OST-object."
2527 check_mount_and_prep
# NOTE(review): no command creating $DIR/$tdir/a1 is visible before this
# setstripe -- presumably on an omitted line.
2529 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2530 echo "guard" > $DIR/$tdir/a1/f1
2531 echo "foo" > $DIR/$tdir/a1/f2
# f3 is a plain file; f4 uses a two-component PFL layout.
2533 echo "guard" > $DIR/$tdir/a1/f3
2534 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2535 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2536 echo "foo" > $DIR/$tdir/a1/f4
# "ls -il" prints the inode number first, so the size is column 6.
2538 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2539 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Print FID and layout of all four files for the test log.
2541 $LFS path2fid $DIR/$tdir/a1/f1
2542 $LFS getstripe $DIR/$tdir/a1/f1
2543 $LFS path2fid $DIR/$tdir/a1/f2
2544 $LFS getstripe $DIR/$tdir/a1/f2
2545 $LFS path2fid $DIR/$tdir/a1/f3
2546 $LFS getstripe $DIR/$tdir/a1/f3
2547 $LFS path2fid $DIR/$tdir/a1/f4
2548 $LFS getstripe $DIR/$tdir/a1/f4
2549 cancel_lru_locks osc
2551 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2552 echo "to reference the same OST-object (which is f1's OST-obejct)."
2553 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2554 echo "dangling reference case, but f2's old OST-object is there."
2556 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2557 echo "to reference the same OST-object (which is f3's OST-obejct)."
2558 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2559 echo "dangling reference case, but f4's old OST-object is there."
# Fault injection: chown f2/f4 with fail_loc 0x1618 armed, then remove the
# guard files f1/f3.
2562 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2563 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2564 chown 1.1 $DIR/$tdir/a1/f2
2565 chown 1.1 $DIR/$tdir/a1/f4
2566 rm -f $DIR/$tdir/a1/f1
2567 rm -f $DIR/$tdir/a1/f3
2570 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): the message announces a "stopall", but only setupall is
# visible in this span -- the stop is presumably on an omitted line.
2572 echo "stopall to cleanup object cache"
2575 setupall > /dev/null
# Arm the LFSCK delay (fail_loc 0x1602, fail_val=10) before starting the
# scan, so the test can write to f2/f4 while LFSCK is still running.
2577 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2578 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2580 start_full_debug_logging
2582 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2583 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait until mds1 reports "scanning-phase2" before modifying the files.
2585 wait_update_facet mds1 "$LCTL get_param -n \
2586 mdd.$(facet_svc mds1).lfsck_layout |
2587 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2588 error "(3) MDS1 is not the expected 'scanning-phase2'"
# NOTE(review): the commands this comment refers to are not visible in
# this span -- presumably on omitted lines.
2590 # to guarantee all updates are synced.
2594 echo "Write new data to f2/f4 to modify the new created OST-object."
2595 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2596 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
# Release the delay so LFSCK can finish.
2598 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Wait (up to $LTIME) for every MDT to report status "completed".
2600 for k in $(seq $MDSCOUNT); do
2601 # The LFSCK status query internal is 30 seconds. For the case
2602 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2603 # time to guarantee the status sync up.
2604 wait_update_facet mds${k} "$LCTL get_param -n \
2605 mdd.$(facet_svc mds${k}).lfsck_layout |
2606 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2607 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must already report "completed" (single query, no waiting).
2610 for k in $(seq $OSTCOUNT); do
2611 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2612 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2613 awk '/^status/ { print $2 }')
2614 [ "$cur_status" == "completed" ] ||
2615 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2618 stop_full_debug_logging
2620 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2621 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2622 awk '/^repaired_orphan/ { print $2 }')
2623 [ $repaired -eq 2 ] ||
2624 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2626 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2627 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2628 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Exactly two "*-C-*" stubs are expected; on mismatch list them first so
# the log shows what was found.
2630 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2631 if [ $count -ne 2 ]; then
2632 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2633 error "(8) Expect 2 stubs under lost+found, but got $count"
# The -name pattern is quoted so the shell hands it to find unexpanded;
# unquoted, a matching file name in the current directory would replace it.
# cname is declared local so it does not leak into the sourcing shell.
2636 echo "The stub file should keep the original f2 or f4 data"
2637 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | head -n 1)
2638 local cur_size=$(ls -il $cname | awk '{ print $6 }')
# The stub may hold either file's old data, so its size has to match one of
# the two saved sizes.
2639 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2640 error "(9) Got unexpected $cur_size"
2643 $LFS path2fid $cname
2644 $LFS getstripe $cname
# Same check for the other stub (last entry returned by find).
2646 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | tail -n 1)
2647 cur_size=$(ls -il $cname | awk '{ print $6 }')
2648 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2649 error "(10) Got unexpected $cur_size"
2652 $LFS path2fid $cname
2653 $LFS getstripe $cname
# Dump contents, FID and layout for the test log (output is not checked).
2655 echo "The f2/f4 should contains new data."
2656 cat $DIR/$tdir/a1/f2
2657 $LFS path2fid $DIR/$tdir/a1/f2
2658 $LFS getstripe $DIR/$tdir/a1/f2
2659 cat $DIR/$tdir/a1/f4
2660 $LFS path2fid $DIR/$tdir/a1/f4
2661 $LFS getstripe $DIR/$tdir/a1/f4
# Register the test with the framework.
2663 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f: MDT-objects are removed under fail_loc 0x1616, then the first
# layout LFSCK runs with fail_loc 0x161c / fail_val=0 armed on mds1. That
# run must end "partial" on the MDTs and ost1 and "completed" on ost2, with
# 1 repaired orphan per MDT. A second, fault-free run must end "completed"
# everywhere with 2 repaired orphans per MDT.
# NOTE(review): the embedded line numbers skip values and no function
# header, "fi", "done" or closing brace is visible in this span; those
# lines are presumably missing from this listing -- confirm against the
# original script before running.
2666 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2669 echo "The target MDT-object is lost. The LFSCK should re-create the"
2670 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2671 echo "to verify some OST-object(s) during the first stage-scanning,"
2672 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2673 echo "should not be affected."
# Set-up: a1 (one stripe) holds guard and f1; a2 (two stripes) holds f2.
2676 check_mount_and_prep
2677 $LFS mkdir -i 0 $DIR/$tdir/a1
2678 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2679 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2680 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2681 $LFS mkdir -i 0 $DIR/$tdir/a2
2682 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2683 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2684 $LFS getstripe $DIR/$tdir/a1/f1
2685 $LFS getstripe $DIR/$tdir/a2/f2
# With at least two MDTs, mirror the same layout under "mkdir -i 1".
2687 if [ $MDSCOUNT -ge 2 ]; then
2688 $LFS mkdir -i 1 $DIR/$tdir/a3
2689 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2690 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2691 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2692 $LFS mkdir -i 1 $DIR/$tdir/a4
2693 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2694 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2695 $LFS getstripe $DIR/$tdir/a3/f3
2696 $LFS getstripe $DIR/$tdir/a4/f4
2699 cancel_lru_locks osc
# Fault injection: arm fail_loc 0x1616 on the MDS, then unlink the files
# (the guard files are kept).
2701 echo "Inject failure, to simulate the case of missing the MDT-object"
2702 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2703 do_facet mds1 $LCTL set_param fail_loc=0x1616
2704 rm -f $DIR/$tdir/a1/f1
2705 rm -f $DIR/$tdir/a2/f2
2707 if [ $MDSCOUNT -ge 2 ]; then
2708 do_facet mds2 $LCTL set_param fail_loc=0x1616
2709 rm -f $DIR/$tdir/a3/f3
2710 rm -f $DIR/$tdir/a4/f4
# Disarm the fault injection and drop cached locks.
2716 do_facet mds1 $LCTL set_param fail_loc=0
2717 if [ $MDSCOUNT -ge 2 ]; then
2718 do_facet mds2 $LCTL set_param fail_loc=0
2721 cancel_lru_locks mdc
2722 cancel_lru_locks osc
# Second injection, armed on mds1 for the duration of the first LFSCK run.
2724 echo "Inject failure, to simulate the OST0 fail to handle"
2725 echo "MDT0 LFSCK request during the first-stage scanning."
2726 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2727 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2729 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2730 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# First run: every MDT is expected to end in status "partial".
2732 for k in $(seq $MDSCOUNT); do
2733 # The LFSCK status query internal is 30 seconds. For the case
2734 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2735 # time to guarantee the status sync up.
2736 wait_update_facet mds${k} "$LCTL get_param -n \
2737 mdd.$(facet_svc mds${k}).lfsck_layout |
2738 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2739 error "(2) MDS${k} is not the expected 'partial'"
# ost1 must end "partial", ost2 must end "completed".
2742 wait_update_facet ost1 "$LCTL get_param -n \
2743 obdfilter.$(facet_svc ost1).lfsck_layout |
2744 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2745 error "(3) OST1 is not the expected 'partial'"
2748 wait_update_facet ost2 "$LCTL get_param -n \
2749 obdfilter.$(facet_svc ost2).lfsck_layout |
2750 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2751 error "(4) OST2 is not the expected 'completed'"
2754 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the partial run only 1 orphan per MDT is expected to be repaired.
2756 local repaired=$(do_facet mds1 $LCTL get_param -n \
2757 mdd.$(facet_svc mds1).lfsck_layout |
2758 awk '/^repaired_orphan/ { print $2 }')
2759 [ $repaired -eq 1 ] ||
2760 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2762 if [ $MDSCOUNT -ge 2 ]; then
2763 repaired=$(do_facet mds2 $LCTL get_param -n \
2764 mdd.$(facet_svc mds2).lfsck_layout |
2765 awk '/^repaired_orphan/ { print $2 }')
2766 [ $repaired -eq 1 ] ||
2767 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second run, without fault injection.
2770 echo "Trigger layout LFSCK on all devices again to cleanup"
2771 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2773 for k in $(seq $MDSCOUNT); do
2774 # The LFSCK status query internal is 30 seconds. For the case
2775 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2776 # time to guarantee the status sync up.
2777 wait_update_facet mds${k} "$LCTL get_param -n \
2778 mdd.$(facet_svc mds${k}).lfsck_layout |
2779 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2780 error "(8) MDS${k} is not the expected 'completed'"
# cur_status is declared local so it does not leak into the sourcing shell.
2783 for k in $(seq $OSTCOUNT); do
2784 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2785 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2786 awk '/^status/ { print $2 }')
2787 [ "$cur_status" == "completed" ] ||
2788 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# After the clean run 2 repaired orphans per MDT are expected.
2792 local repaired=$(do_facet mds1 $LCTL get_param -n \
2793 mdd.$(facet_svc mds1).lfsck_layout |
2794 awk '/^repaired_orphan/ { print $2 }')
2795 [ $repaired -eq 2 ] ||
2796 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2798 if [ $MDSCOUNT -ge 2 ]; then
2799 repaired=$(do_facet mds2 $LCTL get_param -n \
2800 mdd.$(facet_svc mds2).lfsck_layout |
2801 awk '/^repaired_orphan/ { print $2 }')
2802 [ $repaired -eq 2 ] ||
2803 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
# Register the test with the framework.
2806 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Test 18g: a file striped over all OSTs is removed under fail_loc 0x162e
# (MDT-object lost, OI mapping kept). Layout LFSCK must report $OSTCOUNT
# repaired orphans and re-create the file as "<fid>-R-0" under
# .lustre/lost+found/MDT0000, from where it is moved back.
# NOTE(review): the embedded line numbers skip values and no function
# header, "done" or closing brace is visible in this span; those lines are
# presumably missing from this listing -- confirm against the original
# script.
2809 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2812 echo "The target MDT-object is lost, but related OI mapping is there"
2813 echo "The LFSCK should recreate the lost MDT-object without affected"
2814 echo "by the stale OI mapping."
# Set-up: stripe count -1 with 1M stripes; dd writes $OSTCOUNT blocks of 1M.
2817 check_mount_and_prep
2818 $LFS mkdir -i 0 $DIR/$tdir/a1
2819 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2820 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
# The FID is saved because it forms the lost+found entry name checked below.
2821 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2823 $LFS getstripe $DIR/$tdir/a1/f1
2824 cancel_lru_locks osc
# Fault injection: arm fail_loc 0x162e on mds1, unlink f1, disarm.
2826 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2827 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2828 do_facet mds1 $LCTL set_param fail_loc=0x162e
2829 rm -f $DIR/$tdir/a1/f1
2831 do_facet mds1 $LCTL set_param fail_loc=0
2832 cancel_lru_locks mdc
2833 cancel_lru_locks osc
2835 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2836 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait (up to $LTIME) for every MDT to report status "completed".
2838 for k in $(seq $MDSCOUNT); do
2839 # The LFSCK status query internal is 30 seconds. For the case
2840 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2841 # time to guarantee the status sync up.
2842 wait_update_facet mds${k} "$LCTL get_param -n \
2843 mdd.$(facet_svc mds${k}).lfsck_layout |
2844 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2845 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must already report "completed" (single query, no waiting).
2848 for k in $(seq $OSTCOUNT); do
2849 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2850 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2851 awk '/^status/ { print $2 }')
2852 [ "$cur_status" == "completed" ] ||
2853 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# The repaired_orphan counter on mds1 must equal $OSTCOUNT.
2856 local repaired=$(do_facet mds1 $LCTL get_param -n \
2857 mdd.$(facet_svc mds1).lfsck_layout |
2858 awk '/^repaired_orphan/ { print $2 }')
2859 [ $repaired -eq $OSTCOUNT ] ||
2860 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
# Move the re-created "<fid>-R-0" entry back to its original path.
2862 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2863 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2864 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
# Print FID and layout for the test log (output is not checked).
2866 $LFS path2fid $DIR/$tdir/a1/f1
2867 $LFS getstripe $DIR/$tdir/a1/f1
# Register the test with the framework.
2869 run_test 18g "Find out orphan OST-object and repair it (7)"
# Test 18h: a PFL file's extent range is corrupted (fail_loc 0x162f armed
# during chown). Writing to the file must then fail. After layout LFSCK
# reports 2 repaired orphans, the data must still match a reference copy
# and writes must succeed again.
# NOTE(review): the embedded line numbers skip values and no function
# header, "done" or closing brace is visible in this span; those lines are
# presumably missing from this listing -- confirm against the original
# script before running.
2873 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
2874 echo "the layout LFSCK will keep the bad PFL file(s) there without"
2875 echo "scanning its OST-object(s). Then in the second stage scanning,"
2876 echo "the OST will return related OST-object(s) to the MDT as orphan."
2877 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
2878 echo "the 'orphan(s)' stripe information."
2881 check_mount_and_prep
# f0: two PFL components, [0,2M) and [2M,EOF).
2883 $LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
2884 error "(0) Fail to create PFL $DIR/$tdir/f0"
# Write data from offset 0, then again at "seek=2" blocks of 1M, i.e.
# starting at the boundary of the second component.
2886 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
2887 error "(1.1) Fail to write $DIR/$tdir/f0"
2889 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
2890 error "(1.2) Fail to write $DIR/$tdir/f0"
# Reference copy used by the diff after the repair.
2892 cp $DIR/$tdir/f0 $DIR/$tdir/guard
2894 echo "Inject failure stub to simulate bad PFL extent range"
2895 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
2896 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
2898 chown 1.1 $DIR/$tdir/f0
2900 cancel_lru_locks mdc
2901 cancel_lru_locks osc
2902 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Negative check: if this dd succeeds, the test fails.
2904 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
2905 error "(2) Write to bad PFL file should fail"
2907 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
2908 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
# Wait (up to $LTIME) for every MDT to report status "completed".
2910 for k in $(seq $MDSCOUNT); do
2911 # The LFSCK status query internal is 30 seconds. For the case
2912 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2913 # time to guarantee the status sync up.
2914 wait_update_facet mds${k} "$LCTL get_param -n \
2915 mdd.$(facet_svc mds${k}).lfsck_layout |
2916 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2917 error "(4.1) MDS${k} is not the expected 'completed'"
# cur_status is declared local so it does not leak into the sourcing shell.
2920 for k in $(seq $OSTCOUNT); do
2921 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2922 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2923 awk '/^status/ { print $2 }')
2924 [ "$cur_status" == "completed" ] ||
2925 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
# Two repaired orphans are expected.
2929 local repaired=$($SHOW_LAYOUT |
2930 awk '/^repaired_orphan/ { print $2 }')
2931 [ $repaired -eq 2 ] ||
2932 error "(5) Fail to repair crashed PFL range: $repaired"
2934 echo "Data in $DIR/$tdir/f0 should not be broken"
2935 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
2936 error "(6) Data in $DIR/$tdir/f0 is broken"
2938 echo "Write should succeed after LFSCK repairing the bad PFL range"
2939 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
2940 error "(7) Write should succeed after LFSCK"
# Register the test with the framework.
2942 run_test 18h "LFSCK can repair crashed PFL extent range"
# Turn the "cache" debug flag off again on the local node (command output
# is discarded); it was switched on with "debug=+cache" earlier in this
# file, after the run_test line for test 17.
2944 $LCTL set_param debug=-cache > /dev/null
# Test 19a: files are written while lfsck_verify_pfid is 0 on OST0000.
# Verification is then set to 1 and fail_loc 0x1619 is armed on the local
# node; reads of a0 and a1 are expected to fail.
# NOTE(review): the embedded line numbers skip values and no function
# header or closing brace is visible in this span; those lines are
# presumably missing from this listing -- confirm against the original
# script.
2947 check_mount_and_prep
2948 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# Parent-FID verification off while the test files are created.
2950 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2951 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
# a0 and a2 are plain files; a1 uses a two-component PFL layout.
2953 echo "foo1" > $DIR/$tdir/a0
2954 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
2955 error "(0) Fail to create PFL $DIR/$tdir/a1"
2956 echo "foo2" > $DIR/$tdir/a1
2957 echo "guard" > $DIR/$tdir/a2
2958 cancel_lru_locks osc
# Parent-FID verification on, then arm the fault on the local node
# ($LCTL is invoked directly here, not through do_facet/do_nodes).
2960 echo "Inject failure, then client will offer wrong parent FID when read"
2961 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2962 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2964 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2965 $LCTL set_param fail_loc=0x1619
# Negative checks: a successful cat is the failure case.
2967 echo "Read RPC with wrong parent FID should be denied"
2968 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
2969 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
2970 $LCTL set_param fail_loc=0
# Register the test with the framework.
2972 run_test 19a "OST-object inconsistency self detect"
# test_19b: create a plain file and a PFL file while fail_loc 0x1611
# (OBD_FAIL_LFSCK_UNMATCHED_PAIR1) is set on the OST nodes, then check the
# "repaired" counter reported by obdfilter.*.lfsck_verify_pfid on ost1:
#   - it must be 0 while lfsck_verify_pfid is 0;
#   - it must be 2 (one per file) after lfsck_verify_pfid is set to 1 and
#     both files have been read successfully.
test_19b() {
	local pfid_param="obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid"

	check_mount_and_prep
	$LFS setstripe -c 1 -i 0 $DIR/$tdir

	echo "Inject failure stub to make the OST-object to back point to"
	echo "non-exist MDT-object"

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n $pfid_param 0

	#define OBD_FAIL_LFSCK_UNMATCHED_PAIR1	0x1611
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
	echo "foo1" > $DIR/$tdir/f0
	$LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
		error "(0) Fail to create PFL $DIR/$tdir/f1"
	echo "foo2" > $DIR/$tdir/f1
	cancel_lru_locks osc
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0

	do_facet ost1 $LCTL set_param -n $pfid_param 0
	echo "Nothing should be fixed since self detect and repair is disabled"
	local repaired=$(do_facet ost1 $LCTL get_param -n $pfid_param |
			 awk '/^repaired/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(1) Expected 0 repaired, but got $repaired"

	echo "Read RPC with right parent FID should be accepted,"
	echo "and cause parent FID on OST to be fixed"

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n $pfid_param 1

	cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
	cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"

	repaired=$(do_facet ost1 $LCTL get_param -n $pfid_param |
		   awk '/^repaired/ { print $2 }')
	# Two files were read, so two repairs are expected; the message must
	# name the same number the check uses.
	[ $repaired -eq 2 ] ||
		error "(3) Expected 2 repaired, but got $repaired"
}
run_test 19b "OST-object inconsistency self repair"
# Values compared against the output of "$LFS getstripe -L" for a layout
# with and without a LOV EA hole.
PATTERN_WITH_HOLE="40000001"
PATTERN_WITHOUT_HOLE="raid0"

# test_20a: write f0..f2 (and f3 when more than two OSTs exist) into a
# striped directory, unlink them while fail_loc 0x1616 / 0x161a (with
# fail_val 0, 1, 2) is set on mds1, run layout LFSCK, and then inspect the
# files found under .lustre/lost+found/MDT0000/<fid>-R-0: stripe pattern,
# stripe count, size, read/write behaviour, chown, touch and unlink.
# Needs at least two OSTs and an empty $FILESET.
test_20a() {
	[ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
	# Every other guard in this file pairs skip with an explicit return;
	# without it the test body would still run after the skip message.
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	echo "The target MDT-object and some of its OST-object are lost."
	echo "The LFSCK should find out the left OST-objects and re-create"
	echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
	echo "with the partial OST-objects (LOV EA hole)."

	echo "New client can access the file with LOV EA hole via normal"
	echo "system tools or commands without crash the system."

	echo "For old client, even though it cannot access the file with"
	echo "LOV EA hole, it should not cause the system crash."

	# Number of 4096-byte blocks written to every file. With 1M stripes
	# (256 blocks each) the last stripe receives exactly one block.
	# NOTE(review): 513 / 257 are derived from the layout comment below
	# and the 1M stripe size; confirm against the reference script.
	local bcount

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/a1
	if [ $OSTCOUNT -gt 2 ]; then
		$LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
		bcount=513
	else
		$LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
		bcount=257
	fi

	# 256 blocks on the stripe0.
	# 1 block on the stripe1 for 2 OSTs case.
	# 256 blocks on the stripe1 for other cases.
	# 1 block on the stripe2 if OSTs > 2
	dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount

	local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
	local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
	local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
	local fid3

	$LFS getstripe $DIR/$tdir/a1/f0
	$LFS getstripe $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a1/f2

	if [ $OSTCOUNT -gt 2 ]; then
		dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
		fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
		$LFS getstripe $DIR/$tdir/a1/f3
	fi

	cancel_lru_locks osc

	echo "Inject failure..."
	echo "To simulate f0 lost MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/a1/f0

	echo "To simulate f1 lost MDT-object and OST-object0"
	#define OBD_FAIL_LFSCK_LOST_SPEOBJ	0x161a
	do_facet mds1 $LCTL set_param fail_loc=0x161a
	rm -f $DIR/$tdir/a1/f1

	echo "To simulate f2 lost MDT-object and OST-object1"
	do_facet mds1 $LCTL set_param fail_val=1
	rm -f $DIR/$tdir/a1/f2

	if [ $OSTCOUNT -gt 2 ]; then
		echo "To simulate f3 lost MDT-object and OST-object2"
		do_facet mds1 $LCTL set_param fail_val=2
		rm -f $DIR/$tdir/a1/f3
	fi

	umount_client $MOUNT
	do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"

	local k
	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(2) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(3) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	if [ $OSTCOUNT -gt 2 ]; then
		[ $repaired -eq 9 ] ||
		error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
	else
		[ $repaired -eq 4 ] ||
		error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
	fi

	mount_client $MOUNT || error "(5.0) Fail to start client!"

	# Not read anywhere in this function; left in place because it is a
	# global assignment that other code may depend on.
	LOV_PATTERN_F_HOLE=0x40000000

	#
	# ${fid0}-R-0 is the old f0
	#
	local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
	echo "Check $name, which is the old f0"

	$LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"

	local pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(5.2) NOT expect pattern flag hole, but got $pattern"

	local stripes=$($LFS getstripe -c $name)
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(5.3.1) expect the stripe count is 3, but got $stripes"
	else
		[ $stripes -eq 2 ] ||
		error "(5.3.2) expect the stripe count is 2, but got $stripes"
	fi

	local size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(5.4) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null || error "(5.5) cannot read $name"

	echo "dummy" >> $name || error "(5.6) cannot write $name"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"

	touch $name || error "(5.8) cannot touch $name"

	rm -f $name || error "(5.9) cannot unlink $name"

	#
	# ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
	if [ $OSTCOUNT -gt 2 ]; then
		echo "Check $name, it contains the old f1's stripe1 and stripe2"
	else
		echo "Check $name, it contains the old f1's stripe1"
	fi

	$LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(6.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(6.3.1) expect the stripe count is 3, but got $stripes"
	else
		[ $stripes -eq 2 ] ||
		error "(6.3.2) expect the stripe count is 2, but got $stripes"
	fi

	size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(6.4) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null && error "(6.5) normal read $name should fail"

	# Count the blocks dd could not read; conv=sync,noerror lets dd carry
	# on and pad each failed block in the dump.
	local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			 bs=4096 2>&1 | grep -c "Input/output error")

	# The lost stripe0 covers the first 256 blocks.
	[ $failures -eq 256 ] ||
		error "(6.6) expect 256 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(6.7) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
		error "(6.8) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
		error "(6.9) write to normal stripe should NOT fail"

	echo "foo" >> $name && error "(6.10) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"

	touch $name || error "(6.12) cannot touch $name"

	rm -f $name || error "(6.13) cannot unlink $name"

	#
	# ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
	if [ $OSTCOUNT -gt 2 ]; then
		echo "Check $name, it contains the old f2's stripe0 and stripe2"
	else
		echo "Check $name, it contains the old f2's stripe0"
	fi

	$LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(7.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	size=$(stat $name | awk '/Size:/ { print $2 }')
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(7.3.1) expect the stripe count is 3, but got $stripes"

		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.4.1) expect size $((4096 * $bcount)), but got $size"

		cat $name > /dev/null &&
			error "(7.5.1) normal read $name should fail"

		failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			   bs=4096 2>&1 | grep -c "Input/output error")

		# The lost stripe1 covers blocks 256..511.
		[ $failures -eq 256 ] ||
			error "(7.6) expect 256 IO failures, but get $failures"

		size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.7) expect the size $((4096 * $bcount)), but got $size"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=300 && error "(7.8.0) write to the LOV EA hole should fail"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
		error "(7.8.1) write to normal stripe should NOT fail"

		echo "foo" >> $name &&
			error "(7.8.3) append write $name should fail"

		chown $RUNAS_ID:$RUNAS_GID $name ||
			error "(7.9.1) cannot chown on $name"

		touch $name || error "(7.10.1) cannot touch $name"
	else
		[ $stripes -eq 2 ] ||
		error "(7.3.2) expect the stripe count is 2, but got $stripes"

		# Only stripe0 (256 blocks) is left, so stat sees 256 blocks.
		[ $size -eq $((4096 * (256 + 0))) ] ||
		error "(7.4.2) expect the size $((4096 * 256)), but got $size"

		cat $name > /dev/null &&
			error "(7.5.2) normal read $name should fail"

		failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			   bs=4096 2>&1 | grep -c "Input/output error")
		[ $failures -eq 256 ] ||
		error "(7.6.2) expect 256 IO failures, but get $failures"

		# NOTE(review): the dump should hold the 256 readable blocks
		# plus the 256 padded failed ones, i.e. 512 blocks rather
		# than the original 257; confirm against a real run.
		bcount=$((256 * 2))
		size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.7.2) expect the size $((4096 * $bcount)), got $size"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=256 && error "(7.8.2) write to the LOV EA hole should fail"

		chown $RUNAS_ID:$RUNAS_GID $name ||
			error "(7.9.2) cannot chown on $name"

		touch $name || error "(7.10.2) cannot touch $name"
	fi

	rm -f $name || error "(7.11) cannot unlink $name"

	# f3 only exists when there are more than two OSTs.
	[ $OSTCOUNT -le 2 ] && return

	#
	# ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
	echo "Check $name, which contains the old f3's stripe0 and stripe1"

	$LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	[ $stripes -eq 3 ] ||
		error "(8.3) expect the stripe count is 3, but got $stripes"

	size=$(stat $name | awk '/Size:/ { print $2 }')
	# stripe2 is lost, so only stripe0 + stripe1 contribute to the size.
	[ $size -eq $((4096 * (256 + 256 + 0))) ] ||
		error "(8.4) expect the size $((4096 * 512)), but got $size"

	cat $name > /dev/null &&
		error "(8.5) normal read $name should fail"

	failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
		   bs=4096 2>&1 | grep -c "Input/output error")
	[ $failures -eq 256 ] ||
		error "(8.6) expect 256 IO failures, but get $failures"

	# NOTE(review): the dump should hold the 512 readable blocks plus
	# the 256 padded failed ones, i.e. 768 blocks rather than the
	# original 513; confirm against a real run.
	bcount=$((256 * 3))
	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(8.7) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=512 && error "(8.8) write to the LOV EA hole should fail"

	chown $RUNAS_ID:$RUNAS_GID $name ||
		error "(8.9) cannot chown on $name"

	touch $name || error "(8.10) cannot touch $name"

	rm -f $name || error "(8.11) cannot unlink $name"
}
run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# test_20b: PFL variant of test_20a. Creates three two-component PFL files
# (component 1: [0, 2M), component 2: [2M, EOF), two 1M stripes each),
# unlinks them while fail_loc 0x1616 / 0x161a (fail_val 0 and 1) is set on
# $SINGLEMDS, runs layout LFSCK, and then inspects the files found under
# .lustre/lost+found/MDT0000/<fid>-R-0: per-component pattern, stripe count
# and extent, size, read/write behaviour, chown, touch and unlink.
# Needs at least two OSTs and an empty $FILESET.
test_20b() {
	[ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
	# Every other guard in this file pairs skip with an explicit return;
	# without it the test body would still run after the skip message.
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	echo "The target MDT-object and some of its OST-object are lost."
	echo "The LFSCK should find out the left OST-objects and re-create"
	echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
	echo "with the partial OST-objects (LOV EA hole)."

	echo "New client can access the file with LOV EA hole via normal"
	echo "system tools or commands without crash the system - PFL case."

	check_mount_and_prep

	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
		error "(0) Fail to create PFL file $DIR/$tdir/f0"
	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
		error "(1) Fail to create PFL file $DIR/$tdir/f1"
	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
		error "(2) Fail to create PFL file $DIR/$tdir/f2"

	# 256 blocks of 4096 bytes fill one 1M stripe: three full stripes
	# plus one block in the last stripe of the second component.
	local bcount=$((256 * 3 + 1))

	dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount

	local fid0=$($LFS path2fid $DIR/$tdir/f0)
	local fid1=$($LFS path2fid $DIR/$tdir/f1)
	local fid2=$($LFS path2fid $DIR/$tdir/f2)

	$LFS getstripe $DIR/$tdir/f0
	$LFS getstripe $DIR/$tdir/f1
	$LFS getstripe $DIR/$tdir/f2

	cancel_lru_locks mdc
	cancel_lru_locks osc

	# NOTE(review): the three "rm -f" calls below are restored by analogy
	# with test_20a, where each fail_loc/fail_val setting is followed by
	# the unlink that triggers the injected loss; confirm.
	echo "Inject failure..."
	echo "To simulate f0 lost MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/f0

	echo "To simulate the case of f1 lost MDT-object and "
	echo "the first OST-object in each PFL component"
	#define OBD_FAIL_LFSCK_LOST_SPEOBJ	0x161a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
	rm -f $DIR/$tdir/f1

	echo "To simulate the case of f2 lost MDT-object and "
	echo "the second OST-object in each PFL component"
	do_facet $SINGLEMDS $LCTL set_param fail_val=1
	rm -f $DIR/$tdir/f2

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"

	local k
	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(4) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(5) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	[ $repaired -eq 8 ] ||
		error "(6) Expect 8 fixed on mds1, but got: $repaired"

	#
	# ${fid0}-R-0 is the old f0
	#
	local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
	echo "Check $name, which is the old f0"

	$LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"

	local pattern=$($LFS getstripe -L -I1 $name)
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(7.2.1) NOT expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 $name)
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(7.2.2) NOT expect pattern flag hole, but got $pattern"

	local stripes=$($LFS getstripe -c -I1 $name)
	[ $stripes -eq 2 ] ||
		error "(7.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 $name)
	[ $stripes -eq 2 ] ||
		error "(7.3.2) expect 2 stripes, but got $stripes"

	local e_start=$($LFS getstripe -I1 $name |
			awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 0 ] ||
		error "(7.4.1) expect the COMP1 start at 0, got $e_start"

	local e_end=$($LFS getstripe -I1 $name |
		      awk '/lcme_extent.e_end:/ { print $2 }')
	[ $e_end -eq 2097152 ] ||
		error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 2097152 ] ||
		error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"

	local size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.6) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null || error "(7.7) cannot read $name"

	echo "dummy" >> $name || error "(7.8) cannot write $name"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"

	touch $name || error "(7.10) cannot touch $name"

	rm -f $name || error "(7.11) cannot unlink $name"

	#
	# ${fid1}-R-0 contains the old f1's second stripe in each COMP
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
	echo "Check $name, it contains f1's second OST-object in each COMP"

	$LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L -I1 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2.1) expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c -I1 $name)
	[ $stripes -eq 2 ] ||
		error "(8.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 $name)
	[ $stripes -eq 2 ] ||
		error "(8.3.2) expect 2 stripes, but got $stripes"

	e_start=$($LFS getstripe -I1 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 0 ] ||
		error "(8.4.1) expect the COMP1 start at 0, got $e_start"

	e_end=$($LFS getstripe -I1 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ $e_end -eq 2097152 ] ||
		error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 2097152 ] ||
		error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"

	size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(8.6) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null && error "(8.7) normal read $name should fail"

	# Count the blocks dd could not read; conv=sync,noerror lets dd carry
	# on and pad each failed block in the dump.
	local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			 bs=4096 2>&1 | grep -c "Input/output error")

	# The first stripe in each COMP was lost
	[ $failures -eq 512 ] ||
		error "(8.8) expect 512 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(8.9) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
		error "(8.10) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
		error "(8.11) write to normal stripe should NOT fail"

	echo "foo" >> $name && error "(8.12) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"

	touch $name || error "(8.14) cannot touch $name"

	rm -f $name || error "(8.15) cannot unlink $name"

	#
	# ${fid2}-R-0 contains the old f2's first stripe in each COMP
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
	echo "Check $name, it contains f2's first stripe in each COMP"

	$LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L -I1 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(9.2.1) expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(9.2.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c -I1 $name)
	[ $stripes -eq 2 ] ||
		error "(9.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 $name)
	[ $stripes -eq 2 ] ||
		error "(9.3.2) expect 2 stripes, but got $stripes"

	e_start=$($LFS getstripe -I1 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 0 ] ||
		error "(9.4.1) expect the COMP1 start at 0, got $e_start"

	e_end=$($LFS getstripe -I1 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ $e_end -eq 2097152 ] ||
		error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 2097152 ] ||
		error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"

	size=$(stat $name | awk '/Size:/ { print $2 }')
	# The second stripe in COMP was lost, so we do not know there
	# have ever been some data before. 'stat' will regard it as
	# no data on the lost stripe.
	# NOTE(review): that leaves three full stripes, i.e. 256 * 3 blocks
	# instead of the 256 * 3 + 1 written; confirm against a real run.
	bcount=$((256 * 3))
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(9.6) expect size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null &&
		error "(9.7) normal read $name should fail"

	failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
		   bs=4096 2>&1 | grep -c "Input/output error")
	[ $failures -eq 512 ] ||
		error "(9.8) expect 512 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	# The second stripe in COMP was lost, so we do not know there
	# have ever been some data before. Since 'dd' skip failure,
	# it will regard the lost stripe contains data.
	# NOTE(review): 512 readable plus 512 padded failed blocks gives
	# 256 * 4 blocks in the dump; confirm against a real run.
	bcount=$((256 * 4))
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(9.9) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=300 && error "(9.10) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
		error "(9.11) write to normal stripe should NOT fail"

	echo "foo" >> $name &&
		error "(9.12) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name ||
		error "(9.13) cannot chown on $name"

	touch $name || error "(9.14) cannot touch $name"

	rm -f $name || error "(9.15) cannot unlink $name"
}
run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# test_21: start LFSCK on MDT0000 without naming a component (speed limit
# "-s 1", reset "-r"), check that both the namespace and the layout
# component report status "scanning-phase1", then stop LFSCK again.
# Skipped when the MDS version is below 2.5.59.
test_21() {
	if [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]]; then
		skip "ignore the test if MDS is older than 2.5.59" && return
	fi

	local mdt_dev=${FSNAME}-MDT0000
	local STATUS

	check_mount_and_prep
	if ! createmany -o $DIR/$tdir/f 100; then
		error "(0) Fail to create 100 files"
	fi

	echo "Start all LFSCK components by default (-s 1)"
	if ! do_facet mds1 $LCTL lfsck_start -M $mdt_dev -s 1 -r; then
		error "Fail to start LFSCK"
	fi

	echo "namespace LFSCK should be in 'scanning-phase1' status"
	STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
	if [ "$STATUS" != "scanning-phase1" ]; then
		error "Expect namespace 'scanning-phase1', but got '$STATUS'"
	fi

	echo "layout LFSCK should be in 'scanning-phase1' status"
	STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
	if [ "$STATUS" != "scanning-phase1" ]; then
		error "Expect layout 'scanning-phase1', but got '$STATUS'"
	fi

	echo "Stop all LFSCK components by default"
	if ! do_facet mds1 $LCTL lfsck_stop -M $mdt_dev; then
		error "Fail to stop LFSCK"
	fi
}
run_test 21 "run all LFSCK components by default"
# test_22a: create foo/dummy on MDT0 while fail_loc 0x161e
# (OBD_FAIL_LFSCK_BAD_PARENT) is set on $SINGLEMDS, remove the "guard"
# directory, run namespace LFSCK on all targets and expect exactly one
# entry in unmatched_pairs_repaired; "ls" on foo/dummy must work afterwards.
# Needs at least two MDTs.
test_22a() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	# The inner double quotes are escaped so that ".." is printed with
	# its quotes instead of silently closing and reopening the string.
	echo "The parent_A references the child directory via some name entry,"
	echo "but the child directory back references another parent_B via its"
	echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
	echo "LFSCK will repair the child directory's \"..\" name entry."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "The dummy's dotdot name entry references the guard."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "'ls' should success after namespace LFSCK repairing"
	ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
		error "(8) ls should success."
}
run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b: create foo/dummy on MDT0 while fail_loc 0x161e
# (OBD_FAIL_LFSCK_BAD_PARENT) is set on $SINGLEMDS, check that fid2path
# does not resolve dummy's FID to its real path, run namespace LFSCK on all
# targets, expect exactly one entry in unmatched_pairs_repaired, and check
# that fid2path resolves the FID afterwards. Needs at least two MDTs.
test_22b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	# The inner double quotes are escaped so that ".." is printed with
	# its quotes instead of silently closing and reopening the string.
	echo "The parent_A references the child directory via the name entry_B,"
	echo "but the child directory back references another parent_C via its"
	echo "\"..\" name entry. The parent_C exists, but there is no the name"
	echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
	echo "the child directory's \"..\" name entry and its linkEA."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "and bad linkEA. The dummy's dotdot name entry references the"
	echo "guard. The dummy's linkEA references n non-exist name entry."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
	echo "fid2path should NOT work on the dummy's FID $dummyfid"
	local dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
		error "(4) fid2path works unexpectedly."

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
	# dummyname is already declared local above; just re-assign it.
	dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
		error "(8) fid2path does not work"
}
run_test 22b "LFSCK can repair unmatched pairs (2)"
# test_23a: remove d0/d1 (on MDT1) while fail_loc 0x1620
# (OBD_FAIL_LFSCK_DANGLING2) is set on mds2, then run namespace LFSCK twice:
# first without -C, after which "ls" on d0/d1 is still expected to fail,
# then with -C, after which "ls" is expected to work. Each run must report
# dangling_repaired as 1. Needs at least two MDTs.
test_23a() {
	if [ $MDSCOUNT -lt 2 ]; then
		skip "needs >= 2 MDTs" && return
	fi

	local victim=$DIR/$tdir/d0/d1
	local repaired

	echo "The name entry is there, but the MDT-object for such name "
	echo "entry does not exist. The namespace LFSCK should find out "
	echo "and repair the inconsistency as required."

	check_mount_and_prep

	if ! $LFS mkdir -i 0 $DIR/$tdir/d0; then
		error "(1) Fail to mkdir d0 on MDT0"
	fi
	if ! $LFS mkdir -i 1 $victim; then
		error "(2) Fail to mkdir d1 on MDT1"
	fi

	echo "Inject failure stub on MDT1 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING2	0x1620
	do_facet mds2 $LCTL set_param fail_loc=0x1620
	if ! rmdir $victim; then
		error "(3) Fail to rmdir d1"
	fi
	do_facet mds2 $LCTL set_param fail_loc=0

	echo "'ls' should fail because of dangling name entry"
	if ls -ail $victim > /dev/null 2>&1; then
		error "(4) ls should fail."
	fi

	echo "Trigger namespace LFSCK to find out dangling name entry"
	if ! $START_NAMESPACE -A -r; then
		error "(5) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 6

	repaired=$($SHOW_NAMESPACE | awk '/^dangling_repaired/ { print $2 }')
	if ! [ $repaired -eq 1 ]; then
		error "(7) Fail to repair dangling name entry: $repaired"
	fi

	echo "'ls' should fail because not re-create MDT-object by default"
	if ls -ail $victim > /dev/null 2>&1; then
		error "(8) ls should fail."
	fi

	echo "Trigger namespace LFSCK again to repair dangling name entry"
	if ! $START_NAMESPACE -A -r -C; then
		error "(9) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 10

	repaired=$($SHOW_NAMESPACE | awk '/^dangling_repaired/ { print $2 }')
	if ! [ $repaired -eq 1 ]; then
		error "(11) Fail to repair dangling name entry: $repaired"
	fi

	echo "'ls' should success after namespace LFSCK repairing"
	if ! ls -ail $victim > /dev/null; then
		error "(12) ls should success."
	fi
}
run_test 23a "LFSCK can repair dangling name entry (1)"
# test_23b: namespace LFSCK repairs a dangling hard-link name entry.
#
# A hard link "foo" to f0 is created while OBD_FAIL_LFSCK_DANGLING3 is armed
# with f1's OID as fail_val, then f1 is unlinked. After "lfsck_start -r -C"
# completes, the test expects dangling_repaired == 1 and
# multiple_linked_repaired == 1, and "foo" must read back f0's content.
test_23b() {
	local d0=$DIR/$tdir/d0
	local seq_f0
	local seq_f1
	local victim_oid
	local fixed
	local content

	printf '%s\n' \
		"#####" \
		"The objectA has multiple hard links, one of them corresponding" \
		"to the name entry_B. But there is something wrong for the name" \
		"entry_B and cause entry_B to references non-exist object_C." \
		"In the first-stage scanning, the LFSCK will think the entry_B" \
		"as dangling, and re-create the lost object_C. When the LFSCK" \
		"comes to the second-stage scanning, it will find that the" \
		"former re-creating object_C is not proper, and will try to" \
		"replace the object_C with the real object_A." \
		"#####"

	check_mount_and_prep

	$LFS mkdir -i 0 $d0 || error "(1) Fail to mkdir d0 on MDT0"
	$LFS path2fid $d0

	createmany -o $d0/t 10 || error "(1.5) Fail to creatmany"

	echo "dummy" > $d0/f0 || error "(2) Fail to touch on MDT0"
	$LFS path2fid $d0/f0

	echo "dead" > $d0/f1 || error "(3) Fail to touch on MDT0"
	$LFS path2fid $d0/f1

	seq_f0=$($LFS path2fid $d0/f0 | awk -F':' '{print $1}')
	seq_f1=$($LFS path2fid $d0/f1 | awk -F':' '{print $1}')

	# f0 and f1 have to live in the same FID sequence; if they do not,
	# re-create f0.
	if [ "$seq_f0" != "$seq_f1" ]; then
		if ! rm -f $d0/f0; then
			error "(3.1) Fail to unlink $d0/f0"
		fi
		if ! echo "dummy" > $d0/f0; then
			error "(3.2) Fail to touch on MDT0"
		fi
		$LFS path2fid $d0/f0
	fi

	# The OID field of f1's FID, converted to decimal for fail_val.
	victim_oid=$($LFS path2fid $d0/f1 | awk -F':' '{print $2}')
	victim_oid=$(printf %d $victim_oid)

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_val=$victim_oid fail_loc=0x1621
	ln $d0/f0 $d0/foo || error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	# LU-7429: a creation issued after the injection could re-use the
	# local object (inode) released below, which is the one the dangling
	# name entry references, and that would defeat the injection. Remove
	# some other objects first so potential creations do not pick the
	# target object.
	unlinkmany $d0/t 10 || error "(5.0) Fail to unlinkmany"

	rm -f $d0/f1 || error "(5) Fail to unlink $d0/f1"

	echo "'ls' should fail because of dangling name entry"
	if ls -ail $d0/foo > /dev/null 2>&1; then
		error "(6) ls should fail."
	fi

	echo "Trigger namespace LFSCK to find out dangling name entry"
	if ! $START_NAMESPACE -r -C; then
		error "(7) Fail to start LFSCK for namespace"
	fi

	if ! wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32; then
		$SHOW_NAMESPACE
		error "(8) unexpected status"
	fi

	fixed=$($SHOW_NAMESPACE |
		awk '/^dangling_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(9) Fail to repair dangling name entry: $fixed"
	fi

	fixed=$($SHOW_NAMESPACE |
		awk '/^multiple_linked_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(10) Fail to drop the former created object: $fixed"
	fi

	content=$(cat $d0/foo)
	if [ "$content" != "dummy" ]; then
		error "(11) The $d0/foo is not recovered: $content"
	fi
}
run_test 23b "LFSCK can repair dangling name entry (2)"
# cleanup_23c: tear-down helper for test_23c.
#
# Clears fail_val/fail_loc on $SINGLEMDS, waits (up to 32) for the namespace
# LFSCK status on ${MDT_DEV} to read "completed", dumping the LFSCK state and
# raising error (10) otherwise, and finally stops full debug logging.
cleanup_23c() {
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	if ! wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32; then
		$SHOW_NAMESPACE
		error "(10) unexpected status"
	fi

	stop_full_debug_logging
}
# test_23c: a re-created dangling object that was modified must not be
# replaced by LFSCK.
#
# Same injection as test_23b (OBD_FAIL_LFSCK_DANGLING3 with f1's OID), but
# LFSCK is additionally delayed with OBD_FAIL_LFSCK_DELAY3 so the test can
# append data to the re-created "foo" (observed as a size-0 file) before the
# second-stage scanning. Afterwards dangling_repaired must be 1 and "foo"
# must NOT contain f0's original content.
#
# If LFSCK links "foo" back to the original object before any data can be
# written, the test logs that fact, cleans up and returns 0.
test_23c() {
	local parent_fid
	local f0_fid
	local f1_fid

	echo "#####"
	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. And then others"
	echo "modified the re-created object_C. When the LFSCK comes to the"
	echo "second-stage scanning, it will find that the former re-creating"
	echo "object_C maybe wrong and try to replace the object_C with the"
	echo "real object_A. But because object_C has been modified, so the"
	echo "LFSCK cannot replace it."
	echo "#####"

	start_full_debug_logging

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
	parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
	echo "parent_fid=$parent_fid"

	createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"

	echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
	f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
	echo "f0_fid=$f0_fid"

	echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
	f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
	echo "f1_fid=$f1_fid"

	# Compare the sequence part of both FIDs, i.e. everything before the
	# first ':'. This must use the variables assigned above (f0_fid and
	# f1_fid) and a glob-based strip; comparing unset variables would
	# make both sides empty and the guard below would never trigger.
	if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
		# To guarantee that the f0 and f1 are in the same FID seq
		rm -f $DIR/$tdir/d0/f0 ||
			error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
		echo "dummy" > $DIR/$tdir/d0/f0 ||
			error "(3.2) Fail to touch on MDT0"
		f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
		echo "f0_fid=$f0_fid (replaced)"
	fi

	# Second ':'-separated field of f1's FID, passed as fail_val below.
	local oid=$(awk -F':' '{ print $2 }' <<< $f1_fid)

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
	ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	# If there is creation after the dangling injection, it may re-use
	# the just released local object (inode) that is referenced by the
	# dangling name entry. It will fail the dangling injection.
	# So before deleting the target object for the dangling name entry,
	# remove some other objects to avoid the target object being reused
	# by some potential creations. LU-7429
	unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"

	rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"

	echo "'ls' should fail because of dangling name entry"
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(6) ls should fail."

	#define OBD_FAIL_LFSCK_DELAY3		0x1602
	do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -r -C ||
		error "(7) Fail to start LFSCK for namespace"

	# Wait until "foo" shows up as a size-0 file.
	wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
		# While unexpected by the test, it is valid for LFSCK to repair
		# the link to the original object before any data is written.
		local size=$(stat -c %s $DIR/$tdir/d0/foo)

		if [ "$size" = "6" ] &&
		   [ "$(<$DIR/$tdir/d0/foo)" = "dummy" ]; then
			log "LFSCK repaired file prematurely"
			cleanup_23c
			return 0
		fi

		stat $DIR/$tdir/d0/foo
		$SHOW_NAMESPACE
		error "(8) unexpected size"
	}

	# Modify the object behind "foo" before the delayed LFSCK goes on.
	echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
	cancel_lru_locks osc

	cleanup_23c

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^dangling_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(11) Fail to repair dangling name entry: $repaired"

	local data=$(cat $DIR/$tdir/d0/foo)
	[ "$data" != "dummy" ] ||
		error "(12) The $DIR/$tdir/d0/foo should not be recovered"
}
run_test 23c "LFSCK can repair dangling name entry (3)"
# test_24: namespace LFSCK repairs a multiple-referenced name entry.
#
# With OBD_FAIL_LFSCK_MUL_REF armed, d0/dummy/foo is created on MDT0 and its
# name entry removed again. After "lfsck_start -A -r" the test expects
# multiple_referenced_repaired == 1 and an entry named
# "<cfid>-<pfid>-D-0" under .lustre/lost+found/MDT0000/.
# Skipped with fewer than 2 MDTs; a skip is also reported when FILESET is set.
test_24() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	if [ -n "$FILESET" ]; then
		skip "Not functional for FILESET set"
	fi

	local d0=$DIR/$tdir/d0
	local lpf=$MOUNT/.lustre/lost+found/MDT0000
	local pfid
	local cfid
	local fixed
	local cname

	printf '%s\n' \
		"#####" \
		"Two MDT-objects back reference the same name entry via their" \
		"each own linkEA entry, but the name entry only references one" \
		"MDT-object. The namespace LFSCK will remove the linkEA entry" \
		"for the MDT-object that is not recognized. If such MDT-object" \
		"has no other linkEA entry after the removing, then the LFSCK" \
		"will add it as orphan under the .lustre/lost+found/MDTxxxx/." \
		"#####"

	check_mount_and_prep

	$LFS mkdir -i 1 $d0 || error "(1) Fail to mkdir d0"

	mkdir $d0/guard || error "(1) Fail to mkdir guard"
	$LFS path2fid $d0/guard

	mkdir $d0/dummy || error "(2) Fail to mkdir dummy"
	$LFS path2fid $d0/dummy

	# The parent FID used in the expected orphan name depends on the
	# backend type of $SINGLEMDS.
	if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
		pfid=$($LFS path2fid $d0/guard)
	else
		pfid=$($LFS path2fid $d0/dummy)
	fi

	if ! touch $d0/guard/foo; then
		error "(3) Fail to touch $d0/guard/foo"
	fi

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that" \
		"the $d0/dummy/foo has the 'bad' linkEA entry" \
		"that references $d0/guard/foo." \
		"Then remove the name entry $d0/dummy/foo." \
		"So the MDT-object $d0/dummy/foo will be left" \
		"there with the same linkEA entry as another MDT-object" \
		"$d0/guard/foo has"

	#define OBD_FAIL_LFSCK_MUL_REF		0x1622
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
	if ! $LFS mkdir -i 0 $d0/dummy/foo; then
		error "(4) Fail to mkdir $d0/dummy/foo"
	fi
	$LFS path2fid $d0/dummy/foo
	cfid=$($LFS path2fid $d0/dummy/foo)
	if ! rmdir $d0/dummy/foo; then
		error "(5) Fail to remove $d0/dummy/foo name entry"
	fi
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "stat $d0/dummy/foo should fail"
	if stat $d0/dummy/foo > /dev/null 2>&1; then
		error "(6) stat successfully unexpectedly"
	fi

	echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
	if ! $START_NAMESPACE -A -r; then
		error "(7) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 8

	fixed=$($SHOW_NAMESPACE |
		awk '/^multiple_referenced_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(9) Fail to repair multiple referenced name entry: $fixed"
	fi

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	if ! [ -d $lpf ]; then
		error "(10) $lpf/ should be there"
	fi

	cname="$cfid-$pfid-D-0"
	if ! ls -ail $lpf/$cname; then
		error "(11) .lustre/lost+found/MDT0000/ should not be empty"
	fi
}
run_test 24 "LFSCK can repair multiple-referenced name entry"
# test_25: namespace LFSCK repairs a wrong file type stored in a name entry.
#
# d0/foo is created while OBD_FAIL_LFSCK_BAD_TYPE is armed on $SINGLEMDS.
# After "lfsck_start -r" reaches "completed", bad_file_type_repaired must be
# 1 and listing d0 must succeed. ldiskfs only.
test_25() {
	[ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
		skip "ldiskfs only test" && return

	local d0=$DIR/$tdir/d0
	local fixed

	printf '%s\n' \
		"#####" \
		"The file type in the name entry does not match the file type" \
		"claimed by the referenced object. Then the LFSCK will update" \
		"the file type in the name entry." \
		"#####"

	check_mount_and_prep

	$LFS mkdir -i 0 $d0 || error "(1) Fail to mkdir d0"

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that" \
		"the file type stored in the name entry is wrong."

	#define OBD_FAIL_LFSCK_BAD_TYPE		0x1623
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
	touch $d0/foo || error "(2) Fail to touch $d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to repair bad file type in the name entry"
	$START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"

	if ! wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32; then
		$SHOW_NAMESPACE
		error "(4) unexpected status"
	fi

	fixed=$($SHOW_NAMESPACE |
		awk '/^bad_file_type_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(5) Fail to repair bad file type in name entry: $fixed"
	fi

	ls -ail $d0 || error "(6) Fail to 'ls' the $d0"
}
run_test 25 "LFSCK can repair bad file type in the name entry"
# test_26a: namespace LFSCK re-inserts a lost local name entry.
#
# d0/foo (with a second hard link d0/dummy) is unlinked while
# OBD_FAIL_LFSCK_NO_NAMEENTRY is armed. After "lfsck_start -r -A" the test
# expects lost_dirent_repaired == 1, d0/foo to be listable again and its FID
# to be unchanged.
test_26a() {
	local d0=$DIR/$tdir/d0
	local fid_before
	local fid_after
	local fixed

	printf '%s\n' \
		"#####" \
		"The local name entry back referenced by the MDT-object is lost." \
		"The namespace LFSCK will add the missing local name entry back" \
		"to the normal namespace." \
		"#####"

	check_mount_and_prep

	$LFS mkdir -i 0 $d0 || error "(1) Fail to mkdir d0"
	touch $d0/foo || error "(2) Fail to create foo"
	fid_before=$($LFS path2fid $d0/foo)

	if ! ln $d0/foo $d0/dummy; then
		error "(3) Fail to hard link to $d0/foo"
	fi

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that" \
		"foo's name entry will be removed, but the foo's object" \
		"and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $d0/foo || error "(4) Fail to unlink $d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	if ls -ail $d0/foo > /dev/null 2>&1; then
		error "(5) 'ls' should fail"
	fi

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	if ! $START_NAMESPACE -r -A; then
		error "(6) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 7

	fixed=$($SHOW_NAMESPACE |
		awk '/^lost_dirent_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(8) Fail to repair lost dirent: $fixed"
	fi

	if ! ls -ail $d0/foo; then
		error "(9) Fail to 'ls' $d0/foo"
	fi

	fid_after=$($LFS path2fid $d0/foo)
	if [ "$fid_before" != "$fid_after" ]; then
		error "(10) foo's FID changed: $fid_before, $fid_after"
	fi
}
run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# test_26b: namespace LFSCK re-inserts a lost remote name entry.
#
# d0 is created with "-i 1" and d0/foo with "-i 0"; foo is then removed while
# OBD_FAIL_LFSCK_NO_NAMEENTRY is armed. After "lfsck_start -r -A" the test
# expects lost_dirent_repaired == 1, d0/foo to be listable again and its FID
# to be unchanged. Skipped with fewer than 2 MDTs.
test_26b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	local d0=$DIR/$tdir/d0
	local fid_before
	local fid_after
	local fixed

	printf '%s\n' \
		"#####" \
		"The remote name entry back referenced by the MDT-object is lost." \
		"The namespace LFSCK will add the missing remote name entry back" \
		"to the normal namespace." \
		"#####"

	check_mount_and_prep

	$LFS mkdir -i 1 $d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $d0/foo || error "(2) Fail to mkdir foo"
	fid_before=$($LFS path2fid $d0/foo)

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that" \
		"foo's name entry will be removed, but the foo's object" \
		"and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $d0/foo || error "(3) Fail to rmdir $d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	if ls -ail $d0/foo > /dev/null 2>&1; then
		error "(4) 'ls' should fail"
	fi

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	if ! $START_NAMESPACE -r -A; then
		error "(5) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 6

	fixed=$($SHOW_NAMESPACE |
		awk '/^lost_dirent_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(7) Fail to repair lost dirent: $fixed"
	fi

	if ! ls -ail $d0/foo; then
		error "(8) Fail to 'ls' $d0/foo"
	fi

	fid_after=$($LFS path2fid $d0/foo)
	if [ "$fid_before" != "$fid_after" ]; then
		error "(9) foo's FID changed: $fid_before, $fid_after"
	fi
}
run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# test_27a: namespace LFSCK re-creates a lost local parent as an orphan.
#
# d0/foo and its hard link d0/dummy are unlinked while
# OBD_FAIL_LFSCK_NO_NAMEENTRY is armed, then d0 itself is removed. After
# "lfsck_start -r -A" the test expects lost_dirent_repaired == 1 and an entry
# whose name matches "*-P-*" under .lustre/lost+found/MDT0000/.
# A skip is reported when FILESET is set.
test_27a() {
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"

	echo "#####"
	echo "The local parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."
	echo "#####"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "another hard link and the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo ||
		error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	rm -f $DIR/$tdir/d0/dummy ||
		error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the lost parent"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	# The pattern is quoted so that find receives it verbatim; unquoted,
	# the shell could expand it against files in the current directory
	# before find ever runs.
	local cname
	cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
	[ -n "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# test_27b: namespace LFSCK re-creates a lost remote parent as an orphan.
#
# d0 is created with "-i 1" and d0/foo with "-i 0"; foo is removed while
# OBD_FAIL_LFSCK_NO_NAMEENTRY is armed, then d0 is removed. After
# "lfsck_start -r -A" the test expects lost_dirent_repaired == 1 and an entry
# whose name matches "*-P-*" under .lustre/lost+found/MDT0001/.
# Skipped with fewer than 2 MDTs; a skip is also reported when FILESET is set.
test_27b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"

	echo "#####"
	echo "The remote parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."
	echo "#####"

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"

	$LFS path2fid $DIR/$tdir/d0

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "There should be an orphan under .lustre/lost+found/MDT0001/"
	[ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0001/

	# The pattern is quoted so that find receives it verbatim; unquoted,
	# the shell could expand it against files in the current directory
	# before find ever runs.
	local cname
	cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
	[ -n "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0001/ should not be empty"
}
run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# test_28: orphan handling skips an MDT that failed during first-stage scan.
#
# d1/a1 and d2/a2 are removed while OBD_FAIL_LFSCK_NO_NAMEENTRY is armed on
# mds1 and mds2. A first "lfsck_start -r -A" runs with
# OBD_FAIL_LFSCK_BAD_NETWORK armed on mds2 and is expected to end "partial"
# on mds1 (0 lost_dirent_repaired) and "completed" on mds2 (1 repaired).
# A second run without the failure is expected to report 1 on mds1 and 0 on
# mds2. Skipped with fewer than 2 MDTs.
test_28() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	local top=$DIR/$tdir
	local fixed

	printf '%s\n' \
		"#####" \
		"The target name entry is lost. The LFSCK should insert the" \
		"orphan MDT-object under .lustre/lost+found/MDTxxxx. But if" \
		"the MDT (on which the orphan MDT-object resides) has ever" \
		"failed to respond some name entry verification during the" \
		"first stage-scanning, then the LFSCK should skip to handle" \
		"orphan MDT-object on this MDT. But other MDTs should not" \
		"be affected." \
		"#####"

	check_mount_and_prep
	$LFS mkdir -i 0 $top/d1
	$LFS mkdir -i 1 $top/d1/a1
	$LFS mkdir -i 1 $top/d1/a2

	$LFS mkdir -i 1 $top/d2
	$LFS mkdir -i 0 $top/d2/a1
	$LFS mkdir -i 0 $top/d2/a2

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that" \
		"d1/a1's name entry will be removed, but the d1/a1's object" \
		"and its linkEA are kept in the system. And the case that" \
		"d2/a2's name entry will be removed, but the d2/a2's object" \
		"and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet mds1 $LCTL set_param fail_loc=0x1624
	do_facet mds2 $LCTL set_param fail_loc=0x1624
	rmdir $top/d1/a1 || error "(1) Fail to rmdir $top/d1/a1"
	rmdir $top/d2/a2 || error "(2) Fail to rmdir $top/d2/a2"
	do_facet mds1 $LCTL set_param fail_loc=0
	do_facet mds2 $LCTL set_param fail_loc=0

	cancel_lru_locks mdc
	cancel_lru_locks osc

	printf '%s\n' \
		"Inject failure, to simulate the MDT0 fail to handle" \
		"MDT1 LFSCK request during the first-stage scanning."
	#define OBD_FAIL_LFSCK_BAD_NETWORK	0x161c
	do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0

	echo "Trigger namespace LFSCK on all devices to find out orphan object"
	if ! $START_NAMESPACE -r -A; then
		error "(3) Fail to start LFSCK for namespace"
	fi

	if ! wait_update_facet mds1 "$LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "partial" 32; then
		error "(4) mds1 is not the expected 'partial'"
	fi

	if ! wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32; then
		error "(5) mds2 is not the expected 'completed'"
	fi

	do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0

	# First pass: only mds2 is expected to have repaired a lost dirent.
	fixed=$(do_facet mds1 $LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_namespace |
		awk '/^lost_dirent_repaired/ { print $2 }')
	if ! [ $fixed -eq 0 ]; then
		error "(6) Expect 0 fixed on mds1, but got: $fixed"
	fi

	fixed=$(do_facet mds2 $LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^lost_dirent_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(7) Expect 1 fixed on mds2, but got: $fixed"
	fi

	echo "Trigger namespace LFSCK on all devices again to cleanup"
	if ! $START_NAMESPACE -r -A; then
		error "(8) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 9

	# Second pass: the remaining repair is expected on mds1.
	fixed=$(do_facet mds1 $LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_namespace |
		awk '/^lost_dirent_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(10) Expect 1 fixed on mds1, but got: $fixed"
	fi

	fixed=$(do_facet mds2 $LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^lost_dirent_repaired/ { print $2 }')
	if ! [ $fixed -eq 0 ]; then
		error "(11) Expect 0 fixed on mds2, but got: $fixed"
	fi
}
run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# test_29a: namespace LFSCK lowers an nlink count that is too large.
#
# A hard link to d0/foo is created while OBD_FAIL_LFSCK_MORE_NLINK is armed,
# which must leave foo with nlink 3. After "lfsck_start -r -A" the test
# expects nlinks_repaired == 1 and nlink 2.
# Note that the run_test line for this test is commented out below.
test_29a() {
	local d0=$DIR/$tdir/d0
	local nlink
	local fixed

	printf '%s\n' \
		"#####" \
		"The object's nlink attribute is larger than the object's known" \
		"name entries count. The LFSCK will repair the object's nlink" \
		"attribute to match the known name entries count" \
		"#####"

	check_mount_and_prep

	$LFS mkdir -i 0 $d0 || error "(1) Fail to mkdir d0"
	touch $d0/foo || error "(2) Fail to create foo"

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that foo's" \
		"nlink attribute is larger than its name entries count."

	#define OBD_FAIL_LFSCK_MORE_NLINK	0x1625
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
	if ! ln $d0/foo $d0/h1; then
		error "(3) Fail to hard link to $d0/foo"
	fi
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	cancel_lru_locks mdc
	nlink=$(stat --format=%h $d0/foo)
	if ! [ $nlink -eq 3 ]; then
		error "(4) Cannot inject error: $nlink"
	fi

	echo "Trigger namespace LFSCK to repair the nlink count"
	if ! $START_NAMESPACE -r -A; then
		error "(5) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 6

	fixed=$($SHOW_NAMESPACE |
		awk '/^nlinks_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(7) Fail to repair nlink count: $fixed"
	fi

	cancel_lru_locks mdc
	nlink=$(stat --format=%h $d0/foo)
	if ! [ $nlink -eq 2 ]; then
		error "(8) Fail to repair nlink count: $nlink"
	fi
}
# Disable 29a, we only allow nlink to be updated if the known linkEA
# entries is larger than nlink count.
#
#run_test 29a "LFSCK can repair bad nlink count (1)"
# test_29b: namespace LFSCK raises an nlink count that is too small.
#
# A hard link to d0/foo is created while OBD_FAIL_LFSCK_LESS_NLINK is armed,
# which must leave foo with nlink 1. After "lfsck_start -r -A" the test
# expects nlinks_repaired == 1 and nlink 2.
test_29b() {
	local d0=$DIR/$tdir/d0
	local nlink
	local fixed

	printf '%s\n' \
		"#####" \
		"The object's nlink attribute is smaller than the object's known" \
		"name entries count. The LFSCK will repair the object's nlink" \
		"attribute to match the known name entries count" \
		"#####"

	check_mount_and_prep

	$LFS mkdir -i 0 $d0 || error "(1) Fail to mkdir d0"
	touch $d0/foo || error "(2) Fail to create foo"

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that foo's" \
		"nlink attribute is smaller than its name entries count."

	#define OBD_FAIL_LFSCK_LESS_NLINK	0x1626
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
	if ! ln $d0/foo $d0/h1; then
		error "(3) Fail to hard link to $d0/foo"
	fi
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	cancel_lru_locks mdc
	nlink=$(stat --format=%h $d0/foo)
	if ! [ $nlink -eq 1 ]; then
		error "(4) Cannot inject error: $nlink"
	fi

	echo "Trigger namespace LFSCK to repair the nlink count"
	if ! $START_NAMESPACE -r -A; then
		error "(5) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 6

	fixed=$($SHOW_NAMESPACE |
		awk '/^nlinks_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(7) Fail to repair nlink count: $fixed"
	fi

	cancel_lru_locks mdc
	nlink=$(stat --format=%h $d0/foo)
	if ! [ $nlink -eq 2 ]; then
		error "(8) Fail to repair nlink count: $nlink"
	fi
}
run_test 29b "LFSCK can repair bad nlink count (2)"
# test_29c: verify the linkEA size limitation and its overflow mark.
#
# 150 hard links to guard/f0 are created under foo/, then 100 are removed.
# With 2 or more MDTs, "lfs migrate -m 1" of guard/ is expected to fail both
# before and after the removal (f0's FID stays the same). After
# "lfsck_start -r -A" the test expects linkea_overflow_cleared == 1 and
# nlinks_repaired == 0, and (with 2 or more MDTs) the migration to succeed.
test_29c()
{
	local guard=$DIR/$tdir/guard
	local links=$DIR/$tdir/foo
	local fid_orig
	local fid_now
	local fixed

	printf '%s\n' \
		"#####" \
		"The namespace LFSCK will create many hard links to the target" \
		"file as to exceed the linkEA size limitation. Under such case" \
		"the linkEA will be marked as overflow that will prevent the" \
		"target file to be migrated. Then remove some hard links to" \
		"make the left hard links to be held within the linkEA size" \
		"limitation. But before the namespace LFSCK adding all the" \
		"missed linkEA entries back, the overflow mark (timestamp)" \
		"will not be cleared." \
		"#####"

	check_mount_and_prep

	mkdir -p $guard || error "(0.1) Fail to mkdir"
	if ! $LFS mkdir -i $((MDSCOUNT - 1)) $links; then
		error "(0.2) Fail to mkdir"
	fi
	touch $guard/f0 || error "(1) Fail to create"
	fid_orig=$($LFS path2fid $guard/f0)

	# define MAX_LINKEA_SIZE	4096
	# sizeof(link_ea_header) = 24
	# sizeof(link_ea_entry) = 18
	# nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
	#	      (sizeof(link_ea_entry) + name_length))
	# With an average name length of 12 bytes, 150 hard links are
	# more than enough to overflow the linkEA.
	echo "Create 150 hard links should succeed although the linkEA overflow"
	if ! createmany -l $guard/f0 $links/ttttttttttt 150; then
		error "(2) Fail to hard link"
	fi

	cancel_lru_locks mdc
	if [ $MDSCOUNT -ge 2 ]; then
		if $LFS migrate -m 1 $guard 2>/dev/null; then
			error "(3.1) Migrate should fail"
		fi

		echo "The object with linkEA overflow should NOT be migrated"
		fid_now=$($LFS path2fid $guard/f0)
		if [ "$fid_now" != "$fid_orig" ]; then
			error "(3.2) Migrate should fail: $fid_now != $fid_orig"
		fi
	fi

	# After dropping 100 hard links the linkEA should have room for
	# the linkEA entries that were missed.
	echo "Remove 100 hard links to save space for the missed linkEA entries"
	unlinkmany $links/ttttttttttt 100 || error "(4) Fail to unlink"

	if [ $MDSCOUNT -ge 2 ]; then
		if $LFS migrate -m 1 $guard 2>/dev/null; then
			error "(5.1) Migrate should fail"
		fi

		# The overflow timestamp is still set, so migration fails.
		fid_now=$($LFS path2fid $guard/f0)
		if [ "$fid_now" != "$fid_orig" ]; then
			error "(5.2) Migrate should fail: $fid_now != $fid_orig"
		fi
	fi

	# sleep 3 seconds to guarantee that the overflow is recognized
	sleep 3

	echo "Trigger namespace LFSCK to clear the overflow timestamp"
	if ! $START_NAMESPACE -r -A; then
		error "(6) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 7

	fixed=$($SHOW_NAMESPACE |
		awk '/^linkea_overflow_cleared/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(8) Fail to clear linkea overflow: $fixed"
	fi

	fixed=$($SHOW_NAMESPACE |
		awk '/^nlinks_repaired/ { print $2 }')
	if ! [ $fixed -eq 0 ]; then
		error "(9) Unexpected nlink repaired: $fixed"
	fi

	if [ $MDSCOUNT -ge 2 ]; then
		if ! $LFS migrate -m 1 $guard 2>/dev/null; then
			error "(10.1) Migrate failure"
		fi

		# With the overflow timestamp cleared, migration should work.
		fid_now=$($LFS path2fid $guard/f0)
		if [ "$fid_now" == "$fid_orig" ]; then
			error "(10.2) Migrate should succeed"
		fi

		if ! ls -l $links > /dev/null; then
			error "(11) 'ls' failed after migration"
		fi
	fi

	rm -f $guard/f0 || error "(12) Fail to unlink f0"
	rm -rf $links || error "(13) Fail to rmdir foo"
}
run_test 29c "verify linkEA size limitation"
# test_30: namespace LFSCK recovers orphans from the backend /lost+found.
#
# foo/d0 is created while OBD_FAIL_LFSCK_NO_LINKEA is armed; then d0/d1,
# d0/f1, d0 and foo/f0 are removed while OBD_FAIL_LFSCK_NO_NAMEENTRY is
# armed. The client is unmounted, $SINGLEMDS is stopped, e2fsck is run with
# "-y" on $MDT_DEVNAME and the MDT is started again. After
# "lfsck_start -r -A" the test expects local_lost_found_moved >= 4, foo/f0
# to be back, and d0 (with d1 and f1 inside) to exist as
# "<cfid>-<pfid>-D-0" under .lustre/lost+found/MDT0000/.
# ldiskfs only; a skip is also reported when FILESET is set.
test_30() {
	[ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
		skip "ldiskfs only test" && return
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"

	echo "#####"
	echo "The namespace LFSCK will move the orphans from backend"
	echo "/lost+found directory to normal client visible namespace"
	echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
	echo "#####"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
	touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f1"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "directory d0 has no linkEA entry, then the LFSCK will"
	echo "move it into .lustre/lost+found/MDTxxxx/ later."

	#define OBD_FAIL_LFSCK_NO_LINKEA	0x161d
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
	mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Parent and child FIDs make up the orphan name checked at the end.
	local pfid=$($LFS path2fid $DIR/$tdir/foo)
	local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)

	touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
	mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "object's name entry will be removed, but not destroy the"
	echo "object. Then backend e2fsck will handle it as orphan and"
	echo "add them into the backend /lost+found directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
	rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
	rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
	rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	umount_client $MOUNT || error "(10) Fail to stop client!"

	stop $SINGLEMDS || error "(11) Fail to stop MDT0"

	echo "run e2fsck"
	run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
		error "(12) Fail to run e2fsck"

	start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
		error "(13) Fail to start MDT0"

	echo "Trigger namespace LFSCK to recover backend orphans"
	$START_NAMESPACE -r -A ||
		error "(14) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 15

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^local_lost_found_moved/ { print $2 }')
	[ $repaired -ge 4 ] ||
		error "(16) Fail to recover backend orphans: $repaired"

	mount_client $MOUNT || error "(17) Fail to start client!"

	stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	# cname is a path assembled from known parts and is therefore never
	# empty; what has to be verified is that the orphan directory exists.
	local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
	[ -d "$cname" ] || error "(20) d0 is not recovered"

	stat ${cname}/d1 || error "(21) d1 is not recovered"
	stat ${cname}/f1 || error "(22) f1 is not recovered"
}
run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# Test 31a: a name entry under a striped directory that was inserted into a
# shard not matching its name hash must be repaired by the namespace LFSCK.
# Skipped unless at least two MDTs are configured.
4728 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4731 echo "For the name entry under a striped directory, if the name"
4732 echo "hash does not match the shard, then the LFSCK will repair"
4733 echo "the bad name entry"
4736 check_mount_and_prep
# Create a directory striped over all MDTs (-c $MDSCOUNT) starting at MDT index 0.
4738 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4739 error "(1) Fail to create striped directory"
4741 echo "Inject failure stub on client to simulate the case that"
4742 echo "some name entry should be inserted into other non-first"
4743 echo "shard, but inserted into the first shard by wrong"
# The fail_loc is set with a plain $LCTL (no do_facet), i.e. on the node running
# this script. fail_val=0 presumably selects the first shard -- TODO confirm.
4745 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4746 $LCTL set_param fail_loc=0x1628 fail_val=0
4747 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4748 error "(2) Fail to create file under striped directory"
# Clear the fault injection before LFSCK is started.
4749 $LCTL set_param fail_loc=0 fail_val=0
4751 echo "Trigger namespace LFSCK to repair bad name hash"
4752 $START_NAMESPACE -r -A ||
4753 error "(3) Fail to start LFSCK for namespace"
# Wait for the namespace LFSCK to reach "completed"; the trailing number looks
# like the step id used in the helper's error report -- TODO confirm.
4755 wait_all_targets_blocked namespace completed 4
# Read the name_hash_repaired counter from the LFSCK namespace status output.
4757 local repaired=$($SHOW_NAMESPACE |
4758 awk '/^name_hash_repaired/ { print $2 }')
4759 [ $repaired -ge 1 ] ||
4760 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before verifying, presumably to drop cached client state.
4762 umount_client $MOUNT || error "(6) umount failed"
4763 mount_client $MOUNT || error "(7) mount failed"
# Every directory created above must now be reachable and removable by name.
4765 for ((i = 0; i < $MDSCOUNT; i++)); do
4766 stat $DIR/$tdir/striped_dir/d$i ||
4767 error "(8) Fail to stat d$i after LFSCK"
4768 rmdir $DIR/$tdir/striped_dir/d$i ||
4769 error "(9) Fail to unlink d$i after LFSCK"
4772 rmdir $DIR/$tdir/striped_dir ||
4773 error "(10) Fail to remove the striped directory after LFSCK"
4775 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# Test 31b: same scenario as 31a, but the bad name entries are injected with
# fail_val=1 and the repair counter is read from mds2 instead of the first MDT.
# Skipped unless at least two MDTs are configured.
4778 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4781 echo "For the name entry under a striped directory, if the name"
4782 echo "hash does not match the shard, then the LFSCK will repair"
4783 echo "the bad name entry"
4786 check_mount_and_prep
# Create a directory striped over all MDTs (-c $MDSCOUNT) starting at MDT index 0.
4788 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4789 error "(1) Fail to create striped directory"
4791 echo "Inject failure stub on client to simulate the case that"
4792 echo "some name entry should be inserted into other non-second"
4793 echo "shard, but inserted into the secod shard by wrong"
# fail_val=1 presumably selects the second shard as the wrong target -- TODO confirm.
4795 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4796 $LCTL set_param fail_loc=0x1628 fail_val=1
# Create five entries per MDT while the injection is active.
4797 createmany -d $DIR/$tdir/striped_dir/d $((MDSCOUNT * 5)) ||
4798 error "(2) Fail to create file under striped directory"
# Clear the fault injection before LFSCK is started.
4799 $LCTL set_param fail_loc=0 fail_val=0
4801 echo "Trigger namespace LFSCK to repair bad name hash"
4802 $START_NAMESPACE -r -A ||
4803 error "(3) Fail to start LFSCK for namespace"
# Wait for the namespace LFSCK to reach "completed" on the targets.
4805 wait_all_targets_blocked namespace completed 4
# The counter is queried on mds2 here, not through $SHOW_NAMESPACE.
4807 local repaired=$(do_facet mds2 $LCTL get_param -n \
4808 mdd.$(facet_svc mds2).lfsck_namespace |
4809 awk '/^name_hash_repaired/ { print $2 }')
4810 echo "repaired $repaired name entries with bad hash"
4811 [ $repaired -ge 1 ] ||
4812 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before verifying, presumably to drop cached client state.
4814 umount_client $MOUNT || error "(6) umount failed"
4815 mount_client $MOUNT || error "(7) mount failed"
# Every directory created above must now be reachable and removable by name.
4817 for ((i = 0; i < $((MDSCOUNT * 5)); i++)); do
4818 stat $DIR/$tdir/striped_dir/d$i ||
4819 error "(8) Fail to stat d$i after LFSCK"
4820 rmdir $DIR/$tdir/striped_dir/d$i ||
4821 error "(9) Fail to unlink d$i after LFSCK"
4824 rmdir $DIR/$tdir/striped_dir ||
4825 error "(10) Fail to remove the striped directory after LFSCK"
4827 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Test 31c: when the master object of a striped directory lost its LMV EA and
# nothing was created under it afterwards, the namespace LFSCK must
# re-generate the master LMV EA. Skipped unless at least two MDTs exist.
4830 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4833 echo "For some reason, the master MDT-object of the striped directory"
4834 echo "may lost its master LMV EA. If nobody created files under the"
4835 echo "master directly after the master LMV EA lost, then the LFSCK"
4836 echo "should re-generate the master LMV EA."
4839 check_mount_and_prep
4841 echo "Inject failure stub on MDT0 to simulate the case that the"
4842 echo "master MDT-object of the striped directory lost the LMV EA."
# The injection is active only while the striped directory is being created.
4844 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4845 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4846 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4847 error "(1) Fail to create striped directory"
4848 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4850 echo "Trigger namespace LFSCK to re-generate master LMV EA"
4851 $START_NAMESPACE -r -A ||
4852 error "(2) Fail to start LFSCK for namespace"
# Wait for the namespace LFSCK to reach "completed" on the targets.
4854 wait_all_targets_blocked namespace completed 3
# Exactly one striped directory is expected to be reported as repaired.
4856 local repaired=$($SHOW_NAMESPACE |
4857 awk '/^striped_dirs_repaired/ { print $2 }')
4858 [ $repaired -eq 1 ] ||
4859 error "(4) Fail to re-generate master LMV EA: $repaired"
# Remount the client before verifying, presumably to drop cached client state.
4861 umount_client $MOUNT || error "(5) umount failed"
4862 mount_client $MOUNT || error "(6) mount failed"
# Listing the repaired directory must produce no output.
4864 local empty=$(ls $DIR/$tdir/striped_dir/)
4865 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
4867 rmdir $DIR/$tdir/striped_dir ||
4868 error "(8) Fail to remove the striped directory after LFSCK"
4870 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Test 31d: when the master object lost its LMV EA and a file was created
# under it afterwards, the namespace LFSCK must not re-generate the LMV EA;
# the directory is expected to become read-only instead.
# Skipped unless at least two MDTs are configured.
4873 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4876 echo "For some reason, the master MDT-object of the striped directory"
4877 echo "may lost its master LMV EA. If somebody created files under the"
4878 echo "master directly after the master LMV EA lost, then the LFSCK"
4879 echo "should NOT re-generate the master LMV EA, instead, it should"
4880 echo "change the broken striped dirctory as read-only to prevent"
4881 echo "further damage"
4884 check_mount_and_prep
4886 echo "Inject failure stub on MDT0 to simulate the case that the"
4887 echo "master MDT-object of the striped directory lost the LMV EA."
# The injection is active only while the striped directory is being created.
4889 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4890 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4891 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4892 error "(1) Fail to create striped directory"
4893 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# Remount, then modify the broken directory before LFSCK runs.
4895 umount_client $MOUNT || error "(2) umount failed"
4896 mount_client $MOUNT || error "(3) mount failed"
4898 touch $DIR/$tdir/striped_dir/dummy ||
4899 error "(4) Fail to touch under broken striped directory"
4901 echo "Trigger namespace LFSCK to find out the inconsistency"
4902 $START_NAMESPACE -r -A ||
4903 error "(5) Fail to start LFSCK for namespace"
# Wait for the namespace LFSCK to reach "completed" on the targets.
4905 wait_all_targets_blocked namespace completed 6
# No striped directory may be reported as repaired in this scenario.
4907 local repaired=$($SHOW_NAMESPACE |
4908 awk '/^striped_dirs_repaired/ { print $2 }')
4909 [ $repaired -eq 0 ] ||
4910 error "(7) Re-generate master LMV EA unexpected: $repaired"
# The file created before LFSCK must still be visible.
4912 stat $DIR/$tdir/striped_dir/dummy ||
4913 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# A new create must fail: success here means the directory is still writable.
4915 touch $DIR/$tdir/striped_dir/foo &&
4916 error "(9) The broken striped directory should be read-only"
# Drop the immutable attribute so the directory can be removed for cleanup.
4918 chattr -i $DIR/$tdir/striped_dir ||
4919 error "(10) Fail to chattr on the broken striped directory"
4921 rmdir $DIR/$tdir/striped_dir ||
4922 error "(11) Fail to remove the striped directory after LFSCK"
4924 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Test 31e: the slave object residing on the same MDT as the master lost its
# slave LMV EA; the namespace LFSCK must re-generate it.
# Skipped unless at least two MDTs are configured.
4927 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4930 echo "For some reason, the slave MDT-object of the striped directory"
4931 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4932 echo "slave LMV EA."
4935 check_mount_and_prep
4937 echo "Inject failure stub on MDT0 to simulate the case that the"
4938 echo "slave MDT-object (that resides on the same MDT as the master"
4939 echo "MDT-object resides on) lost the LMV EA."
# fail_val=0 presumably picks the shard on the master's MDT -- TODO confirm.
4941 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4942 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
4943 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4944 error "(1) Fail to create striped directory"
4945 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4947 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4948 $START_NAMESPACE -r -A ||
4949 error "(2) Fail to start LFSCK for namespace"
# Wait for the namespace LFSCK to reach "completed" on the targets.
4951 wait_all_targets_blocked namespace completed 3
# Exactly one shard is expected to be reported as repaired.
4953 local repaired=$($SHOW_NAMESPACE |
4954 awk '/^striped_shards_repaired/ { print $2 }')
4955 [ $repaired -eq 1 ] ||
4956 error "(4) Fail to re-generate slave LMV EA: $repaired"
4958 rmdir $DIR/$tdir/striped_dir ||
4959 error "(5) Fail to remove the striped directory after LFSCK"
4961 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Test 31f: the slave object residing on a different MDT than the master lost
# its slave LMV EA; the namespace LFSCK must re-generate it. The repair
# counter is read from mds2. Skipped unless at least two MDTs are configured.
4964 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4967 echo "For some reason, the slave MDT-object of the striped directory"
4968 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4969 echo "slave LMV EA."
4972 check_mount_and_prep
4974 echo "Inject failure stub on MDT0 to simulate the case that the"
4975 echo "slave MDT-object (that resides on different MDT as the master"
4976 echo "MDT-object resides on) lost the LMV EA."
# fail_val=1 presumably picks a shard on another MDT -- TODO confirm.
4978 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4979 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
4980 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4981 error "(1) Fail to create striped directory"
4982 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4984 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4985 $START_NAMESPACE -r -A ||
4986 error "(2) Fail to start LFSCK for namespace"
# Wait for the namespace LFSCK to reach "completed" on the targets.
4988 wait_all_targets_blocked namespace completed 3
# The counter is queried on mds2 here, not through $SHOW_NAMESPACE.
4990 local repaired=$(do_facet mds2 $LCTL get_param -n \
4991 mdd.$(facet_svc mds2).lfsck_namespace |
4992 awk '/^striped_shards_repaired/ { print $2 }')
4993 [ $repaired -eq 1 ] ||
4994 error "(4) Fail to re-generate slave LMV EA: $repaired"
4996 rmdir $DIR/$tdir/striped_dir ||
4997 error "(5) Fail to remove the striped directory after LFSCK"
4999 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Test 31g: the stripe index stored in a slave LMV EA is corrupted; the
# namespace LFSCK must repair it so the directory is usable again.
# Skipped unless at least two MDTs are configured.
5002 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5005 echo "For some reason, the stripe index in the slave LMV EA is"
5006 echo "corrupted. The LFSCK should repair the slave LMV EA."
5009 check_mount_and_prep
5011 echo "Inject failure stub on MDT0 to simulate the case that the"
5012 echo "slave LMV EA on the first shard of the striped directory"
5013 echo "claims the same index as the second shard claims"
# The injection is active only while the striped directory is being created.
5015 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
5016 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
5017 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5018 error "(1) Fail to create striped directory"
5019 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5021 echo "Trigger namespace LFSCK to repair the slave LMV EA"
5022 $START_NAMESPACE -r -A ||
5023 error "(2) Fail to start LFSCK for namespace"
# Wait for the namespace LFSCK to reach "completed" on the targets.
5025 wait_all_targets_blocked namespace completed 3
# Exactly one shard is expected to be reported as repaired.
5027 local repaired=$($SHOW_NAMESPACE |
5028 awk '/^striped_shards_repaired/ { print $2 }')
5029 [ $repaired -eq 1 ] ||
5030 error "(4) Fail to repair slave LMV EA: $repaired"
# Remount the client before verifying, presumably to drop cached client state.
5032 umount_client $MOUNT || error "(5) umount failed"
5033 mount_client $MOUNT || error "(6) mount failed"
# Create and unlink a file to show the repaired directory works.
5035 touch $DIR/$tdir/striped_dir/foo ||
5036 error "(7) Fail to touch file after the LFSCK"
5038 rm -f $DIR/$tdir/striped_dir/foo ||
5039 error "(8) Fail to unlink file after the LFSCK"
5041 rmdir $DIR/$tdir/striped_dir ||
5042 error "(9) Fail to remove the striped directory after LFSCK"
5044 run_test 31g "Repair the corrupted slave LMV EA"
# Test 31h: a shard's name entry in the striped directory is corrupted; the
# namespace LFSCK must repair it so the directory is usable again.
# Skipped unless at least two MDTs are configured.
5047 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5050 echo "For some reason, the shard's name entry in the striped"
5051 echo "directory may be corrupted. The LFSCK should repair the"
5052 echo "bad shard's name entry."
5055 check_mount_and_prep
5057 echo "Inject failure stub on MDT0 to simulate the case that the"
5058 echo "first shard's name entry in the striped directory claims"
5059 echo "the same index as the second shard's name entry claims."
# The injection is active only while the striped directory is being created.
5061 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
5062 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5063 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5064 error "(1) Fail to create striped directory"
5065 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5067 echo "Trigger namespace LFSCK to repair the shard's name entry"
5068 $START_NAMESPACE -r -A ||
5069 error "(2) Fail to start LFSCK for namespace"
# Wait for the namespace LFSCK to reach "completed" on the targets.
5071 wait_all_targets_blocked namespace completed 3
# Exactly one directory entry is expected to be reported as repaired.
5073 local repaired=$($SHOW_NAMESPACE |
5074 awk '/^dirent_repaired/ { print $2 }')
5075 [ $repaired -eq 1 ] ||
5076 error "(4) Fail to repair shard's name entry: $repaired"
# Remount the client before verifying, presumably to drop cached client state.
5078 umount_client $MOUNT || error "(5) umount failed"
5079 mount_client $MOUNT || error "(6) mount failed"
# Create and unlink a file to show the repaired directory works.
5081 touch $DIR/$tdir/striped_dir/foo ||
5082 error "(7) Fail to touch file after the LFSCK"
5084 rm -f $DIR/$tdir/striped_dir/foo ||
5085 error "(8) Fail to unlink file after the LFSCK"
5087 rmdir $DIR/$tdir/striped_dir ||
5088 error "(9) Fail to remove the striped directory after LFSCK"
5090 run_test 31h "Repair the corrupted shard's name entry"
# Test 32a: the layout LFSCK can still be stopped after OST1 has been shut
# down while the scan is in phase 1.
5095 umount_client $MOUNT
# Slow the LFSCK engine down so the scan is still in phase 1 when it is checked
# below; fail_val=3 is presumably the delay in seconds -- TODO confirm.
5097 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5098 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5099 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
5101 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5102 [ "$STATUS" == "scanning-phase1" ] ||
5103 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
# Take OST1 away while the layout LFSCK is running.
5106 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
5108 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The stop request itself is the behavior under test.
5112 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
# Bring OST1 back for the tests that follow.
5114 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5115 error "(5) Fail to start ost1"
5117 run_test 32a "stop LFSCK when some OST failed"
# Test 32b: the namespace LFSCK can still be stopped after MDT2 has been shut
# down while the scan is in phase 1. Skipped unless at least two MDTs exist.
5121 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Build a parent directory on MDT index 1 with two striped children starting
# at MDT index 0.
5124 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5125 error "(1) Fail to create $DIR/$tdir/dp"
5126 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5127 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5128 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5129 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5130 umount_client $MOUNT
# Slow the LFSCK engine down so the scan is still in phase 1 when it is polled
# below; fail_val=3 is presumably the delay in seconds -- TODO confirm.
5132 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5133 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5134 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
# Poll the status field on the first MDT until it reads "scanning-phase1"
# (last argument 32 is presumably the timeout in seconds -- TODO confirm).
5136 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5137 mdd.${MDT_DEV}.lfsck_namespace |
5138 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5140 error "(5) unexpected status"
# Take MDT2 away while the namespace LFSCK is running.
5144 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
5146 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The stop request itself is the behavior under test.
5150 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
# Bring MDT2 back for the tests that follow.
5152 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5153 error "(8) Fail to start MDT2"
5155 run_test 32b "stop LFSCK when some MDT failed"
# Test 33: the options given to lfsck_start must show up in the "param" line
# of the corresponding LFSCK status output.
# Layout LFSCK started with --dryrun -o -r is expected to report
# "dryrun,all_targets,orphan".
5161 $START_LAYOUT --dryrun -o -r ||
5162 error "(1) Fail to start layout LFSCK"
5163 wait_all_targets_blocked layout completed 2
5165 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5166 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5167 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# Namespace LFSCK started with -e abort -A -r is expected to report
# "failout,all_targets".
5169 $START_NAMESPACE -e abort -A -r ||
5170 error "(4) Fail to start namespace LFSCK"
5171 wait_all_targets_blocked namespace completed 5
5173 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5174 [ "$PARAMS" == "failout,all_targets" ] ||
5175 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5177 run_test 33 "check LFSCK paramters"
# Test 34: the namespace LFSCK rebuilds a lost agent object; a second run must
# then find nothing left to repair. Runs only with >= 2 MDTs on a ZFS backend.
5181 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5182 [ $(facet_fstype $SINGLEMDS) != zfs ] &&
5183 skip "Only valid for ZFS backend" && return
# Create a remote directory on MDT index 1 while the injection is active.
5187 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
5188 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5189 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5190 error "(1) Fail to create $DIR/$tdir/dummy"
5192 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# First pass: poll the first MDT until the namespace LFSCK reports "completed".
5193 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
5194 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5195 mdd.${MDT_DEV}.lfsck_namespace |
5196 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5198 error "(3) unexpected status"
# Exactly one directory entry is expected to be repaired by the first pass.
5201 local repaired=$($SHOW_NAMESPACE |
5202 awk '/^dirent_repaired/ { print $2 }')
5203 [ $repaired -eq 1 ] ||
5204 error "(4) Fail to repair the lost agent object: $repaired"
# Second pass: nothing should need repairing any more.
5206 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5207 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5208 mdd.${MDT_DEV}.lfsck_namespace |
5209 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5211 error "(6) unexpected status"
5214 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5215 [ $repaired -eq 0 ] ||
5216 error "(7) Unexpected repairing: $repaired"
5218 run_test 34 "LFSCK can rebuild the lost agent object"
# Test 35: the namespace LFSCK rebuilds a lost agent entry; after the servers
# are set up again, a second run must find nothing left to repair.
# Skipped unless at least two MDTs are configured.
5222 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Create a remote directory on MDT index 1 while the injection is active on mds2.
5226 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
5227 do_facet mds2 $LCTL set_param fail_loc=0x1631
5228 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5229 error "(1) Fail to create $DIR/$tdir/dummy"
5232 do_facet mds2 $LCTL set_param fail_loc=0
# First pass: poll mds2 until its namespace LFSCK reports "completed".
# The messages below name MDS2 explicitly: no loop variable exists in this
# test, so the former "MDS${k}" expanded to a bare "MDS".
5233 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
5234 wait_update_facet mds2 "$LCTL get_param -n \
5235 mdd.$(facet_svc mds2).lfsck_namespace |
5236 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5237 error "(3) MDS2 is not the expected 'completed'"
# Exactly one agent entry is expected to be repaired by the first pass.
5239 local repaired=$(do_facet mds2 $LCTL get_param -n \
5240 mdd.$(facet_svc mds2).lfsck_namespace |
5241 awk '/^agent_entries_repaired/ { print $2 }')
5242 [ $repaired -eq 1 ] ||
5243 error "(4) Fail to repair the lost agent entry: $repaired"
5245 echo "stopall to cleanup object cache"
5248 setupall > /dev/null
# Second pass: nothing should need repairing any more.
5250 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
5251 wait_update_facet mds2 "$LCTL get_param -n \
5252 mdd.$(facet_svc mds2).lfsck_namespace |
5253 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5254 error "(6) MDS2 is not the expected 'completed'"
5256 repaired=$(do_facet mds2 $LCTL get_param -n \
5257 mdd.$(facet_svc mds2).lfsck_namespace |
5258 awk '/^agent_entries_repaired/ { print $2 }')
5259 [ $repaired -eq 0 ] ||
5260 error "(7) Unexpected repairing: $repaired"
5262 run_test 35 "LFSCK can rebuild the lost agent entry"
# Test 36a: each of three mirrored files loses one mirror from its LOV EA; the
# layout LFSCK (orphan handling enabled) must rebuild the missing mirror.
# Skipped unless at least three OSTs are configured.
5265 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5268 echo "The target MDT-object's LOV EA corrupted as to lose one of the "
5269 echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
5270 echo "with the PFID EA of related OST-object(s) belong to the mirror."
5273 check_mount_and_prep
# Print grant parameters now and register the same dump (plus lfs df output)
# through stack_trap, presumably to run when the test ends -- TODO confirm.
5277 lctl get_param osc.*.*grant*
5278 stack_trap "lfs df $DIR; lfs df -i $DIR; lctl get_param osc.*.*grant*"
# Create three files with the same three-mirror, two-component-per-mirror layout.
5280 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5281 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5282 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5283 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5284 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
5285 error "(1) Fail to create mirror file $DIR/$tdir/f1"
5286 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5287 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
5288 error "(2) Fail to create mirror file $DIR/$tdir/f2"
# Write 4 MiB to each file, then resync the mirrors.
5290 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5291 error "(3) Fail to write $DIR/$tdir/f0"
5292 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
5293 error "(4) Fail to write $DIR/$tdir/f1"
5294 dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
5295 error "(5) Fail to write $DIR/$tdir/f2"
5297 $LFS mirror resync $DIR/$tdir/f0 ||
5298 error "(6) Fail to resync $DIR/$tdir/f0"
5299 $LFS mirror resync $DIR/$tdir/f1 ||
5300 error "(7) Fail to resync $DIR/$tdir/f1"
5301 $LFS mirror resync $DIR/$tdir/f2 ||
5302 error "(8) Fail to resync $DIR/$tdir/f2"
5304 cancel_lru_locks mdc
5305 cancel_lru_locks osc
# Show the layouts before the corruption is injected.
5307 $LFS getstripe $DIR/$tdir/f0 ||
5308 error "(9) Fail to getstripe for $DIR/$tdir/f0"
5309 $LFS getstripe $DIR/$tdir/f1 ||
5310 error "(10) Fail to getstripe for $DIR/$tdir/f1"
5311 $LFS getstripe $DIR/$tdir/f2 ||
5312 error "(11) Fail to getstripe for $DIR/$tdir/f2"
5314 echo "Inject failure, to simulate the case of missing one mirror in LOV"
5315 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5316 do_facet mds1 $LCTL set_param fail_loc=0x1616
# Split a different mirror id off each file while the injection is active.
5318 $LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
5319 error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
5320 $LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
5321 error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
5322 $LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
5323 error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"
5327 do_facet mds1 $LCTL set_param fail_loc=0
# The split mirror id must no longer appear in each file's layout ...
5329 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
5330 error "(15) The 1st of mirror is not destroyed"
5331 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
5332 error "(16) The 2nd of mirror is not destroyed"
5333 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
5334 error "(17) The 3rd of mirror is not destroyed"
# ... and each file must report two mirrors.
5338 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5339 [ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
5340 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5341 [ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
5342 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5343 [ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"
5345 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5346 $START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"
5348 for k in $(seq $MDSCOUNT); do
5349 # The LFSCK status query interval is 30 seconds. For the case
5350 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5351 # time to guarantee the status sync up.
5352 wait_update_facet mds${k} "$LCTL get_param -n \
5353 mdd.$(facet_svc mds${k}).lfsck_layout |
5354 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5355 error "(22) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well (checked once, without polling).
5358 for k in $(seq $OSTCOUNT); do
5359 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5360 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5361 awk '/^status/ { print $2 }')
5362 [ "$cur_status" == "completed" ] ||
5363 error "(23) OST${k} Expect 'completed', but got '$cur_status'"
# Nine repaired orphans are expected on mds1.
5366 local repaired=$(do_facet mds1 $LCTL get_param -n \
5367 mdd.$(facet_svc mds1).lfsck_layout |
5368 awk '/^repaired_orphan/ { print $2 }')
5369 [ $repaired -eq 9 ] ||
5370 error "(24) Expect 9 fixed on mds1, but got: $repaired"
# After the repair each file must report three mirrors again ...
5372 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5373 [ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
5374 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5375 [ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
5376 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5377 [ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"
# ... and the previously split mirror id must be back in the layout.
5379 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
5380 $LFS getstripe $DIR/$tdir/f0
5381 error "(28) The 1st of mirror is not recovered"
5384 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
5385 $LFS getstripe $DIR/$tdir/f1
5386 error "(29) The 2nd of mirror is not recovered"
5389 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
5390 $LFS getstripe $DIR/$tdir/f2
5391 error "(30) The 3rd of mirror is not recovered"
5394 run_test 36a "rebuild LOV EA for mirrored file (1)"
# Test 36b: a mirrored file loses its MDT-object while its OST-objects stay;
# the layout LFSCK must re-create the file under .lustre/lost+found with all
# three mirrors. Skipped when FILESET is set or fewer than three OSTs exist.
5397 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5398 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5401 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5402 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5403 echo "with the PFID EA of related OST-object(s) belong to the file. "
5406 check_mount_and_prep
# Create one file with three mirrors of two components each.
5408 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5409 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5410 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# Remember the FID: it is used below to build the expected lost+found name.
5412 local fid=$($LFS path2fid $DIR/$tdir/f0)
5414 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5415 error "(1) Fail to write $DIR/$tdir/f0"
5416 $LFS mirror resync $DIR/$tdir/f0 ||
5417 error "(2) Fail to resync $DIR/$tdir/f0"
5419 cancel_lru_locks mdc
5420 cancel_lru_locks osc
5422 $LFS getstripe $DIR/$tdir/f0 ||
5423 error "(3) Fail to getstripe for $DIR/$tdir/f0"
5425 echo "Inject failure, to simulate the case of missing the MDT-object"
# Remove the file while the injection is active on mds1.
5426 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5427 do_facet mds1 $LCTL set_param fail_loc=0x1616
5428 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
5432 do_facet mds1 $LCTL set_param fail_loc=0
5434 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5435 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
5437 for k in $(seq $MDSCOUNT); do
5438 # The LFSCK status query interval is 30 seconds. For the case
5439 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5440 # time to guarantee the status sync up.
5441 wait_update_facet mds${k} "$LCTL get_param -n \
5442 mdd.$(facet_svc mds${k}).lfsck_layout |
5443 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5444 error "(6) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well (checked once, without polling).
5447 for k in $(seq $OSTCOUNT); do
5448 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5449 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5450 awk '/^status/ { print $2 }')
5451 [ "$cur_status" == "completed" ] ||
5452 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
# Nine repaired orphans are expected on mds1.
5455 local count=$(do_facet mds1 $LCTL get_param -n \
5456 mdd.$(facet_svc mds1).lfsck_layout |
5457 awk '/^repaired_orphan/ { print $2 }')
5458 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# The re-created file is looked up as <fid>-R-0 under lost+found/MDT0000 and
# must have 3 mirrors and 6 components.
5460 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5461 count=$($LFS getstripe --mirror-count $name)
5462 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5464 count=$($LFS getstripe --component-count $name)
5465 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
# Each of the three mirror ids must appear in the rebuilt layout.
5467 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5468 $LFS getstripe $name
5469 error "(11) The 1st of mirror is not recovered"
5472 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5473 $LFS getstripe $name
5474 error "(12) The 2nd of mirror is not recovered"
5477 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5478 $LFS getstripe $name
5479 error "(13) The 3rd of mirror is not recovered"
5482 run_test 36b "rebuild LOV EA for mirrored file (2)"
# Test 36c: a mirrored file is modified after its last resync and then loses
# its MDT-object; the layout LFSCK must re-create it under .lustre/lost+found
# with the same component flags it had before the loss.
# Skipped when FILESET is set or fewer than three OSTs exist.
5485 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5486 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5489 echo "The mirrored file has been modified, not resynced yet, then "
5490 echo "lost its MDT-object, but relatd OST-objects are still there. "
5491 echo "The layout LFSCK should rebuild the LOV EA and relatd status "
5492 echo "with the PFID EA of related OST-object(s) belong to the file. "
5495 check_mount_and_prep
# NOTE(review): the embedded numbering jumps from 5497 to 5499, so the line
# that completes this setstripe command is not visible here -- confirm it
# against the full script before editing the command.
5497 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5499 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# Remember the FID: it is used below to build the expected lost+found name.
5501 local fid=$($LFS path2fid $DIR/$tdir/f0)
5503 # The 1st dd && resync makes all related OST-objects have been written
5504 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5505 error "(1.1) Fail to write $DIR/$tdir/f0"
5506 $LFS mirror resync $DIR/$tdir/f0 ||
5507 error "(1.2) Fail to resync $DIR/$tdir/f0"
5508 # The 2nd dd makes one mirror to be stale
5509 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5510 error "(1.3) Fail to write $DIR/$tdir/f0"
5512 cancel_lru_locks mdc
5513 cancel_lru_locks osc
5515 $LFS getstripe $DIR/$tdir/f0 ||
5516 error "(2) Fail to getstripe for $DIR/$tdir/f0"
# Save the lcme_flags found in the first and last ten lines of the getstripe
# output, for comparison after the repair.
5518 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5519 awk '/lcme_flags/ { print $2 }')
5520 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5521 awk '/lcme_flags/ { print $2 }')
5523 echo "Inject failure, to simulate the case of missing the MDT-object"
# Remove the file while the injection is active on mds1.
5524 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5525 do_facet mds1 $LCTL set_param fail_loc=0x1616
5526 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
5530 do_facet mds1 $LCTL set_param fail_loc=0
5532 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5533 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
5535 for k in $(seq $MDSCOUNT); do
5536 # The LFSCK status query interval is 30 seconds. For the case
5537 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5538 # time to guarantee the status sync up.
5539 wait_update_facet mds${k} "$LCTL get_param -n \
5540 mdd.$(facet_svc mds${k}).lfsck_layout |
5541 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5542 error "(5) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well (checked once, without polling).
5545 for k in $(seq $OSTCOUNT); do
5546 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5547 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5548 awk '/^status/ { print $2 }')
5549 [ "$cur_status" == "completed" ] ||
5550 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
# Six repaired orphans are expected on mds1; the message states the same
# number the check uses (it previously said 9).
5553 local count=$(do_facet mds1 $LCTL get_param -n \
5554 mdd.$(facet_svc mds1).lfsck_layout |
5555 awk '/^repaired_orphan/ { print $2 }')
5556 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
# The re-created file is looked up as <fid>-R-0 under lost+found/MDT0000 and
# must have 2 mirrors and 4 components.
5558 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5559 count=$($LFS getstripe --mirror-count $name)
5560 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5562 count=$($LFS getstripe --component-count $name)
5563 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
# The flags at the head and tail of the rebuilt layout must match those saved
# before the MDT-object was lost.
5565 local flags=$($LFS getstripe $name | head -n 10 |
5566 awk '/lcme_flags/ { print $2 }')
5567 [ "$flags" == "$saved_flags1" ] || {
5568 $LFS getstripe $name
5569 error "(10) expect flags $saved_flags1, got $flags"
5572 flags=$($LFS getstripe $name | tail -n 10 |
5573 awk '/lcme_flags/ { print $2 }')
5574 [ "$flags" == "$saved_flags2" ] || {
5575 $LFS getstripe $name
5576 error "(11) expect flags $saved_flags2, got $flags"
5579 run_test 36c "rebuild LOV EA for mirrored file (3)"
# Test 37: run the namespace LFSCK while a background multiop is paused on a
# directory created on MDT index 0, and wait for the scan to complete.
5585 local t_dir="$DIR/$tdir/d0"
5586 check_mount_and_prep
5588 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
5589 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
# Signal the paused multiop before reporting the failure: if error() aborts
# the test (presumably it does -- TODO confirm against the test framework), a
# kill placed after it would never run and the background process would be
# left behind. $PID is set on a line not visible in this listing.
5593 $START_NAMESPACE -r -A || {
5594 kill -USR1 $PID; error "(3) Fail to start LFSCK for namespace!"; }
# Wait for the namespace LFSCK to reach "completed" on the targets.
5596 wait_all_targets_blocked namespace completed 4
5601 run_test 37 "LFSCK must skip a ORPHAN"
5605 [[ $MDS1_VERSION -le $(version_code 2.12.51) ]] &&
5606 skip "Need MDS version newer than 2.12.51"
5608 test_mkdir $DIR/$tdir
5609 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5610 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5612 # create foreign file
5613 $LFS setstripe --foreign=daos --flags 0xda05 \
5614 -x "${uuid1}@${uuid2}" $DIR/$tdir/$tfile ||
5615 error "$DIR/$tdir/$tfile: create failed"
5617 $LFS getstripe -v $DIR/$tdir/$tfile |
5618 grep "lfm_magic:.*0x0BD70BD0" ||
5619 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5620 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5621 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5622 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5623 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*daos" ||
5624 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5625 $LFS getstripe -v $DIR/$tdir/$tfile |
5626 grep "lfm_flags:.*0x0000DA05" ||
5627 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5628 $LFS getstripe $DIR/$tdir/$tfile |
5629 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5630 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5632 # modify striping should fail
5633 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5634 error "$DIR/$tdir/$tfile: setstripe should fail"
# Run namespace LFSCK, then layout LFSCK, and require that neither of them
# reports any repair for the file system holding the foreign file.
local repaired

if ! $START_NAMESPACE -r -A; then
	error "Fail to start LFSCK for namespace"
fi

wait_all_targets_blocked namespace completed 1

# check that "global" namespace_repaired == 0 !!!
# (second column of the namespace_repaired line of "lctl lfsck_query")
repaired=$(do_facet mds1 \
	"$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
awk '/^namespace_repaired/ { print \\\$2 }'")
if ! [ $repaired -eq 0 ]; then
	error "(2) Expect no namespace repair, but got: $repaired"
fi

if ! $START_LAYOUT -A -r; then
	error "Fail to start LFSCK for layout"
fi

wait_all_targets_blocked layout completed 2

# check that "global" layout_repaired == 0 !!!
repaired=$(do_facet mds1 \
	"$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
awk '/^layout_repaired/ { print \\\$2 }'")
if ! [ $repaired -eq 0 ]; then
	error "(2) Expect no layout repair, but got: $repaired"
fi
# After LFSCK: repeat the same foreign LOV EA field checks to prove that the
# scans left the layout untouched.
echo "post-lfsck checks of foreign file"

if ! $LFS getstripe -v $DIR/$tdir/$tfile |
	grep "lfm_magic:.*0x0BD70BD0"; then
	error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
fi

# lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
if ! $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73"; then
	error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
fi

if ! $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*daos"; then
	error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
fi

if ! $LFS getstripe -v $DIR/$tdir/$tfile |
	grep "lfm_flags:.*0x0000DA05"; then
	error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
fi

# The value is looked up in the non-verbose getstripe output.
if ! $LFS getstripe $DIR/$tdir/$tfile |
	grep "lfm_value:.*${uuid1}@${uuid2}"; then
	error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
fi

# modify striping should fail
if $LFS setstripe -c 2 $DIR/$tdir/$tfile; then
	error "$DIR/$tdir/$tfile: setstripe should fail"
fi
# Reading from and writing to the foreign file are both expected to fail,
# while removing it must succeed.
#
# The read check previously had no "error" in front of its message, so on an
# unexpected successful read the shell tried to execute the message text as a
# command and the test carried on without failing.
cat $DIR/$tdir/$tfile &&
	error "$DIR/$tdir/$tfile: read should fail"
cat /etc/passwd > $DIR/$tdir/$tfile &&
	error "$DIR/$tdir/$tfile: write should fail"

# remove foreign file
rm $DIR/$tdir/$tfile ||
	error "$DIR/$tdir/$tfile: remove of foreign file has failed"
run_test 38 "LFSCK does not break foreign file and reverse is also true"
# Test 39 setup: skip unless the MDS is newer than 2.12.51, then create a
# foreign directory whose LMV EA value is "<uuid1>@<uuid2>".
if [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.12.51) ]]; then
	skip "Need MDS version newer than 2.12.51"
fi

test_mkdir $DIR/$tdir

# Two kernel-generated random UUIDs make up the foreign layout value.
local uuid1
local uuid2
uuid1=$(cat /proc/sys/kernel/random/uuid)
uuid2=$(cat /proc/sys/kernel/random/uuid)

# create foreign dir (type "daos", flags 0xda05)
if ! $LFS mkdir --foreign=daos --xattr="${uuid1}@${uuid2}" --flags=0xda05 \
	$DIR/$tdir/${tdir}2; then
	error "$DIR/$tdir/${tdir}2: create failed"
fi
# Before LFSCK runs: every field of the foreign LMV EA printed by
# "lfs getdirstripe" must match what was requested at creation time.
# grep output is intentionally left on stdout.
if ! $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
	grep "lfm_magic:.*0x0CD50CD0"; then
	error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
fi

# lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
# - sizeof(lfm_type) - sizeof(lfm_flags)
if ! $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73"; then
	error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
fi

if ! $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*daos"; then
	error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
fi

if ! $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
	grep "lfm_flags:.*0x0000DA05"; then
	error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
fi

# The value is looked up in the non-verbose getdirstripe output.
if ! $LFS getdirstripe $DIR/$tdir/${tdir}2 |
	grep "lfm_value.*${uuid1}@${uuid2}"; then
	error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
fi
5720 # file create in dir should fail
5721 touch $DIR/$tdir/${tdir}2/$tfile &&
5722 "$DIR/${tdir}2: file create should fail"
5725 chmod 777 $DIR/$tdir/${tdir}2 ||
5726 error "$DIR/${tdir}2: chmod failed"
5729 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
5730 error "$DIR/${tdir}2: chown failed"
# Run namespace LFSCK, then layout LFSCK, and require that neither of them
# reports any repair for the file system holding the foreign directory.
local repaired

if ! $START_NAMESPACE -r -A; then
	error "Fail to start LFSCK for namespace"
fi

wait_all_targets_blocked namespace completed 1

# check that "global" namespace_repaired == 0 !!!
# (second column of the namespace_repaired line of "lctl lfsck_query")
repaired=$(do_facet mds1 \
	"$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
awk '/^namespace_repaired/ { print \\\$2 }'")
if ! [ $repaired -eq 0 ]; then
	error "(2) Expect nothing to be repaired, but got: $repaired"
fi

if ! $START_LAYOUT -A -r; then
	error "Fail to start LFSCK for layout"
fi

wait_all_targets_blocked layout completed 2

# check that "global" layout_repaired == 0 !!!
repaired=$(do_facet mds1 \
	"$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
awk '/^layout_repaired/ { print \\\$2 }'")
if ! [ $repaired -eq 0 ]; then
	error "(2) Expect no layout repair, but got: $repaired"
fi
# After LFSCK: repeat the same foreign LMV EA field checks to prove that the
# scans left the directory layout untouched.
echo "post-lfsck checks of foreign dir"

if ! $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
	grep "lfm_magic:.*0x0CD50CD0"; then
	error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
fi

# lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
# - sizeof(lfm_type) - sizeof(lfm_flags)
if ! $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73"; then
	error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
fi

if ! $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*daos"; then
	error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
fi

if ! $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
	grep "lfm_flags:.*0x0000DA05"; then
	error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
fi

# The value is looked up in the non-verbose getdirstripe output.
if ! $LFS getdirstripe $DIR/$tdir/${tdir}2 |
	grep "lfm_value.*${uuid1}@${uuid2}"; then
	error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
fi
5772 # file create in dir should fail
5773 touch $DIR/$tdir/${tdir}2/$tfile &&
5774 "$DIR/${tdir}2: file create should fail"
5777 chmod 777 $DIR/$tdir/${tdir}2 ||
5778 error "$DIR/${tdir}2: chmod failed"
5781 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
5782 error "$DIR/${tdir}2: chown failed"
5785 rmdir $DIR/$tdir/${tdir}2 ||
5786 error "$DIR/$tdir/${tdir}2: remove of foreign dir has failed"
5788 run_test 39 "LFSCK does not break foreign dir and reverse is also true"
# Test 40a setup: on a system with at least two MDTs, create a directory on
# MDT1 with a three-component composite default layout, create a file in it,
# record the file's layout, migrate the directory to MDT0 and start layout
# LFSCK on the MDT.
#
# Each preparation step is now checked. Previously a failed mkdir, setstripe,
# touch, migrate or lfsck_start was ignored, so the final layout comparison
# could pass without the scenario ever being exercised.
[[ $MDSCOUNT -ge 2 ]] || skip "needs >= 2 MDTs"

check_mount_and_prep
$LFS mkdir -i 1 $DIR/$tdir/dir1 ||
	error "Fail to mkdir $DIR/$tdir/dir1 on MDT1"
$LFS setstripe -E 1M -c1 -S 1M -E 128M -c2 -S 4M -E eof $DIR/$tdir/dir1 ||
	error "Fail to set composite layout on $DIR/$tdir/dir1"

touch $DIR/$tdir/dir1/f1 || error "Fail to create $DIR/$tdir/dir1/f1"
# Layout of f1 before the migration; compared again once LFSCK is done.
local layout1=$(get_layout_param $DIR/$tdir/dir1/f1)

echo "Migrate $DIR/$tdir/dir1 from MDT1 to MDT0"
$LFS migrate -m 0 $DIR/$tdir/dir1 ||
	error "Fail to migrate $DIR/$tdir/dir1 to MDT0"

echo "trigger LFSCK for layout"
do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -t layout -r ||
	error "Fail to start LFSCK for layout"
5806 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5807 mdd.${MDT_DEV}.lfsck_layout |
5808 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5810 error "(2) unexpected status"
# Read the layout parameters of f1 again and compare them with the value
# captured earlier in layout1; any difference fails the test.
local layout2
layout2=$(get_layout_param $DIR/$tdir/dir1/f1)

if [[ "$layout1" != "$layout2" ]]; then
	error "layout lost after lfsck"
fi
run_test 40a "LFSCK correctly fixes lmm_oi in composite layout"
# Put back the MDS/OST sizes and OST count that were saved at the top of the
# script before they were reduced for this test run.
MDSSIZE=$SAVED_MDSSIZE
OSTSIZE=$SAVED_OSTSIZE
OSTCOUNT=$SAVED_OSTCOUNT

# Final step: reformat and set the file system up again with the restored
# values, then run the framework's end-of-script check and cleanup.
REFORMAT="yes" cleanup_and_setup_lustre
check_and_cleanup_lustre