3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
# Tests that are always excluded: whatever $SANITY_LFSCK_EXCEPT lists,
# plus 9a and 31c (bug numbers are given in the comment below).
11 #Bug number for excepting test LU-10732 LU-10406
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT 9a 31c"
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# Unless LUSTRE is already set, default it to the parent of the directory
# that contains this script, then source the shared test framework.
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
# Source the configuration file; CONFIG defaults to
# $LUSTRE/tests/cfg/$NAME.sh when it is unset or empty.
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Exit with status 0 (i.e. not a failure) when require_dsh_mds fails.
23 require_dsh_mds || exit 0
# Skip the suite when check_versions fails (interoperation mode).
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
# Skip the suite and exit 0 when the MDS is older than 2.3.60.
32 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
33 skip "Need MDS version at least 2.3.60" && exit 0
# Remember the configured sizes and OST count.
# NOTE(review): presumably restored later in the file - not visible here.
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # use small MDS + OST size to speed formatting time
41 # do not use too small an MDSSIZE/OSTSIZE, since that affects the default journal size
44 # no need for too many OSTs, to reduce the format/start/stop overhead
46 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
48 # build up a clean test environment.
49 REFORMAT="yes" check_and_setup_lustre
# Extend ALWAYS_EXCEPT according to the server versions reported for
# $SINGLEMDS and ost1: each check below appends the listed test numbers
# when the version is below (or, for 2c, not above) the given release.
51 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
52 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
54 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
55 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
57 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
58 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
60 # DNE does not support striped directory on zfs-based backend yet.
61 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
62 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Target names derived from the filesystem name, and the device name of
# the MDT behind $SINGLEMDS (the facet name with "mds" stripped is passed
# to mdsdevname).
66 MDT_DEV="${FSNAME}-MDT0000"
67 OST_DEV="${FSNAME}-OST0000"
68 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Command strings used by the tests. They are expanded unquoted (for
# example "$START_NAMESPACE -r") so that extra options can be appended.
# START_*  : lctl lfsck_start for the namespace or layout component
# STOP_*   : lctl lfsck_stop on the MDT
# SHOW_*   : lctl get_param -n of the matching lfsck_* status file
69 START_NAMESPACE="do_facet $SINGLEMDS \
70 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
71 START_LAYOUT="do_facet $SINGLEMDS \
72 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
73 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
74 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
75 SHOW_NAMESPACE="do_facet $SINGLEMDS \
76 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
77 SHOW_LAYOUT="do_facet $SINGLEMDS \
78 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
79 SHOW_LAYOUT_ON_OST="do_facet ost1 \
80 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount option sets passed to "start" when the tests (re)start the MDS:
# plain user_xattr, user_xattr with noscrub, and user_xattr with
# skip_lfsck.
81 MOUNT_OPTS_SCRUB="-o user_xattr"
82 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
83 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
# Fragment of a preparation routine that populates $DIR/$tdir.
# NOTE(review): the function header, the definitions of $nfiles, $ndirs
# and $igif, and the closing fi/done lines are not visible here, so the
# exact nesting of the statements below cannot be confirmed.
92 echo "preparing... $nfiles * $ndirs files will be created $(date)."
# When $igif is non-empty, set fail_loc 0x1504 on the MDS before
# creating anything.
93 if [ ! -z $igif ]; then
94 #define OBD_FAIL_FID_IGIF 0x1504
95 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
# Copy the test scripts in as sample content.
98 cp $LUSTRE/tests/*.sh $DIR/$tdir/
# With ndirs > 0: create $ndirs directories "d*" and $ndirs files "f*";
# with nfiles > 0 as well, create $nfiles files "f*" in each d${i}.
99 if [ $ndirs -gt 0 ]; then
100 createmany -d $DIR/$tdir/d $ndirs
101 createmany -m $DIR/$tdir/f $ndirs
102 if [ $nfiles -gt 0 ]; then
103 for ((i = 0; i < $ndirs; i++)); do
104 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
105 /dev/null || error "createmany $nfiles"
108 createmany -d $DIR/$tdir/e $ndirs
# When $igif is non-empty, create "dummy" and then clear fail_loc.
111 if [ ! -z $igif ]; then
112 touch $DIR/$tdir/dummy
113 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
116 echo "prepared $(date)."
# run_e2fsck_on_mdt0: stop the MDS, run run_e2fsck with "-n" on the
# device reported by "mdsdevname 1", and start the MDS again with the
# noscrub mount options. Returns immediately when the MDS backend is not
# ldiskfs.
# NOTE(review): the command that consumes the first run_e2fsck pipeline
# and the closing braces are not visible in this excerpt.
119 run_e2fsck_on_mdt0() {
120 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
122 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
123 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
125 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
126 error "(2) Detected inconsistency on MDT0"
128 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
129 error "(3) Fail to start MDT0"
# wait_all_targets_blocked: run "lctl lfsck_query -w" on mds1 for LFSCK
# component $com, read the count printed on the "${com}_mdts_${status}"
# line, and raise an error tagged ($err) unless it equals $MDSCOUNT.
# On mismatch the plain lfsck_query output is shown first for diagnosis.
# NOTE(review): the lines that set $com, $status and $err are not visible
# here (presumably from the positional arguments); "-w" presumably makes
# the query wait - confirm against lctl documentation.
132 wait_all_targets_blocked() {
137 local count=$(do_facet mds1 \
138 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
139 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
140 [[ $count -eq $MDSCOUNT ]] || {
141 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
142 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# Fragment of a second wait helper (header not visible here): uses
# wait_update_facet on mds1 with the "${com}_mdts_${status}" count from
# lfsck_query, the expected value "$MDSCOUNT" and limit $LTIME. On
# failure it shows the lfsck_query output and raises an error tagged
# ($err).
151 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
152 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
153 "$MDSCOUNT" $LTIME || {
154 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
155 error "($err) some MDTs are not in ${status}"
# Body of test 0 ("Control LFSCK manually"): drive the namespace LFSCK
# through start, stop, resume and reset, checking the status each time.
# NOTE(review): the function header and several lines are not visible
# in this excerpt.
# Start a fresh scan (-r) with fail_loc 0x1600 / fail_val 3 set, and
# expect to observe it in 'scanning-phase1'.
162 #define OBD_FAIL_LFSCK_DELAY1 0x1600
163 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
164 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
166 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
168 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
169 [ "$STATUS" == "scanning-phase1" ] ||
170 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
# Stop the scan by hand; the status must become 'stopped'.
172 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
174 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
175 [ "$STATUS" == "stopped" ] ||
176 error "(6) Expect 'stopped', but got '$STATUS'"
# Start again without -r; the status must return to 'scanning-phase1'.
178 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
180 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
181 [ "$STATUS" == "scanning-phase1" ] ||
182 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fault injection and wait for the status 'completed'.
184 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
185 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
186 mdd.${MDT_DEV}.lfsck_namespace |
187 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
189 error "(9) unexpected status"
# Nothing was corrupted, so updated_phase1 must be 0.
192 local repaired=$($SHOW_NAMESPACE |
193 awk '/^updated_phase1/ { print $2 }')
194 [ $repaired -eq 0 ] ||
195 error "(10) Expect nothing to be repaired, but got: $repaired"
# Run one more full scan with -r; success_count must grow by exactly one.
197 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
198 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
199 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
200 mdd.${MDT_DEV}.lfsck_namespace |
201 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
203 error "(12) unexpected status"
206 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
207 [ $((scanned1 + 1)) -eq $scanned2 ] ||
208 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
210 echo "stopall, should NOT crash LU-3649"
211 stopall || error "(14) Fail to stopall"
213 run_test 0 "Control LFSCK manually"
# Body of test 1a: create "dummy" while fail_loc 0x1501
# (OBD_FAIL_FID_INDIR) is set, run the namespace LFSCK, and expect
# exactly one repair to be reported.
# NOTE(review): the function header and several lines are not visible
# in this excerpt.
218 #define OBD_FAIL_FID_INDIR 0x1501
219 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
220 touch $DIR/$tdir/dummy
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a fresh namespace scan and wait for the status 'completed'.
224 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
225 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
226 mdd.${MDT_DEV}.lfsck_namespace |
227 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
229 error "(4) unexpected status"
# Read dirent_repaired; when that field is empty, fall back to
# updated_phase1.
232 local repaired=$($SHOW_NAMESPACE |
233 awk '/^dirent_repaired/ { print $2 }')
234 # for interop with old server
235 [ -z "$repaired" ] &&
236 repaired=$($SHOW_NAMESPACE |
237 awk '/^updated_phase1/ { print $2 }')
239 [ $repaired -eq 1 ] ||
240 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
# Mount the client and list the directory with fail_loc 0x1505
# (OBD_FAIL_FID_LOOKUP) set; the listing must succeed.
244 mount_client $MOUNT || error "(6) Fail to start client!"
246 #define OBD_FAIL_FID_LOOKUP 0x1505
247 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
248 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
252 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Body of test 1b: create "dummy" while fail_loc 0x1502
# (OBD_FAIL_FID_INLMA) is set, run the namespace LFSCK with fail_loc
# 0x1506 (OBD_FAIL_FID_NOLMA) set, and expect exactly one repair.
# Skipped when the MDS backend is not ldiskfs.
# NOTE(review): the function header and several lines are not visible
# in this excerpt.
256 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
257 skip "OI Scrub not implemented for ZFS" && return
261 #define OBD_FAIL_FID_INLMA 0x1502
262 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
263 touch $DIR/$tdir/dummy
265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a fresh namespace scan under 0x1506 and wait for 'completed'.
267 #define OBD_FAIL_FID_NOLMA 0x1506
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
269 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
270 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
271 mdd.${MDT_DEV}.lfsck_namespace |
272 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
274 error "(4) unexpected status"
# Read dirent_repaired; when that field is empty, fall back to
# updated_phase1.
277 local repaired=$($SHOW_NAMESPACE |
278 awk '/^dirent_repaired/ { print $2 }')
279 # for interop with old server
280 [ -z "$repaired" ] &&
281 repaired=$($SHOW_NAMESPACE |
282 awk '/^updated_phase1/ { print $2 }')
284 [ $repaired -eq 1 ] ||
285 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
287 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Mount the client and stat "dummy" with fail_loc 0x1505
# (OBD_FAIL_FID_LOOKUP) set; the stat must succeed.
290 mount_client $MOUNT || error "(6) Fail to start client!"
292 #define OBD_FAIL_FID_LOOKUP 0x1505
293 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
294 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
298 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Body of test 1c: create "dummy" while fail_loc 0x1504
# (OBD_FAIL_FID_IGIF) is set, run the namespace LFSCK, and expect
# exactly one repair to be reported.
# NOTE(review): the function header and several lines are not visible
# in this excerpt.
303 #define OBD_FAIL_FID_IGIF 0x1504
304 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
305 touch $DIR/$tdir/dummy
307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a fresh namespace scan and wait for the status 'completed'.
309 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
310 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
311 mdd.${MDT_DEV}.lfsck_namespace |
312 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
314 error "(4) unexpected status"
# Read dirent_repaired; when that field is empty, fall back to
# updated_phase1.
317 local repaired=$($SHOW_NAMESPACE |
318 awk '/^dirent_repaired/ { print $2 }')
319 # for interop with old server
320 [ -z "$repaired" ] &&
321 repaired=$($SHOW_NAMESPACE |
322 awk '/^updated_phase1/ { print $2 }')
324 [ $repaired -eq 1 ] ||
325 error "(5) Fail to repair lost FID-in-dirent: $repaired"
# Mount the client and list the directory with fail_loc 0x1505
# (OBD_FAIL_FID_LOOKUP) set; the listing must succeed.
329 mount_client $MOUNT || error "(6) Fail to start client!"
331 #define OBD_FAIL_FID_LOOKUP 0x1505
332 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
333 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
335 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
337 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
# Body of test 2a: create "dummy" while fail_loc 0x1603
# (OBD_FAIL_LFSCK_LINKEA_CRASH) is set, run the namespace LFSCK, and
# expect exactly one linkEA repair.
# NOTE(review): the function header and several lines are not visible
# in this excerpt.
342 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
343 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
344 touch $DIR/$tdir/dummy
346 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a fresh namespace scan and wait for the status 'completed'.
348 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
349 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
350 mdd.${MDT_DEV}.lfsck_namespace |
351 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
353 error "(4) unexpected status"
# Read linkea_repaired; when that field is empty, fall back to
# updated_phase2.
356 local repaired=$($SHOW_NAMESPACE |
357 awk '/^linkea_repaired/ { print $2 }')
358 # for interop with old server
359 [ -z "$repaired" ] &&
360 repaired=$($SHOW_NAMESPACE |
361 awk '/^updated_phase2/ { print $2 }')
363 [ $repaired -eq 1 ] ||
364 error "(5) Fail to repair crashed linkEA: $repaired"
# From the client: "dummy" must show one link, and its FID must map
# back (fid2path) to the same path.
368 mount_client $MOUNT || error "(6) Fail to start client!"
370 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
371 error "(7) Fail to stat $DIR/$tdir/dummy"
373 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
374 local dummyname=$($LFS fid2path $DIR $dummyfid)
375 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
376 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
378 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Body of test 2b: create "dummy" while fail_loc 0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE) is set, run the namespace LFSCK, and
# expect updated_phase2 to report exactly one repair.
# NOTE(review): the function header and several lines are not visible
# in this excerpt.
384 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
385 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
386 touch $DIR/$tdir/dummy
388 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a fresh namespace scan and wait for the status 'completed'.
390 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
391 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
392 mdd.${MDT_DEV}.lfsck_namespace |
393 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
395 error "(4) unexpected status"
398 local repaired=$($SHOW_NAMESPACE |
399 awk '/^updated_phase2/ { print $2 }')
400 [ $repaired -eq 1 ] ||
401 error "(5) Fail to repair crashed linkEA: $repaired"
# From the client: "dummy" must show one link, and its FID must map
# back (fid2path) to the same path.
405 mount_client $MOUNT || error "(6) Fail to start client!"
407 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
408 error "(7) Fail to stat $DIR/$tdir/dummy"
410 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
411 local dummyname=$($LFS fid2path $DIR $dummyfid)
412 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
413 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
415 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Body of test 2c: create "dummy" while fail_loc 0x1605
# (OBD_FAIL_LFSCK_LINKEA_MORE2) is set, run the namespace LFSCK, and
# expect updated_phase2 to report exactly one repair.
# NOTE(review): the function header and several lines are not visible
# in this excerpt.
421 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
422 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
423 touch $DIR/$tdir/dummy
425 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a fresh namespace scan and wait for the status 'completed'.
427 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
428 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
429 mdd.${MDT_DEV}.lfsck_namespace |
430 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
432 error "(4) unexpected status"
435 local repaired=$($SHOW_NAMESPACE |
436 awk '/^updated_phase2/ { print $2 }')
437 [ $repaired -eq 1 ] ||
438 error "(5) Fail to repair crashed linkEA: $repaired"
# From the client: "dummy" must show one link, and its FID must map
# back (fid2path) to the same path.
442 mount_client $MOUNT || error "(6) Fail to start client!"
444 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
445 error "(7) Fail to stat $DIR/$tdir/dummy"
447 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
448 local dummyname=$($LFS fid2path $DIR $dummyfid)
449 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
450 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
452 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Body of test 2d: create "dummy" while fail_loc 0x161d
# (OBD_FAIL_LFSCK_NO_LINKEA) is set, run the namespace LFSCK, and
# expect linkea_repaired to report exactly one repair.
# NOTE(review): the function header and several lines are not visible
# in this excerpt.
458 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
459 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
460 touch $DIR/$tdir/dummy
462 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a fresh namespace scan and wait for the status 'completed'.
464 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
465 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
466 mdd.${MDT_DEV}.lfsck_namespace |
467 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
469 error "(4) unexpected status"
472 local repaired=$($SHOW_NAMESPACE |
473 awk '/^linkea_repaired/ { print $2 }')
474 [ $repaired -eq 1 ] ||
475 error "(5) Fail to repair crashed linkEA: $repaired"
# From the client: "dummy" must show one link, and its FID must map
# back (fid2path) to the same path.
479 mount_client $MOUNT || error "(6) Fail to start client!"
481 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
482 error "(7) Fail to stat $DIR/$tdir/dummy"
484 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
485 local dummyname=$($LFS fid2path $DIR $dummyfid)
486 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
487 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
489 run_test 2d "LFSCK can recover the missing linkEA entry"
# Body of test 2e: needs at least two MDTs. Creates d0 with MDT index 1
# and, with fail_loc 0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH) set, d0/d1
# with MDT index 0; then runs the namespace LFSCK with -A and expects
# exactly one linkEA repair.
# NOTE(review): the function header and several lines are not visible
# in this excerpt.
493 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
497 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
499 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
500 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
501 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
502 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
504 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
# Require every MDT to report the namespace component as 'completed'
# (the trailing 4 is presumably the error tag - confirm against
# wait_all_targets_blocked).
506 wait_all_targets_blocked namespace completed 4
508 local repaired=$($SHOW_NAMESPACE |
509 awk '/^linkea_repaired/ { print $2 }')
510 [ $repaired -eq 1 ] ||
511 error "(5) Fail to repair crashed linkEA: $repaired"
# The FID of d0/d1 must map back (fid2path) to the same path.
513 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
514 local name=$($LFS fid2path $DIR $fid)
515 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
516 error "(6) Fail to repair linkEA: $fid $name"
518 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Body of test 3: build hard-linked files, two of them created with
# linkEA fault injection active, then run the namespace LFSCK and check
# its checked_phase2 and multiple_linked_repaired counters.
# NOTE(review): the function header and several lines are not visible
# in this excerpt.
# Hard links to two existing files under d0.
524 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
525 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
526 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
# Two fresh files in edir; each gets a second name while a different
# fail_loc (0x1603, then 0x1604) is set.
528 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
529 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
530 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
532 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
533 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
534 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
536 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
537 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
538 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
540 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a fresh namespace scan and wait for the status 'completed'.
542 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
543 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
544 mdd.${MDT_DEV}.lfsck_namespace |
545 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
547 error "(10) unexpected status"
# At least 4 objects checked in phase 2, and at least 2 repaired.
550 local checked=$($SHOW_NAMESPACE |
551 awk '/^checked_phase2/ { print $2 }')
552 [ $checked -ge 4 ] ||
553 error "(11) Fail to check multiple-linked object: $checked"
555 local repaired=$($SHOW_NAMESPACE |
556 awk '/^multiple_linked_repaired/ { print $2 }')
557 [ $repaired -ge 2 ] ||
558 error "(12) Fail to repair multiple-linked object: $repaired"
560 run_test 3 "LFSCK can verify multiple-linked objects"
# Body of test 4: after mds_backup_restore of the MDS, start it with the
# noscrub options, run the namespace LFSCK, and expect at least 9
# FID-in-dirent repairs. Skipped when the MDS backend is not ldiskfs.
# NOTE(review): the function header and several lines are not visible
# in this excerpt.
564 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
565 skip "OI Scrub not implemented for ZFS" && return
# Stop the client and the MDS, then run the backup/restore helper.
568 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
569 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
571 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
572 echo "start $SINGLEMDS with disabling OI scrub"
573 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
574 error "(2) Fail to start MDS!"
# Start a fresh scan with fail_loc 0x1601 / fail_val 1 set and wait
# until the flags field reads "inconsistent"; the scan must then still
# be in 'scanning-phase1'.
576 #define OBD_FAIL_LFSCK_DELAY2 0x1601
577 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
578 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
579 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
580 mdd.${MDT_DEV}.lfsck_namespace |
581 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
583 error "(5) unexpected status"
586 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
587 [ "$STATUS" == "scanning-phase1" ] ||
588 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fault injection and wait for the status 'completed'.
590 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
591 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
592 mdd.${MDT_DEV}.lfsck_namespace |
593 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
595 error "(7) unexpected status"
# After completion the flags field must be empty.
# NOTE(review): FLAGS is assigned without "local" here.
598 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
599 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Read dirent_repaired; when that field is empty, fall back to
# updated_phase1.
601 local repaired=$($SHOW_NAMESPACE |
602 awk '/^dirent_repaired/ { print $2 }')
603 # for interop with old server
604 [ -z "$repaired" ] &&
605 repaired=$($SHOW_NAMESPACE |
606 awk '/^updated_phase1/ { print $2 }')
608 [ $repaired -ge 9 ] ||
609 error "(9) Fail to re-generate FID-in-dirent: $repaired"
# Mount the client and list the directory with fail_loc 0x1505
# (OBD_FAIL_FID_LOOKUP) set; the listing must succeed.
613 mount_client $MOUNT || error "(10) Fail to start client!"
615 #define OBD_FAIL_FID_LOOKUP 0x1505
616 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
617 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
618 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
620 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Body of test 5: after mds_backup_restore (called with an extra
# argument 1), start the MDS with the noscrub options, run the namespace
# LFSCK, and expect the flags "inconsistent,upgrade" during the scan and
# at least 2 repairs afterwards. Skipped when the MDS backend is not
# ldiskfs.
# NOTE(review): the function header and several lines are not visible
# in this excerpt.
624 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
625 skip "OI Scrub not implemented for ZFS" && return
# Stop the client and the MDS, then run the backup/restore helper.
628 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
629 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
631 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
632 echo "start $SINGLEMDS with disabling OI scrub"
633 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
634 error "(2) Fail to start MDS!"
# Start a fresh scan with fail_loc 0x1601 / fail_val 1 set and wait
# until the flags field reads "inconsistent,upgrade"; the scan must
# then still be in 'scanning-phase1'.
636 #define OBD_FAIL_LFSCK_DELAY2 0x1601
637 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
638 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
639 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
640 mdd.${MDT_DEV}.lfsck_namespace |
641 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
643 error "(5) unexpected status"
646 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
647 [ "$STATUS" == "scanning-phase1" ] ||
648 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fault injection and wait for the status 'completed'.
650 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
651 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
652 mdd.${MDT_DEV}.lfsck_namespace |
653 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
655 error "(7) unexpected status"
# After completion the flags field must be empty.
# NOTE(review): FLAGS is assigned without "local" here.
658 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
659 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Read dirent_repaired; when that field is empty, fall back to
# updated_phase1.
661 local repaired=$($SHOW_NAMESPACE |
662 awk '/^dirent_repaired/ { print $2 }')
663 # for interop with old server
664 [ -z "$repaired" ] &&
665 repaired=$($SHOW_NAMESPACE |
666 awk '/^updated_phase1/ { print $2 }')
668 [ $repaired -ge 2 ] ||
669 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
# Mount the client; with fail_loc 0x1505 (OBD_FAIL_FID_LOOKUP) set,
# both stat of "dummy" and listing the directory must succeed.
673 mount_client $MOUNT || error "(10) Fail to start client!"
675 #define OBD_FAIL_FID_LOOKUP 0x1505
676 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
677 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
679 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
# With fault injection cleared, the FID of "dummy" must map back
# (fid2path) to the same path.
681 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
682 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
683 local dummyname=$($LFS fid2path $DIR $dummyfid)
684 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
685 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
687 run_test 5 "LFSCK can handle IGIF object upgrading"
# Body of test 6a: start the namespace LFSCK, force it to fail so that
# a checkpoint exists, restart it without -r, and require that
# latest_start_position is greater than the earlier
# last_checkpoint_position.
# NOTE(review): the function header and several lines (including the
# tails of the POS0/POS1 pipelines) are not visible in this excerpt.
692 #define OBD_FAIL_LFSCK_DELAY1 0x1600
693 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
694 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
696 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
697 [ "$STATUS" == "scanning-phase1" ] ||
698 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
700 # Sleep 3 sec to guarantee at least one object processed by LFSCK
702 # Fail the LFSCK to guarantee there is at least one checkpoint
703 #define OBD_FAIL_LFSCK_FATAL1 0x1608
704 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
705 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
706 mdd.${MDT_DEV}.lfsck_namespace |
707 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
709 error "(4) unexpected status"
# Checkpoint position recorded by the failed run.
712 local POS0=$($SHOW_NAMESPACE |
713 awk '/^last_checkpoint_position/ { print $2 }' |
716 #define OBD_FAIL_LFSCK_DELAY1 0x1600
717 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
718 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
720 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
721 [ "$STATUS" == "scanning-phase1" ] ||
722 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Position the restarted run began from; it must exceed POS0.
724 local POS1=$($SHOW_NAMESPACE |
725 awk '/^latest_start_position/ { print $2 }' |
727 [[ $POS0 -lt $POS1 ]] ||
728 error "(7) Expect larger than: $POS0, but got $POS1"
# Clear the fault injection and wait for the status 'completed'.
730 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
731 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
732 mdd.${MDT_DEV}.lfsck_namespace |
733 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
735 error "(8) unexpected status"
738 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Body of test 6b: like the checkpoint test above but using fail_loc
# 0x1601 (OBD_FAIL_LFSCK_DELAY2) and 0x1609 (OBD_FAIL_LFSCK_FATAL2).
# Two fields of the position lines are compared: awk field 2 (O_POS*)
# and field 4 (D_POS*).
# NOTE(review): the function header and several lines (including the
# tails of the O_POS0/O_POS1 pipelines and an else/fi) are not visible
# in this excerpt.
743 #define OBD_FAIL_LFSCK_DELAY2 0x1601
744 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
745 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
747 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
748 [ "$STATUS" == "scanning-phase1" ] ||
749 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
751 # Sleep 5 sec to guarantee that we are in the directory scanning
753 # Fail the LFSCK to guarantee there is at least one checkpoint
754 #define OBD_FAIL_LFSCK_FATAL2 0x1609
755 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
756 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
757 mdd.${MDT_DEV}.lfsck_namespace |
758 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
760 error "(4) unexpected status"
# Checkpoint position (fields 2 and 4) recorded by the failed run.
763 local O_POS0=$($SHOW_NAMESPACE |
764 awk '/^last_checkpoint_position/ { print $2 }' |
767 local D_POS0=$($SHOW_NAMESPACE |
768 awk '/^last_checkpoint_position/ { print $4 }')
770 #define OBD_FAIL_LFSCK_DELAY2 0x1601
771 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
772 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
774 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
775 [ "$STATUS" == "scanning-phase1" ] ||
776 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Start position (fields 2 and 4) of the restarted run.
778 local O_POS1=$($SHOW_NAMESPACE |
779 awk '/^latest_start_position/ { print $2 }' |
781 local D_POS1=$($SHOW_NAMESPACE |
782 awk '/^latest_start_position/ { print $4 }')
# When either field-4 value is "N/A", compare the field-2 values;
# the field-4 comparison below is presumably the else branch.
784 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
785 [[ $O_POS0 -lt $O_POS1 ]] ||
786 error "(7.1) $O_POS1 is not larger than $O_POS0"
788 [[ $D_POS0 -lt $D_POS1 ]] ||
789 error "(7.2) $D_POS1 is not larger than $D_POS0"
# Clear the fault injection and wait for the status 'completed'.
792 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
793 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
794 mdd.${MDT_DEV}.lfsck_namespace |
795 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
797 error "(8) unexpected status"
800 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Body of test 7a: start the namespace LFSCK with fail_loc 0x1601 set,
# stop and restart the MDS while the scan is in 'scanning-phase1', and
# expect the status to reach 'completed' without starting it again.
# NOTE(review): the function header and several lines are not visible
# in this excerpt.
807 #define OBD_FAIL_LFSCK_DELAY2 0x1601
808 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
809 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
811 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
812 [ "$STATUS" == "scanning-phase1" ] ||
813 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
815 # Sleep 3 sec to guarantee at least one object processed by LFSCK
# Stop the MDS mid-scan, clear the fault injection, and start the MDS
# again with the plain user_xattr options.
817 echo "stop $SINGLEMDS"
818 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
820 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
821 echo "start $SINGLEMDS"
822 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
823 error "(5) Fail to start MDS!"
# No explicit lfsck_start here: the scan is expected to finish by itself.
825 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
826 mdd.${MDT_DEV}.lfsck_namespace |
827 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
829 error "(6) unexpected status"
832 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Body of test 7b: create 20 files with fail_loc 0x1604 set, start the
# namespace LFSCK with fail_loc 0x1602 set, wait for 'scanning-phase2',
# stop and restart the MDS, and expect the status to reach 'completed'
# without starting the scan again.
# NOTE(review): the function header and several lines (including the
# loop's "done") are not visible in this excerpt.
838 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
839 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
840 for ((i = 0; i < 20; i++)); do
841 touch $DIR/$tdir/dummy${i}
844 #define OBD_FAIL_LFSCK_DELAY3 0x1602
845 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
846 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
847 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
848 mdd.${MDT_DEV}.lfsck_namespace |
849 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
851 error "(4) unexpected status"
# Stop the MDS during phase 2, clear the fault injection, and start the
# MDS again with the plain user_xattr options.
855 echo "stop $SINGLEMDS"
856 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
858 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
859 echo "start $SINGLEMDS"
860 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
861 error "(6) Fail to start MDS!"
# No explicit lfsck_start here: the scan is expected to finish by itself.
863 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
864 mdd.${MDT_DEV}.lfsck_namespace |
865 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
867 error "(7) unexpected status"
870 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Body of test 8 ("LFSCK state machine"): walk the namespace LFSCK
# through the states init, scanning-phase1, stopped, failed, crashed,
# paused, scanning-phase2 and completed, checking the reported status
# (and flags) after each transition.
# NOTE(review): the function header and many lines (setup after
# formatall, loop bodies, the "timer" initialisation, closing braces)
# are not visible in this excerpt.
875 formatall > /dev/null
# On the freshly formatted filesystem the status must be 'init'.
881 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
882 [ "$STATUS" == "init" ] ||
883 error "(2) Expect 'init', but got '$STATUS'"
# Create one directory and five files with linkEA fault injection set.
885 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
886 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
887 mkdir $DIR/$tdir/crashed
889 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
890 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
891 for ((i = 0; i < 5; i++)); do
892 touch $DIR/$tdir/dummy${i}
895 umount_client $MOUNT || error "(3) Fail to stop client!"
# init -> scanning-phase1
897 #define OBD_FAIL_LFSCK_DELAY2 0x1601
898 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
899 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
901 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
902 [ "$STATUS" == "scanning-phase1" ] ||
903 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
# scanning-phase1 -> stopped
905 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
907 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
908 [ "$STATUS" == "stopped" ] ||
909 error "(7) Expect 'stopped', but got '$STATUS'"
# stopped -> scanning-phase1
911 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
913 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
914 [ "$STATUS" == "scanning-phase1" ] ||
915 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
# scanning-phase1 -> failed
917 #define OBD_FAIL_LFSCK_FATAL2 0x1609
918 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
919 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
920 mdd.${MDT_DEV}.lfsck_namespace |
921 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
923 error "(10) unexpected status"
# failed -> scanning-phase1
926 #define OBD_FAIL_LFSCK_DELAY1 0x1600
927 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
928 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
930 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
931 [ "$STATUS" == "scanning-phase1" ] ||
932 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
# Stop the MDS with fail_loc 0x160a (OBD_FAIL_LFSCK_CRASH) set, then
# start it with fail_loc 0x160b (OBD_FAIL_LFSCK_NO_AUTO) set.
934 #define OBD_FAIL_LFSCK_CRASH 0x160a
935 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
938 echo "stop $SINGLEMDS"
939 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
941 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
942 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
944 echo "start $SINGLEMDS"
945 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
946 error "(14) Fail to start MDS!"
# Loop while recovery_status reads RECOVERING, bounded by
# max_recovery_time; reaching the bound is reported as a timeout.
948 local timeout=$(max_recovery_time)
951 while [ $timer -lt $timeout ]; do
952 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
953 mdt.${MDT_DEV}.recovery_status |
954 awk '/^status/ { print \\\$2 }'")
955 [ "$STATUS" != "RECOVERING" ] && break;
960 [ $timer != $timeout ] ||
961 error "(14.1) recovery timeout"
# After that restart the LFSCK status must be 'crashed'.
963 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
964 [ "$STATUS" == "crashed" ] ||
965 error "(15) Expect 'crashed', but got '$STATUS'"
# crashed -> scanning-phase1
967 #define OBD_FAIL_LFSCK_DELAY2 0x1601
968 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
969 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
971 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
972 [ "$STATUS" == "scanning-phase1" ] ||
973 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
# Stop the MDS mid-scan and start it with fail_loc 0x160b set;
# the status is then expected to be 'paused'.
975 echo "stop $SINGLEMDS"
976 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
978 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
979 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
981 echo "start $SINGLEMDS"
982 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
983 error "(19) Fail to start MDS!"
986 while [ $timer -lt $timeout ]; do
987 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
988 mdt.${MDT_DEV}.recovery_status |
989 awk '/^status/ { print \\\$2 }'")
990 [ "$STATUS" != "RECOVERING" ] && break;
995 [ $timer != $timeout ] ||
996 error "(19.1) recovery timeout"
998 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
999 [ "$STATUS" == "paused" ] ||
1000 error "(20) Expect 'paused', but got '$STATUS'"
# Restart the MDS once more, this time with the skip_lfsck mount option;
# the status must remain 'paused'.
1002 echo "stop $SINGLEMDS"
1003 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1005 echo "start $SINGLEMDS without resume LFSCK"
1006 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
1007 error "(20.2) Fail to start MDS!"
1010 while [ $timer -lt $timeout ]; do
1011 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1012 mdt.${MDT_DEV}.recovery_status |
1013 awk '/^status/ { print \\\$2 }'")
1014 [ "$STATUS" != "RECOVERING" ] && break;
1016 timer=$((timer + 1))
1019 [ $timer != $timeout ] ||
1020 error "(20.3) recovery timeout"
1022 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1023 [ "$STATUS" == "paused" ] ||
1024 error "(20.4) Expect 'paused', but got '$STATUS'"
# paused -> scanning-phase2, with fail_loc 0x1602 / fail_val 2 set.
1026 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1027 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1029 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
1030 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1031 mdd.${MDT_DEV}.lfsck_namespace |
1032 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1034 error "(22) unexpected status"
# During phase 2 the flags must read "scanned-once,inconsistent".
1037 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1038 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1039 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
# scanning-phase2 -> completed, after which the flags must be empty.
1041 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1042 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1043 mdd.${MDT_DEV}.lfsck_namespace |
1044 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1046 error "(24) unexpected status"
1049 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1050 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1052 run_test 8 "LFSCK state machine"
# Body of test 9a, registered below as "LFSCK speed control (1)".
# Runs layout LFSCK under a speed limit and checks that average_speed_phase1
# stays inside an expected window, before and after the limit is raised.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are used in the arithmetic
# below but are not assigned on any visible line -- confirm their values.
# Skip when /proc/cpuinfo has no "processor ...: 1" entry.
1055 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1056 skip "Testing on UP system, the speed may be inaccurate."
# Create 5000 files under $DIR/$tdir/lfsck for the scan to work through.
1060 check_mount_and_prep
1061 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1062 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1063 createmany -o $DIR/$tdir/lfsck/f 5000
# First phase: start layout LFSCK with -r and speed limit BASE_SPEED1.
1065 local BASE_SPEED1=100
1067 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# The scan must still be in phase 1 when the speed is sampled.
1070 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1071 [ "$STATUS" == "scanning-phase1" ] ||
1072 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1074 local SPEED=$($SHOW_LAYOUT |
1075 awk '/^average_speed_phase1/ { print $2 }')
1077 # There may be time error, normally it should be less than 2 seconds.
1078 # We allow another 20% schedule error.
# NOTE(review): the text above says 20% but the factor applied is 13/10;
# confirm which one is intended.
# Upper bound uses shell integer arithmetic, evaluated left to right.
1080 # MAX_MARGIN = 1.3 = 13 / 10
1081 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1082 RUN_TIME1 * 13 / 10))
1083 [ $SPEED -lt $MAX_SPEED ] || {
1085 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1086 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1089 # adjust speed limit
1090 local BASE_SPEED2=300
1092 do_facet $SINGLEMDS \
1093 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Lower bound: time-weighted mix of both limits, each run time shortened by
# TIME_DIFF, then scaled by 7/10.
1096 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1097 # MIN_MARGIN = 0.7 = 7 / 10
1098 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1099 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1100 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# When the MDS backend is not ldiskfs, a too-low speed is reported through
# error_ignore (LU-5624) instead.
1101 [ $SPEED -gt $MIN_SPEED ] || {
1102 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1103 error_ignore LU-5624 \
1104 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1107 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
# Upper bound for the combined run: same weighting with each run time
# lengthened by TIME_DIFF, scaled by 13/10.
1111 # MAX_MARGIN = 1.3 = 13 / 10
1112 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1113 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1114 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1115 [ $SPEED -lt $MAX_SPEED ] || {
1117 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1118 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1119 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 and wait for the scan to report "completed".
1122 do_facet $SINGLEMDS \
1123 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1125 wait_update_facet $SINGLEMDS \
1126 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1127 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1128 error "(7) Failed to get expected 'completed'"
1130 run_test 9a "LFSCK speed control (1)"
# Body of test 9b, registered below as "LFSCK speed control (2)".
# Same speed-window checks as the layout case, but against
# average_speed_phase2 of the namespace LFSCK.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are not assigned on any
# visible line -- confirm their values.
# Skip when /proc/cpuinfo has no "processor ...: 1" entry.
1133 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1134 skip "Testing on UP system, the speed may be inaccurate."
# Create 50 directories with 50 files each while fail_loc 0x1604 is set on
# the MDS.
1140 echo "Preparing another 50 * 50 files (with error) at $(date)."
1141 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1142 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1143 createmany -d $DIR/$tdir/d 50
1144 createmany -m $DIR/$tdir/f 50
1145 for ((i = 0; i < 50; i++)); do
1146 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# With fail_loc 0x160c set, run namespace LFSCK and wait for "stopped".
1149 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1150 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1151 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1152 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1153 mdd.${MDT_DEV}.lfsck_namespace |
1154 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1156 error "(5) unexpected status"
1159 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1160 echo "Prepared at $(date)."
# Start again without -r, limited to BASE_SPEED1; the status is expected to
# be scanning-phase2.
1162 local BASE_SPEED1=50
1164 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1167 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1168 [ "$STATUS" == "scanning-phase2" ] ||
1169 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1171 local SPEED=$($SHOW_NAMESPACE |
1172 awk '/^average_speed_phase2/ { print $2 }')
1173 # There may be time error, normally it should be less than 2 seconds.
1174 # We allow another 20% schedule error.
# NOTE(review): the text above says 20% but the factor applied is 13/10;
# confirm which one is intended.
1176 # MAX_MARGIN = 1.3 = 13 / 10
1177 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1178 RUN_TIME1 * 13 / 10))
1179 [ $SPEED -lt $MAX_SPEED ] || {
1181 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1182 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1185 # adjust speed limit
1186 local BASE_SPEED2=150
1188 do_facet $SINGLEMDS \
1189 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Lower bound: time-weighted mix of both limits, scaled by 7/10.
1192 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1193 # MIN_MARGIN = 0.7 = 7 / 10
1194 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1195 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1196 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# When the MDS backend is not ldiskfs, a too-low speed is reported through
# error_ignore (LU-5624) instead.
1197 [ $SPEED -gt $MIN_SPEED ] || {
1198 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1199 error_ignore LU-5624 \
1200 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1203 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1207 # MAX_MARGIN = 1.3 = 13 / 10
1208 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1209 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1210 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1211 [ $SPEED -lt $MAX_SPEED ] || {
1213 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1214 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1215 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 and wait for the scan to report "completed".
1218 do_facet $SINGLEMDS \
1219 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1220 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1221 mdd.${MDT_DEV}.lfsck_namespace |
1222 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1224 error "(11) unexpected status"
1227 run_test 9b "LFSCK speed control (2)"
# Body of test 10, registered below as "System is available during LFSCK
# scanning": common file operations must succeed while a speed-limited
# namespace LFSCK is still in scanning-phase1.
# Skipped when the MDS backend is not ldiskfs.
1231 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1232 skip "lookup(..)/linkea on ZFS issue" && return
# Even indices 0..998 are created while fail_loc 0x1603 is set.
1236 echo "Preparing more files with error at $(date)."
1237 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1238 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1240 for ((i = 0; i < 1000; i = $((i+2)))); do
1241 mkdir -p $DIR/$tdir/d${i}
1242 touch $DIR/$tdir/f${i}
1243 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd indices 1..999 are created while fail_loc 0x1604 is set.
1246 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1247 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1249 for ((i = 1; i < 1000; i = $((i+2)))); do
1250 mkdir -p $DIR/$tdir/d${i}
1251 touch $DIR/$tdir/f${i}
1252 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1255 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1256 echo "Prepared at $(date)."
# Add a hard link to f200, then remount the client.
1258 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
1260 umount_client $MOUNT
1261 mount_client $MOUNT || error "(3) Fail to start client!"
# Start namespace LFSCK with -r and a speed limit of 100.
1263 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1266 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1267 [ "$STATUS" == "scanning-phase1" ] ||
1268 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Operations run while the scan is active: ls, touch, mkdir, unlink, rm -rf,
# mv, hard link and symlink. Each one must succeed.
1270 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1272 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1274 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1276 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1278 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1280 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1282 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1284 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1285 error "(14) Fail to softlink!"
# The scan is expected to still be in phase 1 after those operations.
1287 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1288 [ "$STATUS" == "scanning-phase1" ] ||
1289 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 and wait for "completed".
1291 do_facet $SINGLEMDS \
1292 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1293 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1294 mdd.${MDT_DEV}.lfsck_namespace |
1295 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1297 error "(16) unexpected status"
1300 run_test 10 "System is available during LFSCK scanning"
# ost_remove_lastid: remove O/${idx}/LAST_ID and O/${idx}/d0/0 under the
# mount point of ost${ost}, mounting the target beforehand with mount_fstype
# and unmounting it afterwards with unmount_fstype.
# Returns 1 if mount_fstype fails, 2 if unmount_fstype fails.
# NOTE(review): ${ost} and ${idx} are not assigned on any visible line;
# presumably they come from $1 and $2 (the visible caller passes "1 0") --
# confirm.
1303 ost_remove_lastid() {
1306 local rcmd="do_facet ost${ost}"
1308 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1310 # step 1: local mount
1311 mount_fstype ost${ost} || return 1
1312 # step 2: remove the specified LAST_ID
# The brace expansion and $(facet_mntpt ...) are expanded by the calling
# shell before being passed to do_facet.
1313 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1315 unmount_fstype ost${ost} || return 2
# Body of test 11a, registered below as "LFSCK can rebuild lost last_id".
# Removes LAST_ID on ost1, restarts the OST, runs layout LFSCK on it and
# checks that the crashed_lastid flag appears and is then cleared.
1319 check_mount_and_prep
1320 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1321 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# NOTE(review): no visible line stops ost1 before LAST_ID is removed --
# confirm that happens on a line missing from this listing.
1326 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1328 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1329 error "(2) Fail to start ost1"
# fail_loc 0x160e with fail_val=3 is set on ost1 before the scan starts.
1331 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1332 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1334 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1335 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# Wait for the flags field to read "crashed_lastid".
1337 wait_update_facet ost1 "$LCTL get_param -n \
1338 obdfilter.${OST_DEV}.lfsck_layout |
1339 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1341 error "(5) unexpected status"
# Clear the fault injection and wait for the scan to finish.
1344 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1346 wait_update_facet ost1 "$LCTL get_param -n \
1347 obdfilter.${OST_DEV}.lfsck_layout |
1348 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1350 error "(6) unexpected status"
# After completion the flags field must be empty.
1353 echo "the LAST_ID(s) should have been rebuilt"
1354 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1355 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1357 run_test 11a "LFSCK can rebuild lost last_id"
# Body of test 11b, registered below as "LFSCK can rebuild crashed last_id".
# Creates objects while fail_loc 0x160d is set on ost1, restarts the OST,
# checks that the reported last_id went backwards, then runs layout LFSCK and
# checks that last_id returns to the earlier value after another restart.
1360 check_mount_and_prep
1361 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1363 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1364 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1365 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than precreated_ost_obj_count reports.
1367 local count=$(precreated_ost_obj_count 0 0)
1369 createmany -o $DIR/$tdir/f $((count + 32))
# lastid1: value after ':' on the last_id line matching the sequence read
# from osp.<...>.prealloc_last_seq on mds1.
1371 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1372 local seq=$(do_facet mds1 $LCTL get_param -n \
1373 osp.${proc_path}.prealloc_last_seq)
1374 local lastid1=$(do_facet ost1 "lctl get_param -n \
1375 obdfilter.${ost1_svc}.last_id" | grep $seq |
1376 awk -F: '{ print $2 }')
1378 umount_client $MOUNT
1379 stop ost1 || error "(1) Fail to stop ost1"
# fail_loc 0x215 is set on ost1 before it is started again.
1381 #define OBD_FAIL_OST_ENOSPC 0x215
1382 do_facet ost1 $LCTL set_param fail_loc=0x215
1384 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1385 error "(2) Fail to start ost1"
# Poll up to 60 iterations until last_id for $seq is non-empty.
# NOTE(review): lastid2 is not declared local on any visible line, and
# $lastid2 is unquoted in the test below -- confirm both are intended.
1387 for ((i = 0; i < 60; i++)); do
1388 lastid2=$(do_facet ost1 "lctl get_param -n \
1389 obdfilter.${ost1_svc}.last_id" | grep $seq |
1390 awk -F: '{ print $2 }')
1391 [ ! -z $lastid2 ] && break;
1395 echo "the on-disk LAST_ID should be smaller than the expected one"
1396 [ $lastid1 -gt $lastid2 ] ||
1397 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1399 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1400 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1402 wait_update_facet ost1 "$LCTL get_param -n \
1403 obdfilter.${OST_DEV}.lfsck_layout |
1404 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1406 error "(6) unexpected status"
# Restart ost1 again before re-reading last_id.
1409 stop ost1 || error "(7) Fail to stop ost1"
1411 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1412 error "(8) Fail to start ost1"
# last_id for $seq must equal lastid1 again; on failure dump the parameter.
1414 echo "the on-disk LAST_ID should have been rebuilt"
1415 wait_update_facet ost1 "$LCTL get_param -n \
1416 obdfilter.${ost1_svc}.last_id | grep $seq |
1417 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1418 do_facet ost1 $LCTL get_param -n \
1419 obdfilter.${ost1_svc}.last_id
1420 error "(9) expect lastid1 $seq:$lastid1"
1423 do_facet ost1 $LCTL set_param fail_loc=0
1424 stopall || error "(10) Fail to stopall"
1426 run_test 11b "LFSCK can rebuild crashed last_id"
# Body of test 12a, registered below as "single command to trigger LFSCK on
# all devices". For namespace and then layout LFSCK: start on all targets
# with -A at speed 1, stop with -A, restart at speed 0, and check the status
# reported by the targets after each step.
# Requires at least 2 MDTs.
1429 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# One directory per MDT (lfs mkdir -i k-1), each with 100 files.
1431 check_mount_and_prep
1432 for k in $(seq $MDSCOUNT); do
1433 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1434 createmany -o $DIR/$tdir/${k}/f 100 ||
1435 error "(0) Fail to create 100 files."
1438 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1439 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1440 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
# The trailing number passed to wait_all_targets* here and below follows
# the error-step numbering of this test.
# NOTE(review): presumably a step id used in failure messages -- confirm.
1442 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1443 wait_all_targets namespace scanning-phase1 3
1445 echo "Stop namespace LFSCK on all targets by single lctl command."
1446 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1447 error "(4) Fail to stop LFSCK on all devices!"
1449 echo "All the LFSCK targets should be in 'stopped' status."
1450 wait_all_targets_blocked namespace stopped 5
1452 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1453 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1454 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1456 echo "All the LFSCK targets should be in 'completed' status."
1457 wait_all_targets_blocked namespace completed 7
# The layout half runs between start_full_debug_logging and
# stop_full_debug_logging.
1459 start_full_debug_logging
1461 echo "Start layout LFSCK on all targets by single command (-s 1)."
1462 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1463 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1465 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1466 wait_all_targets layout scanning-phase1 9
1468 echo "Stop layout LFSCK on all targets by single lctl command."
1469 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1470 error "(10) Fail to stop LFSCK on all devices!"
1472 echo "All the LFSCK targets should be in 'stopped' status."
1473 wait_all_targets_blocked layout stopped 11
# Every OST must also report "stopped" in its own lfsck_layout parameter.
1475 for k in $(seq $OSTCOUNT); do
1476 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1477 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1478 awk '/^status/ { print $2 }')
1479 [ "$STATUS" == "stopped" ] ||
1480 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1483 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1484 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1485 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1487 echo "All the LFSCK targets should be in 'completed' status."
1488 wait_all_targets_blocked layout completed 14
1490 stop_full_debug_logging
1492 run_test 12a "single command to trigger LFSCK on all devices"
# Body of test 12b, registered below as "auto detect Lustre device".
# Starts LFSCK with -A but no -M and expects both namespace and layout scans
# to complete. When the mds1 node hosts more than one mdt device, starting
# without -M/-A is expected to fail.
1495 check_mount_and_prep
1497 echo "Start LFSCK without '-M' specified."
1498 do_facet mds1 $LCTL lfsck_start -A -r ||
1499 error "(0) Fail to start LFSCK without '-M'"
1501 wait_all_targets_blocked namespace completed 1
1502 wait_all_targets_blocked layout completed 2
# Count lines of "lctl dl" output whose third field contains "mdt".
1504 local count=$(do_facet mds1 $LCTL dl |
1505 awk '{ print $3 }' | grep mdt | wc -l)
1506 if [ $count -gt 1 ]; then
1508 echo "Start layout LFSCK on the node with multipe targets,"
1509 echo "but not specify '-M'/'-A' option. Should get failure."
# Success of lfsck_start is the error case; "|| true" keeps the list's exit
# status zero when lfsck_start fails as expected.
1511 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1512 error "(3) Start layout LFSCK should fail" || true
1515 run_test 12b "auto detect Lustre device"
# Body of test 13, registered below as "LFSCK can repair crashed lmm_oi".
# Creates two files (f0 via createmany, f1 as a PFL file) while fail_loc
# 0x160f is set on the MDS, runs layout LFSCK, and expects repaired_others
# to equal 2.
1519 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1520 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1521 echo "MDT-object FID."
1524 check_mount_and_prep
1526 echo "Inject failure stub to simulate bad lmm_oi"
1527 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1528 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1529 createmany -o $DIR/$tdir/f 1
1530 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1531 error "(0) Fail to create PFL $DIR/$tdir/f1"
1532 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1534 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1535 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1537 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1538 mdd.${MDT_DEV}.lfsck_layout |
1539 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1541 error "(2) unexpected status"
# Exactly two repairs are expected, matching the two files created above.
1544 local repaired=$($SHOW_LAYOUT |
1545 awk '/^repaired_others/ { print $2 }')
1546 [ $repaired -eq 2 ] ||
1547 error "(3) Fail to repair crashed lmm_oi: $repaired"
1549 run_test 13 "LFSCK can repair crashed lmm_oi"
# Body of test 14a, registered below as "LFSCK can repair MDT-object with
# dangling LOV EA reference (1)". Creates files while fail_loc 0x1610 is set
# on ost1, verifies that a first layout LFSCK run (-r) counts the dangling
# references without making the guard files usable, then that a second run
# with -c makes stat on the guard files succeed.
1553 echo "The OST-object referenced by the MDT-object should be there;"
1554 echo "otherwise, the LFSCK should re-create the missing OST-object."
1555 echo "without '--delay-create-ostobj' option."
1558 check_mount_and_prep
1559 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1561 echo "Inject failure stub to simulate dangling referenced MDT-object"
1562 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1563 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 16 more files than precreated_ost_obj_count reports, then guard0.
1564 local count=$(precreated_ost_obj_count 0 0)
1566 createmany -o $DIR/$tdir/f $((count + 16)) ||
1567 error "(0.1) Fail to create $DIR/$tdir/fx"
1568 touch $DIR/$tdir/guard0
# 16 composite-layout files, then guard1, still under the fault injection.
1570 for ((i = 0; i < 16; i++)); do
1571 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1572 $DIR/$tdir/f_comp${i} ||
1573 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1575 touch $DIR/$tdir/guard1
1577 do_facet ost1 $LCTL set_param fail_loc=0
1579 start_full_debug_logging
1581 # exhaust other pre-created dangling cases
1582 count=$(precreated_ost_obj_count 0 0)
1583 createmany -o $DIR/$tdir/a $count ||
1584 error "(0.5) Fail to create $count files."
1586 echo "'ls' should fail because of dangling referenced MDT-object"
1587 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1589 echo "Trigger layout LFSCK to find out dangling reference"
1590 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1592 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1593 mdd.${MDT_DEV}.lfsck_layout |
1594 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1596 error "(3) unexpected status"
# At least 32 dangling references must be counted.
1599 local repaired=$($SHOW_LAYOUT |
1600 awk '/^repaired_dangling/ { print $2 }')
1601 [ $repaired -ge 32 ] ||
1602 error "(4) Fail to repair dangling reference: $repaired"
1604 echo "'stat' should fail because of not repair dangling by default"
1605 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1606 error "(5.1) stat should fail"
1607 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1608 error "(5.2) stat should fail"
# Second run adds -c to the same start command.
1610 echo "Trigger layout LFSCK to repair dangling reference"
1611 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1613 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1614 mdd.${MDT_DEV}.lfsck_layout |
1615 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1617 error "(7) unexpected status"
1620 # There may be some async LFSCK updates in processing, wait for
1621 # a while until the target reparation has been done. LU-4970.
# Poll stat until the Size field reads 0 for each guard file; on failure,
# print the full stat output before raising the error.
1623 echo "'stat' should success after layout LFSCK repairing"
1624 wait_update_facet client "stat $DIR/$tdir/guard0 |
1625 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1626 stat $DIR/$tdir/guard0
1628 error "(8.1) unexpected size"
1631 wait_update_facet client "stat $DIR/$tdir/guard1 |
1632 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1633 stat $DIR/$tdir/guard1
1635 error "(8.2) unexpected size"
1638 repaired=$($SHOW_LAYOUT |
1639 awk '/^repaired_dangling/ { print $2 }')
1640 [ $repaired -ge 32 ] ||
1641 error "(9) Fail to repair dangling reference: $repaired"
1643 stop_full_debug_logging
# NOTE(review): the stop step announced by the echo is not on a visible
# line; only setupall is shown -- confirm.
1645 echo "stopall to cleanup object cache"
1648 setupall > /dev/null
1650 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Body of test 14b, registered below as "LFSCK can repair MDT-object with
# dangling LOV EA reference (2)". Same scenario as the (1) variant with one
# guard file, but LFSCK is started with the extra -o and -d options and
# completion is awaited through wait_all_targets_blocked.
1654 echo "The OST-object referenced by the MDT-object should be there;"
1655 echo "otherwise, the LFSCK should re-create the missing OST-object."
1656 echo "with '--delay-create-ostobj' option."
1659 check_mount_and_prep
1660 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1662 echo "Inject failure stub to simulate dangling referenced MDT-object"
1663 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1664 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 31 more files than precreated_ost_obj_count reports, plus guard.
1665 local count=$(precreated_ost_obj_count 0 0)
1667 createmany -o $DIR/$tdir/f $((count + 31))
1668 touch $DIR/$tdir/guard
1669 do_facet ost1 $LCTL set_param fail_loc=0
1671 start_full_debug_logging
1673 # exhaust other pre-created dangling cases
1674 count=$(precreated_ost_obj_count 0 0)
1675 createmany -o $DIR/$tdir/a $count ||
1676 error "(0) Fail to create $count files."
1678 echo "'ls' should fail because of dangling referenced MDT-object"
1679 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First run: -r -o -d (no -c).
1681 echo "Trigger layout LFSCK to find out dangling reference"
1682 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1684 wait_all_targets_blocked layout completed 3
1686 local repaired=$($SHOW_LAYOUT |
1687 awk '/^repaired_dangling/ { print $2 }')
1688 [ $repaired -ge 32 ] ||
1689 error "(4) Fail to repair dangling reference: $repaired"
1691 echo "'stat' should fail because of not repair dangling by default"
1692 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second run adds -c.
1694 echo "Trigger layout LFSCK to repair dangling reference"
1695 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1697 wait_all_targets_blocked layout completed 7
1699 # There may be some async LFSCK updates in processing, wait for
1700 # a while until the target reparation has been done. LU-4970.
# Poll stat until the Size field reads 0; on failure, print the full stat
# output before raising the error.
1702 echo "'stat' should success after layout LFSCK repairing"
1703 wait_update_facet client "stat $DIR/$tdir/guard |
1704 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1705 stat $DIR/$tdir/guard
1707 error "(8) unexpected size"
1710 repaired=$($SHOW_LAYOUT |
1711 awk '/^repaired_dangling/ { print $2 }')
1712 [ $repaired -ge 32 ] ||
1713 error "(9) Fail to repair dangling reference: $repaired"
1715 stop_full_debug_logging
# NOTE(review): the stop step announced by the echo is not on a visible
# line; only setupall is shown -- confirm.
1717 echo "stopall to cleanup object cache"
1720 setupall > /dev/null
1722 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Body of test 15a, registered below as "LFSCK can repair unmatched
# MDT-object/OST-object pairs (1)". Writes files while fail_loc 0x1611 is set
# on all OST nodes, runs layout LFSCK, and expects repaired_unmatched_pair to
# be at least 3.
1726 echo "If the OST-object referenced by the MDT-object back points"
1727 echo "to some non-exist MDT-object, then the LFSCK should repair"
1728 echo "the OST-object to back point to the right MDT-object."
1731 check_mount_and_prep
1732 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1734 echo "Inject failure stub to make the OST-object to back point to"
1735 echo "non-exist MDT-object."
1736 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1738 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1739 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
# NOTE(review): the target path of this setstripe is on a line missing from
# this listing; the error message names $DIR/$tdir/f1 -- confirm.
1740 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1742 error "(0) Fail to create PFL $DIR/$tdir/f1"
1743 # 'dd' will trigger punch RPC firstly on every OST-objects.
1744 # So even though some OST-object will not be write by 'dd',
1745 # as long as it is allocated (may be NOT allocated in pfl_3b)
1746 # its layout information will be set also.
1747 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1748 cancel_lru_locks osc
1749 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1751 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1752 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1754 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1755 mdd.${MDT_DEV}.lfsck_layout |
1756 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1758 error "(2) unexpected status"
1761 local repaired=$($SHOW_LAYOUT |
1762 awk '/^repaired_unmatched_pair/ { print $2 }')
1763 [ $repaired -ge 3 ] ||
1764 error "(3) Fail to repair unmatched pair: $repaired"
1766 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Body of test 15b, registered below as "LFSCK can repair unmatched
# MDT-object/OST-object pairs (2)". Writes files while fail_loc 0x1612 is set
# on all OST nodes, runs layout LFSCK, and expects repaired_unmatched_pair to
# equal exactly 4.
1770 echo "If the OST-object referenced by the MDT-object back points"
1771 echo "to other MDT-object that doesn't recognize the OST-object,"
1772 echo "then the LFSCK should repair it to back point to the right"
1773 echo "MDT-object (the first one)."
# A guard file is written before the fault is injected.
1776 check_mount_and_prep
1777 mkdir -p $DIR/$tdir/0
1778 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1779 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1780 cancel_lru_locks osc
1782 echo "Inject failure stub to make the OST-object to back point to"
1783 echo "other MDT-object"
# NOTE(review): the initial value of "stripes" is not on a visible line;
# it becomes 2 when at least two OSTs exist -- confirm the default.
1786 [ $OSTCOUNT -ge 2 ] && stripes=2
1788 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1789 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1790 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
# NOTE(review): the target path of this setstripe is on a line missing from
# this listing; the error message names $DIR/$tdir/f1 -- confirm.
1791 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1793 error "(0) Fail to create PFL $DIR/$tdir/f1"
1794 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1795 cancel_lru_locks osc
1796 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1798 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1799 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1801 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1802 mdd.${MDT_DEV}.lfsck_layout |
1803 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1805 error "(2) unexpected status"
1808 local repaired=$($SHOW_LAYOUT |
1809 awk '/^repaired_unmatched_pair/ { print $2 }')
1810 [ $repaired -eq 4 ] ||
1811 error "(3) Fail to repair unmatched pair: $repaired"
1813 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Body of test 15c, registered below as "LFSCK can repair unmatched
# MDT-object/OST-object pairs (3)". Runs layout LFSCK on all targets while a
# delayed directory migration is in flight, then expects one unmatched pair
# repaired and zero multiple-reference repairs.
# Requires at least 2 MDTs; skipped when the MDS version is 2.7.55 or newer.
1816 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1818 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1819 skip "Skip the test after 2.7.55 see LU-6437" && return
1822 echo "According to current metadata migration implementation,"
1823 echo "before the old MDT-object is removed, both the new MDT-object"
1824 echo "and old MDT-object will reference the same LOV layout. Then if"
1825 echo "the layout LFSCK finds the new MDT-object by race, it will"
1826 echo "regard related OST-object(s) as multiple referenced case, and"
1827 echo "will try to create new OST-object(s) for the new MDT-object."
1828 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1829 echo "MDT-object before confirm the multiple referenced case."
# Directory a1 is created with "lfs mkdir -i 1" and holds one 1 MiB file.
1832 check_mount_and_prep
1833 $LFS mkdir -i 1 $DIR/$tdir/a1
1834 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1835 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1836 cancel_lru_locks osc
1838 echo "Inject failure stub on MDT1 to delay the migration"
1840 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1841 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# The migration is started in the background so that LFSCK can be started
# while it is still running.
# NOTE(review): no wait for the background job is visible -- confirm.
1842 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1843 $LFS migrate -m 0 $DIR/$tdir/a1 &
1846 echo "Trigger layout LFSCK to race with the migration"
1847 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1849 wait_all_targets_blocked layout completed 2
1851 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1852 local repaired=$($SHOW_LAYOUT |
1853 awk '/^repaired_unmatched_pair/ { print $2 }')
1854 [ $repaired -eq 1 ] ||
1855 error "(3) Fail to repair unmatched pair: $repaired"
# No multiple-reference repair may have been made.
1857 repaired=$($SHOW_LAYOUT |
1858 awk '/^repaired_multiple_referenced/ { print $2 }')
1859 [ $repaired -eq 0 ] ||
1860 error "(4) Unexpectedly repaird multiple references: $repaired"
1862 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Body of test 16, registered below as "LFSCK can repair inconsistent
# MDT-object/OST-object owner". Changes a file's owner while fail_loc 0x1613
# is set on the MDS, runs layout LFSCK, and expects
# repaired_inconsistent_owner to equal 1.
1866 echo "If the OST-object's owner information does not match the owner"
1867 echo "information stored in the MDT-object, then the LFSCK trust the"
1868 echo "MDT-object and update the OST-object's owner information."
1871 check_mount_and_prep
1872 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1873 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1874 cancel_lru_locks osc
1876 echo "Inject failure stub to skip OST-object owner changing"
1877 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1878 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
# NOTE(review): "1.1" uses the dot form of OWNER.GROUP; "1:1" is the
# documented separator -- confirm whether the dot form is deliberate.
1879 chown 1.1 $DIR/$tdir/f0
1880 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1882 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1885 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1887 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1888 mdd.${MDT_DEV}.lfsck_layout |
1889 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1891 error "(2) unexpected status"
1894 local repaired=$($SHOW_LAYOUT |
1895 awk '/^repaired_inconsistent_owner/ { print $2 }')
1896 [ $repaired -eq 1 ] ||
1897 error "(3) Fail to repair inconsistent owner: $repaired"
1899 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Body of test 17, registered below as "LFSCK can repair multiple
# references". While fail_loc 0x1614 is set on the MDS, creates guard, f0 and
# f1; checks that f0 and f1 report guard's size; runs layout LFSCK expecting
# repaired_multiple_referenced to equal 2; then rewrites f0 and f1 and checks
# that guard's size is unchanged.
1903 echo "If more than one MDT-objects reference the same OST-object,"
1904 echo "and the OST-object only recognizes one MDT-object, then the"
1905 echo "LFSCK should create new OST-objects for such non-recognized"
1909 check_mount_and_prep
1910 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1912 echo "Inject failure stub to make two MDT-objects to refernce"
1913 echo "the OST-object"
1915 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1916 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# guard receives 1 MiB of data; client locks are dropped after each create.
1917 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1918 cancel_lru_locks mdc
1919 cancel_lru_locks osc
1921 createmany -o $DIR/$tdir/f 1
1922 cancel_lru_locks mdc
1923 cancel_lru_locks osc
# NOTE(review): the target path of this setstripe is on a line missing from
# this listing; the error message names $DIR/$tdir/f1 -- confirm.
1925 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
1927 error "(0) Fail to create PFL $DIR/$tdir/f1"
1928 cancel_lru_locks mdc
1929 cancel_lru_locks osc
1930 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Before repair, f0 and f1 are expected to show 1048576 bytes, the amount
# written to guard.
1932 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1933 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
1934 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1935 [ $size -eq 1048576 ] ||
1936 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
1938 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
1939 [ $size -eq 1048576 ] ||
1940 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
1942 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1945 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1947 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1948 mdd.${MDT_DEV}.lfsck_layout |
1949 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1951 error "(3) unexpected status"
1954 local repaired=$($SHOW_LAYOUT |
1955 awk '/^repaired_multiple_referenced/ { print $2 }')
1956 [ $repaired -eq 2 ] ||
1957 error "(4) Fail to repair multiple references: $repaired"
# After repair, writing 2 MiB to f0 must leave guard at 1048576 bytes.
1959 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1960 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1961 error "(5) Fail to write f0."
1962 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1963 [ $size -eq 1048576 ] ||
1964 error "(6) guard size should be 1048576, but got $size"
# Same check after writing 2 MiB to f1.
1966 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
1967 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
1968 error "(7) Fail to write f1."
1969 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1970 [ $size -eq 1048576 ] ||
1971 error "(8) guard size should be 1048576, but got $size"
1973 run_test 17 "LFSCK can repair multiple references"
# Top-level statement (not inside a test body): sets the local "debug"
# parameter to "+cache" through lctl and discards the command's stdout.
# NOTE(review): presumably adds the cache flag to the debug mask for the
# tests that follow -- confirm.
1975 $LCTL set_param debug=+cache > /dev/null
# Test 18a: the MDT-object still exists but its layout EA has been dropped.
# Flow: create a1/f1 (plus a2/f2 when MDSCOUNT >= 2) and the PFL file f3,
# chown them while fail_loc 0x1615 is set on the MDS(es), check that the
# reported sizes are now wrong, run layout LFSCK with "-r -o", then check the
# repaired_orphan counters and that the saved sizes are reported again.
1979 echo "The target MDT-object is there, but related stripe information"
1980 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1981 echo "layout EA entries."
1984 check_mount_and_prep
1985 $LFS mkdir -i 0 $DIR/$tdir/a1
1986 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1987 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# Field 6 of "ls -il" (the size column) is what every size check below compares.
1989 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1991 $LFS path2fid $DIR/$tdir/a1/f1
1992 $LFS getstripe $DIR/$tdir/a1/f1
# Second file, only created when a second MDS is configured.
1994 if [ $MDSCOUNT -ge 2 ]; then
1995 $LFS mkdir -i 1 $DIR/$tdir/a2
1996 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1997 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1998 $LFS path2fid $DIR/$tdir/a2/f2
1999 $LFS getstripe $DIR/$tdir/a2/f2
# Third file: a PFL file created with two "-E" components.
2002 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2003 error "(0) Fail to create PFL $DIR/$tdir/f3"
2005 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2007 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2009 $LFS path2fid $DIR/$tdir/f3
2010 $LFS getstripe $DIR/$tdir/f3
2012 cancel_lru_locks osc
# With fail_loc 0x1615 armed, the chown calls are the operations that
# trigger the injected "lost stripe" failure.
2014 echo "Inject failure, to make the MDT-object lost its layout EA"
2015 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2016 do_facet mds1 $LCTL set_param fail_loc=0x1615
2017 chown 1.1 $DIR/$tdir/a1/f1
2019 if [ $MDSCOUNT -ge 2 ]; then
2020 do_facet mds2 $LCTL set_param fail_loc=0x1615
2021 chown 1.1 $DIR/$tdir/a2/f2
2024 chown 1.1 $DIR/$tdir/f3
# Disarm the failure injection and drop client locks before re-reading sizes.
2029 do_facet mds1 $LCTL set_param fail_loc=0
2030 if [ $MDSCOUNT -ge 2 ]; then
2031 do_facet mds2 $LCTL set_param fail_loc=0
2034 cancel_lru_locks mdc
2035 cancel_lru_locks osc
2037 echo "The file size should be incorrect since layout EA is lost"
2038 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2039 [ "$cur_size" != "$saved_size1" ] ||
2040 error "(1) Expect incorrect file1 size"
# f2 is compared with saved_size1: it was written with the same dd
# parameters (bs=1M count=2) as f1.
2042 if [ $MDSCOUNT -ge 2 ]; then
2043 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2044 [ "$cur_size" != "$saved_size1" ] ||
2045 error "(2) Expect incorrect file2 size"
2048 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2049 [ "$cur_size" != "$saved_size2" ] ||
2050 error "(1.2) Expect incorrect file3 size"
2052 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2053 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2055 for k in $(seq $MDSCOUNT); do
2056 # The LFSCK status query interval is 30 seconds. For the case
2057 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2058 # time to guarantee the status sync up.
2059 wait_update_facet mds${k} "$LCTL get_param -n \
2060 mdd.$(facet_svc mds${k}).lfsck_layout |
2061 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2062 error "(4) MDS${k} is not the expected 'completed'"
# OSTs are checked once (no waiting) after the MDS loop above has completed.
2065 for k in $(seq $OSTCOUNT); do
2066 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2067 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2068 awk '/^status/ { print $2 }')
2069 [ "$cur_status" == "completed" ] ||
2070 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counters: 3 on mds1, 2 on mds2.
2073 local repaired=$(do_facet mds1 $LCTL get_param -n \
2074 mdd.$(facet_svc mds1).lfsck_layout |
2075 awk '/^repaired_orphan/ { print $2 }')
2076 [ $repaired -eq 3 ] ||
2077 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2079 if [ $MDSCOUNT -ge 2 ]; then
2080 repaired=$(do_facet mds2 $LCTL get_param -n \
2081 mdd.$(facet_svc mds2).lfsck_layout |
2082 awk '/^repaired_orphan/ { print $2 }')
2083 [ $repaired -eq 2 ] ||
2084 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print FID and layout of each file again for the test log.
2087 $LFS path2fid $DIR/$tdir/a1/f1
2088 $LFS getstripe $DIR/$tdir/a1/f1
2090 if [ $MDSCOUNT -ge 2 ]; then
2091 $LFS path2fid $DIR/$tdir/a2/f2
2092 $LFS getstripe $DIR/$tdir/a2/f2
2095 $LFS path2fid $DIR/$tdir/f3
2096 $LFS getstripe $DIR/$tdir/f3
2098 echo "The file size should be correct after layout LFSCK scanning"
2099 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2100 [ "$cur_size" == "$saved_size1" ] ||
2101 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2103 if [ $MDSCOUNT -ge 2 ]; then
2104 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2105 [ "$cur_size" == "$saved_size1" ] ||
2106 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is on f3, so the message names file3 (it previously said file1).
2109 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2110 [ "$cur_size" == "$saved_size2" ] ||
2111 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2113 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test 18b: the MDT-object itself is lost.
# Flow: create a1/f1 (plus a2/f2 when MDSCOUNT >= 2) and the PFL file f3 and
# record their FIDs, remove f1/f2 while fail_loc 0x1616 is set, run layout
# LFSCK with "-r -o", check the repaired_orphan counters, move the
# "<fid>-R-0" entries from .lustre/lost+found/MDTxxxx back into the
# namespace, and finally compare the sizes with the saved ones.
2117 echo "The target MDT-object is lost. The LFSCK should re-create the"
2118 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2119 echo "can move it back to normal namespace manually."
2122 check_mount_and_prep
2123 $LFS mkdir -i 0 $DIR/$tdir/a1
2124 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2125 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# Field 6 of "ls -il" (the size column) is what the size checks compare.
2126 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
# The FID is needed later to build the lost+found entry name.
2127 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2129 $LFS getstripe $DIR/$tdir/a1/f1
2131 if [ $MDSCOUNT -ge 2 ]; then
2132 $LFS mkdir -i 1 $DIR/$tdir/a2
2133 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2134 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
# Declared local like fid1/fid3 so it does not leak into the global scope.
2135 local fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2137 $LFS getstripe $DIR/$tdir/a2/f2
2140 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2141 error "(0) Fail to create PFL $DIR/$tdir/f3"
2143 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2145 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2146 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2148 $LFS getstripe $DIR/$tdir/f3
2150 cancel_lru_locks osc
# With fail_loc 0x1616 armed, the rm calls are the operations that trigger
# the injected "lost MDT-object" failure.
2152 echo "Inject failure, to simulate the case of missing the MDT-object"
2153 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2154 do_facet mds1 $LCTL set_param fail_loc=0x1616
2155 rm -f $DIR/$tdir/a1/f1
2157 if [ $MDSCOUNT -ge 2 ]; then
2158 do_facet mds2 $LCTL set_param fail_loc=0x1616
2159 rm -f $DIR/$tdir/a2/f2
# Disarm the failure injection and drop client locks.
2167 do_facet mds1 $LCTL set_param fail_loc=0
2168 if [ $MDSCOUNT -ge 2 ]; then
2169 do_facet mds2 $LCTL set_param fail_loc=0
2172 cancel_lru_locks mdc
2173 cancel_lru_locks osc
2175 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2176 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2178 for k in $(seq $MDSCOUNT); do
2179 # The LFSCK status query interval is 30 seconds. For the case
2180 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2181 # time to guarantee the status sync up.
2182 wait_update_facet mds${k} "$LCTL get_param -n \
2183 mdd.$(facet_svc mds${k}).lfsck_layout |
2184 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2185 error "(2) MDS${k} is not the expected 'completed'"
2188 for k in $(seq $OSTCOUNT); do
2189 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2190 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2191 awk '/^status/ { print $2 }')
2192 [ "$cur_status" == "completed" ] ||
2193 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counters: 3 on mds1, 2 on mds2.
2196 local repaired=$(do_facet mds1 $LCTL get_param -n \
2197 mdd.$(facet_svc mds1).lfsck_layout |
2198 awk '/^repaired_orphan/ { print $2 }')
2199 [ $repaired -eq 3 ] ||
2200 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2202 if [ $MDSCOUNT -ge 2 ]; then
2203 repaired=$(do_facet mds2 $LCTL get_param -n \
2204 mdd.$(facet_svc mds2).lfsck_layout |
2205 awk '/^repaired_orphan/ { print $2 }')
2206 [ $repaired -eq 2 ] ||
2207 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# The re-created objects are looked up as "<fid>-R-0" in lost+found.
2210 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2211 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2212 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2214 if [ $MDSCOUNT -ge 2 ]; then
2215 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2216 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Label is "(6.1)" so it no longer collides with the "(5)" used for fid1.
2219 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2220 error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
# Print FID and layout of the restored files for the test log.
2222 $LFS path2fid $DIR/$tdir/a1/f1
2223 $LFS getstripe $DIR/$tdir/a1/f1
2225 if [ $MDSCOUNT -ge 2 ]; then
2226 $LFS path2fid $DIR/$tdir/a2/f2
2227 $LFS getstripe $DIR/$tdir/a2/f2
2230 $LFS path2fid $DIR/$tdir/f3
2231 $LFS getstripe $DIR/$tdir/f3
2233 echo "The file size should be correct after layout LFSCK scanning"
2234 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2235 [ "$cur_size" == "$saved_size1" ] ||
2236 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2238 if [ $MDSCOUNT -ge 2 ]; then
2239 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2240 [ "$cur_size" == "$saved_size1" ] ||
2241 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is on f3, so the message names file3 (it previously said file1).
2244 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2245 [ "$cur_size" == "$saved_size2" ] ||
2246 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2248 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test 18c: the MDT-object is lost AND the OST-objects were written while
# fail_loc 0x1617 was set on the OST nodes.
# Flow: write a1/f1 (plus a2/f2 when MDSCOUNT >= 2) and the PFL file f3 with
# 0x1617 armed on the OSTs, remove f1/f2 with 0x1616 armed on the MDS(es),
# run layout LFSCK with "-r -o", then check the repaired_orphan counters and
# look for "*-N-*" entries under .lustre/lost+found/MDT0000 and MDT0001.
2252 echo "The target MDT-object is lost, and the OST-object FID is missing."
2253 echo "The LFSCK should re-create the MDT-object with new FID under the "
2254 echo "directory .lustre/lost+found/MDTxxxx."
2257 check_mount_and_prep
2258 $LFS mkdir -i 0 $DIR/$tdir/a1
2259 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
# The OST-side failure is armed before any data is written.
2261 echo "Inject failure, to simulate the case of missing parent FID"
2262 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2263 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2265 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2266 $LFS getstripe $DIR/$tdir/a1/f1
2268 if [ $MDSCOUNT -ge 2 ]; then
2269 $LFS mkdir -i 1 $DIR/$tdir/a2
2270 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2271 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2272 $LFS getstripe $DIR/$tdir/a2/f2
2275 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2276 error "(0) Fail to create PFL $DIR/$tdir/f3"
2278 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2279 $LFS getstripe $DIR/$tdir/f3
2281 cancel_lru_locks osc
2282 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Second injection: remove f1/f2 with 0x1616 armed on the MDS(es).
2284 echo "Inject failure, to simulate the case of missing the MDT-object"
2285 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2286 do_facet mds1 $LCTL set_param fail_loc=0x1616
2287 rm -f $DIR/$tdir/a1/f1
2289 if [ $MDSCOUNT -ge 2 ]; then
2290 do_facet mds2 $LCTL set_param fail_loc=0x1616
2291 rm -f $DIR/$tdir/a2/f2
2299 do_facet mds1 $LCTL set_param fail_loc=0
2300 if [ $MDSCOUNT -ge 2 ]; then
2301 do_facet mds2 $LCTL set_param fail_loc=0
2304 cancel_lru_locks mdc
2305 cancel_lru_locks osc
2307 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2308 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2310 for k in $(seq $MDSCOUNT); do
2311 # The LFSCK status query interval is 30 seconds. For the case
2312 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2313 # time to guarantee the status sync up.
2314 wait_update_facet mds${k} "$LCTL get_param -n \
2315 mdd.$(facet_svc mds${k}).lfsck_layout |
2316 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2317 error "(2) MDS${k} is not the expected 'completed'"
2320 for k in $(seq $OSTCOUNT); do
2321 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2322 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2323 awk '/^status/ { print $2 }')
2324 [ "$cur_status" == "completed" ] ||
2325 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): $expected is assigned on lines that are not part of this
# excerpt; it appears to depend on MDSCOUNT - confirm against the full file.
2328 if [ $MDSCOUNT -ge 2 ]; then
2334 local repaired=$(do_facet mds1 $LCTL get_param -n \
2335 mdd.$(facet_svc mds1).lfsck_layout |
2336 awk '/^repaired_orphan/ { print $2 }')
2337 [ $repaired -eq $expected ] ||
2338 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# mds2 is expected to report no repaired orphans in this scenario.
2340 if [ $MDSCOUNT -ge 2 ]; then
2341 repaired=$(do_facet mds2 $LCTL get_param -n \
2342 mdd.$(facet_svc mds2).lfsck_layout |
2343 awk '/^repaired_orphan/ { print $2 }')
2344 [ $repaired -eq 0 ] ||
2345 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2348 ls -ail $MOUNT/.lustre/lost+found/
# The -name pattern is quoted so the shell hands it to find unexpanded
# instead of globbing it against the current directory; cname is local so it
# does not leak into the global scope.
2350 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2351 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2352 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
2354 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2357 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2358 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2359 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2361 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
2362 [ ! -z "$cname" ] ||
2363 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2365 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d: the layout EA of f2/f4 is changed (fail_loc 0x1618) while the
# files' original OST-objects stay behind as orphans.
# Flow: create f1..f4 under a1 (f4 is a PFL file), chown f2/f4 and remove
# f1/f3 with 0x1618 armed, restart the setup, run layout LFSCK with
# "-r -o -c -d", then expect repaired_orphan == 2, repaired_dangling == 0 and
# the saved sizes of f2/f4 to be reported again.
2369 echo "The target MDT-object layout EA is corrupted, but the right"
2370 echo "OST-object is still alive as orphan. The layout LFSCK will"
2371 echo "not create new OST-object to occupy such slot."
2374 check_mount_and_prep
# f1 and f3 hold "guard" and are removed later; f2 and f4 hold "foo".
2376 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2377 echo "guard" > $DIR/$tdir/a1/f1
2378 echo "foo" > $DIR/$tdir/a1/f2
2380 echo "guard" > $DIR/$tdir/a1/f3
2381 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2382 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2383 echo "foo" > $DIR/$tdir/a1/f4
# Field 6 of "ls -il" (the size column) is saved for the final comparison.
2385 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2386 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Print FID and layout of all four files for the test log.
2387 $LFS path2fid $DIR/$tdir/a1/f1
2388 $LFS getstripe $DIR/$tdir/a1/f1
2389 $LFS path2fid $DIR/$tdir/a1/f2
2390 $LFS getstripe $DIR/$tdir/a1/f2
2391 $LFS path2fid $DIR/$tdir/a1/f3
2392 $LFS getstripe $DIR/$tdir/a1/f3
2393 $LFS path2fid $DIR/$tdir/a1/f4
2394 $LFS getstripe $DIR/$tdir/a1/f4
2395 cancel_lru_locks osc
2397 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2398 echo "to reference the same OST-object (which is f1's OST-obejct)."
2399 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2400 echo "dangling reference case, but f2's old OST-object is there."
2402 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2403 echo "to reference the same OST-object (which is f3's OST-obejct)."
2404 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2405 echo "dangling reference case, but f4's old OST-object is there."
# The chown/rm calls below run while fail_loc 0x1618 is armed on $SINGLEMDS.
2408 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2409 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2410 chown 1.1 $DIR/$tdir/a1/f2
2411 chown 1.1 $DIR/$tdir/a1/f4
2412 rm -f $DIR/$tdir/a1/f1
2413 rm -f $DIR/$tdir/a1/f3
2416 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): the stop step announced here is not visible in this excerpt;
# only the setupall call is.
2418 echo "stopall to cleanup object cache"
2421 setupall > /dev/null
2423 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2424 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
2426 for k in $(seq $MDSCOUNT); do
2427 # The LFSCK status query interval is 30 seconds. For the case
2428 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2429 # time to guarantee the status sync up.
2430 wait_update_facet mds${k} "$LCTL get_param -n \
2431 mdd.$(facet_svc mds${k}).lfsck_layout |
2432 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2433 error "(3) MDS${k} is not the expected 'completed'"
2436 for k in $(seq $OSTCOUNT); do
2437 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2438 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2439 awk '/^status/ { print $2 }')
2440 [ "$cur_status" == "completed" ] ||
2441 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Two orphans (one per damaged file) must be repaired ...
2444 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2445 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2446 awk '/^repaired_orphan/ { print $2 }')
2447 [ $repaired -eq 2 ] ||
2448 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
# ... and no dangling reference may be reported as repaired.
2450 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2451 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2452 awk '/^repaired_dangling/ { print $2 }')
2453 [ $repaired -eq 0 ] ||
2454 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
2456 echo "The file size should be correct after layout LFSCK scanning"
2457 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2458 [ "$cur_size" == "$saved_size1" ] ||
2459 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2461 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2462 [ "$cur_size" == "$saved_size2" ] ||
2463 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# Dump content, FID and layout of f2/f4 to the log; the output is not asserted.
2465 echo "The LFSCK should find back the original data."
2466 cat $DIR/$tdir/a1/f2
2467 $LFS path2fid $DIR/$tdir/a1/f2
2468 $LFS getstripe $DIR/$tdir/a1/f2
2469 cat $DIR/$tdir/a1/f4
2470 $LFS path2fid $DIR/$tdir/a1/f4
2471 $LFS getstripe $DIR/$tdir/a1/f4
2473 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e: same damage as 18d (fail_loc 0x1618), but f2/f4 are appended to
# while LFSCK sits in "scanning-phase2" (held there with fail_loc 0x1602,
# fail_val=10).
# Expected result: repaired_orphan == 2 and exactly two "*-C-*" stub files
# under .lustre/lost+found/MDT0000 whose sizes equal one of the saved
# f2/f4 sizes.
2477 echo "The target MDT-object layout EA slot is occpuied by some new"
2478 echo "created OST-object when repair dangling reference case. Such"
2479 echo "conflict OST-object has been modified by others. To keep the"
2480 echo "new data, the LFSCK will create a new file to refernece this"
2481 echo "old orphan OST-object."
2484 check_mount_and_prep
# f1 and f3 hold "guard" and are removed later; f2 and f4 hold "foo".
2486 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2487 echo "guard" > $DIR/$tdir/a1/f1
2488 echo "foo" > $DIR/$tdir/a1/f2
2490 echo "guard" > $DIR/$tdir/a1/f3
2491 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2492 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2493 echo "foo" > $DIR/$tdir/a1/f4
# Field 6 of "ls -il" (the size column) is saved for the stub-size checks.
2495 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2496 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2498 $LFS path2fid $DIR/$tdir/a1/f1
2499 $LFS getstripe $DIR/$tdir/a1/f1
2500 $LFS path2fid $DIR/$tdir/a1/f2
2501 $LFS getstripe $DIR/$tdir/a1/f2
2502 $LFS path2fid $DIR/$tdir/a1/f3
2503 $LFS getstripe $DIR/$tdir/a1/f3
2504 $LFS path2fid $DIR/$tdir/a1/f4
2505 $LFS getstripe $DIR/$tdir/a1/f4
2506 cancel_lru_locks osc
2508 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2509 echo "to reference the same OST-object (which is f1's OST-obejct)."
2510 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2511 echo "dangling reference case, but f2's old OST-object is there."
2513 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2514 echo "to reference the same OST-object (which is f3's OST-obejct)."
2515 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2516 echo "dangling reference case, but f4's old OST-object is there."
# The chown/rm calls below run while fail_loc 0x1618 is armed on $SINGLEMDS.
2519 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2520 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2521 chown 1.1 $DIR/$tdir/a1/f2
2522 chown 1.1 $DIR/$tdir/a1/f4
2523 rm -f $DIR/$tdir/a1/f1
2524 rm -f $DIR/$tdir/a1/f3
2527 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2529 echo "stopall to cleanup object cache"
2532 setupall > /dev/null
# Arm the LFSCK delay so the scan can be observed in "scanning-phase2".
2534 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2535 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2537 start_full_debug_logging
2539 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2540 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2542 wait_update_facet mds1 "$LCTL get_param -n \
2543 mdd.$(facet_svc mds1).lfsck_layout |
2544 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2545 error "(3) MDS1 is not the expected 'scanning-phase2'"
2547 # to guarantee all updates are synced.
# Append to f2/f4 while LFSCK is delayed, then release the delay.
2551 echo "Write new data to f2/f4 to modify the new created OST-object."
2552 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2553 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
2555 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2557 for k in $(seq $MDSCOUNT); do
2558 # The LFSCK status query interval is 30 seconds. For the case
2559 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2560 # time to guarantee the status sync up.
2561 wait_update_facet mds${k} "$LCTL get_param -n \
2562 mdd.$(facet_svc mds${k}).lfsck_layout |
2563 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2564 error "(4) MDS${k} is not the expected 'completed'"
2567 for k in $(seq $OSTCOUNT); do
2568 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2569 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2570 awk '/^status/ { print $2 }')
2571 [ "$cur_status" == "completed" ] ||
2572 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2575 stop_full_debug_logging
2577 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2578 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2579 awk '/^repaired_orphan/ { print $2 }')
2580 [ $repaired -eq 2 ] ||
2581 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2583 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2584 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2585 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Exactly two "*-C-*" stubs are expected; list them if the count is wrong.
2587 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2588 if [ $count -ne 2 ]; then
2589 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2590 error "(8) Expect 2 stubs under lost+found, but got $count"
# The -name pattern is quoted so the shell hands it to find unexpanded
# instead of globbing it against the current directory; cname is local so it
# does not leak into the global scope. First stub (head -n 1):
2593 echo "The stub file should keep the original f2 or f4 data"
2594 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | head -n 1)
2595 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2596 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2597 error "(9) Got unexpected $cur_size"
2600 $LFS path2fid $cname
2601 $LFS getstripe $cname
# Second stub (tail -n 1), same size check.
2603 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | tail -n 1)
2604 cur_size=$(ls -il $cname | awk '{ print $6 }')
2605 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2606 error "(10) Got unexpected $cur_size"
2609 $LFS path2fid $cname
2610 $LFS getstripe $cname
# Dump content, FID and layout of f2/f4 to the log; the output is not asserted.
2612 echo "The f2/f4 should contains new data."
2613 cat $DIR/$tdir/a1/f2
2614 $LFS path2fid $DIR/$tdir/a1/f2
2615 $LFS getstripe $DIR/$tdir/a1/f2
2616 cat $DIR/$tdir/a1/f4
2617 $LFS path2fid $DIR/$tdir/a1/f4
2618 $LFS getstripe $DIR/$tdir/a1/f4
2620 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f: orphan handling must skip an OST that failed during the first
# scanning stage. Requires at least 2 OSTs.
# Flow: create single-stripe and two-stripe files (a1/f1, a2/f2, plus a3/f3
# and a4/f4 when MDSCOUNT >= 2), remove them with fail_loc 0x1616 armed, run
# layout LFSCK with fail_loc 0x161c armed on mds1 and expect "partial" on the
# MDS(es) and ost1 but "completed" on ost2 with 1 repaired orphan per MDS;
# then run LFSCK again without the failure and expect "completed" everywhere
# with 2 repaired orphans per MDS.
2623 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2626 echo "The target MDT-object is lost. The LFSCK should re-create the"
2627 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2628 echo "to verify some OST-object(s) during the first stage-scanning,"
2629 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2630 echo "should not be affected."
2633 check_mount_and_prep
2634 $LFS mkdir -i 0 $DIR/$tdir/a1
2635 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2636 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2637 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2638 $LFS mkdir -i 0 $DIR/$tdir/a2
2639 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2640 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2641 $LFS getstripe $DIR/$tdir/a1/f1
2642 $LFS getstripe $DIR/$tdir/a2/f2
# Same file set again under directories created with "lfs mkdir -i 1".
2644 if [ $MDSCOUNT -ge 2 ]; then
2645 $LFS mkdir -i 1 $DIR/$tdir/a3
2646 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2647 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2648 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2649 $LFS mkdir -i 1 $DIR/$tdir/a4
2650 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2651 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2652 $LFS getstripe $DIR/$tdir/a3/f3
2653 $LFS getstripe $DIR/$tdir/a4/f4
2656 cancel_lru_locks osc
# The rm calls run while fail_loc 0x1616 is armed on the owning MDS.
2658 echo "Inject failure, to simulate the case of missing the MDT-object"
2659 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2660 do_facet mds1 $LCTL set_param fail_loc=0x1616
2661 rm -f $DIR/$tdir/a1/f1
2662 rm -f $DIR/$tdir/a2/f2
2664 if [ $MDSCOUNT -ge 2 ]; then
2665 do_facet mds2 $LCTL set_param fail_loc=0x1616
2666 rm -f $DIR/$tdir/a3/f3
2667 rm -f $DIR/$tdir/a4/f4
2673 do_facet mds1 $LCTL set_param fail_loc=0
2674 if [ $MDSCOUNT -ge 2 ]; then
2675 do_facet mds2 $LCTL set_param fail_loc=0
2678 cancel_lru_locks mdc
2679 cancel_lru_locks osc
2681 echo "Inject failure, to simulate the OST0 fail to handle"
2682 echo "MDT0 LFSCK request during the first-stage scanning."
2683 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2684 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2686 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2687 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# First pass: every MDS is expected to end in "partial".
2689 for k in $(seq $MDSCOUNT); do
2690 # The LFSCK status query interval is 30 seconds. For the case
2691 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2692 # time to guarantee the status sync up.
2693 wait_update_facet mds${k} "$LCTL get_param -n \
2694 mdd.$(facet_svc mds${k}).lfsck_layout |
2695 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2696 error "(2) MDS${k} is not the expected 'partial'"
# ost1 is the OST hit by the injected failure; ost2 must be unaffected.
2699 wait_update_facet ost1 "$LCTL get_param -n \
2700 obdfilter.$(facet_svc ost1).lfsck_layout |
2701 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2702 error "(3) OST1 is not the expected 'partial'"
2705 wait_update_facet ost2 "$LCTL get_param -n \
2706 obdfilter.$(facet_svc ost2).lfsck_layout |
2707 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2708 error "(4) OST2 is not the expected 'completed'"
2711 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the partial pass only 1 orphan per MDS is expected to be repaired.
# The messages name the MDS as "mds1"/"mds2" (they previously printed the
# literal text "mds{1}"/"mds{2}").
2713 local repaired=$(do_facet mds1 $LCTL get_param -n \
2714 mdd.$(facet_svc mds1).lfsck_layout |
2715 awk '/^repaired_orphan/ { print $2 }')
2716 [ $repaired -eq 1 ] ||
2717 error "(5) Expect 1 fixed on mds1, but got: $repaired"
2719 if [ $MDSCOUNT -ge 2 ]; then
2720 repaired=$(do_facet mds2 $LCTL get_param -n \
2721 mdd.$(facet_svc mds2).lfsck_layout |
2722 awk '/^repaired_orphan/ { print $2 }')
2723 [ $repaired -eq 1 ] ||
2724 error "(6) Expect 1 fixed on mds2, but got: $repaired"
# Second pass without failure injection.
2727 echo "Trigger layout LFSCK on all devices again to cleanup"
2728 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2730 for k in $(seq $MDSCOUNT); do
2731 # The LFSCK status query interval is 30 seconds. For the case
2732 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2733 # time to guarantee the status sync up.
2734 wait_update_facet mds${k} "$LCTL get_param -n \
2735 mdd.$(facet_svc mds${k}).lfsck_layout |
2736 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2737 error "(8) MDS${k} is not the expected 'completed'"
# cur_status is declared local, as in the sibling tests, so it does not leak
# into the global scope.
2740 for k in $(seq $OSTCOUNT); do
2741 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2742 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2743 awk '/^status/ { print $2 }')
2744 [ "$cur_status" == "completed" ] ||
2745 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# After the clean pass 2 orphans per MDS are expected to be repaired.
# "repaired" was already declared local above, so it is only re-assigned.
2749 repaired=$(do_facet mds1 $LCTL get_param -n \
2750 mdd.$(facet_svc mds1).lfsck_layout |
2751 awk '/^repaired_orphan/ { print $2 }')
2752 [ $repaired -eq 2 ] ||
2753 error "(10) Expect 2 fixed on mds1, but got: $repaired"
2755 if [ $MDSCOUNT -ge 2 ]; then
2756 repaired=$(do_facet mds2 $LCTL get_param -n \
2757 mdd.$(facet_svc mds2).lfsck_layout |
2758 awk '/^repaired_orphan/ { print $2 }')
2759 [ $repaired -eq 2 ] ||
2760 error "(11) Expect 2 fixed on mds2, but got: $repaired"
2763 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Test 18g: the MDT-object is lost while its OI mapping is kept
# (fail_loc 0x162e).
# Flow: create a1/f1 striped with "-c -1" and write $OSTCOUNT MiB, remove it
# with 0x162e armed on mds1, run layout LFSCK with "-r -o", expect
# repaired_orphan == $OSTCOUNT on mds1, then move "<fid>-R-0" from
# .lustre/lost+found/MDT0000 back to a1/f1.
2767 echo "The target MDT-object is lost, but related OI mapping is there"
2768 echo "The LFSCK should recreate the lost MDT-object without affected"
2769 echo "by the stale OI mapping."
2772 check_mount_and_prep
2773 $LFS mkdir -i 0 $DIR/$tdir/a1
2774 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
# count=$OSTCOUNT with bs=1M and -S 1M: one 1M block per configured OST.
2775 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
# The FID is needed later to build the lost+found entry name.
2776 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2778 $LFS getstripe $DIR/$tdir/a1/f1
2779 cancel_lru_locks osc
# The rm runs while fail_loc 0x162e is armed on mds1.
2781 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2782 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2783 do_facet mds1 $LCTL set_param fail_loc=0x162e
2784 rm -f $DIR/$tdir/a1/f1
2786 do_facet mds1 $LCTL set_param fail_loc=0
2787 cancel_lru_locks mdc
2788 cancel_lru_locks osc
2790 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2791 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2793 for k in $(seq $MDSCOUNT); do
2794 # The LFSCK status query interval is 30 seconds. For the case
2795 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2796 # time to guarantee the status sync up.
2797 wait_update_facet mds${k} "$LCTL get_param -n \
2798 mdd.$(facet_svc mds${k}).lfsck_layout |
2799 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2800 error "(2) MDS${k} is not the expected 'completed'"
2803 for k in $(seq $OSTCOUNT); do
2804 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2805 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2806 awk '/^status/ { print $2 }')
2807 [ "$cur_status" == "completed" ] ||
2808 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# One repaired orphan is expected per OST.
2811 local repaired=$(do_facet mds1 $LCTL get_param -n \
2812 mdd.$(facet_svc mds1).lfsck_layout |
2813 awk '/^repaired_orphan/ { print $2 }')
2814 [ $repaired -eq $OSTCOUNT ] ||
2815 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
# The re-created object is looked up as "<fid>-R-0" in lost+found.
2817 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2818 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2819 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2821 $LFS path2fid $DIR/$tdir/a1/f1
2822 $LFS getstripe $DIR/$tdir/a1/f1
2824 run_test 18g "Find out orphan OST-object and repair it (7)"
# Test 18h: a PFL file whose extent range is damaged (fail_loc 0x162f).
# Flow: create the PFL file f0, fill it from test-framework.sh and keep a
# copy as "guard", chown f0 with 0x162f armed, verify that writing to f0 now
# fails, run layout LFSCK with "-r -o", expect repaired_orphan == 2, f0 to be
# identical to guard, and writes to f0 to succeed again.
2828 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
2829 echo "the layout LFSCK will keep the bad PFL file(s) there without"
2830 echo "scanning its OST-object(s). Then in the second stage scanning,"
2831 echo "the OST will return related OST-object(s) to the MDT as orphan."
2832 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
2833 echo "the 'orphan(s)' stripe information."
2836 check_mount_and_prep
2838 $LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
2839 error "(0) Fail to create PFL $DIR/$tdir/f0"
# Write the framework script at offset 0 and again at a 2 MiB offset
# (bs=1M seek=2).
2841 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
2842 error "(1.1) Fail to write $DIR/$tdir/f0"
2844 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
2845 error "(1.2) Fail to write $DIR/$tdir/f0"
# Reference copy used by the diff at the end.
2847 cp $DIR/$tdir/f0 $DIR/$tdir/guard
2849 echo "Inject failure stub to simulate bad PFL extent range"
2850 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
2851 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
2853 chown 1.1 $DIR/$tdir/f0
2855 cancel_lru_locks mdc
2856 cancel_lru_locks osc
2857 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# A successful dd here means the damage was not injected, so it is an error.
2859 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
2860 error "(2) Write to bad PFL file should fail"
2862 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
2863 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2865 for k in $(seq $MDSCOUNT); do
2866 # The LFSCK status query interval is 30 seconds. For the case
2867 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2868 # time to guarantee the status sync up.
2869 wait_update_facet mds${k} "$LCTL get_param -n \
2870 mdd.$(facet_svc mds${k}).lfsck_layout |
2871 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2872 error "(4.1) MDS${k} is not the expected 'completed'"
# cur_status is declared local, as in the sibling tests, so it does not leak
# into the global scope.
2875 for k in $(seq $OSTCOUNT); do
2876 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2877 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2878 awk '/^status/ { print $2 }')
2879 [ "$cur_status" == "completed" ] ||
2880 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): $SHOW_LAYOUT is defined outside this excerpt.
2884 local repaired=$($SHOW_LAYOUT |
2885 awk '/^repaired_orphan/ { print $2 }')
2886 [ $repaired -eq 2 ] ||
2887 error "(5) Fail to repair crashed PFL range: $repaired"
2889 echo "Data in $DIR/$tdir/f0 should not be broken"
2890 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
2891 error "(6) Data in $DIR/$tdir/f0 is broken"
2893 echo "Write should succeed after LFSCK repairing the bad PFL range"
2894 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
2895 error "(7) Write should succeed after LFSCK"
2897 run_test 18h "LFSCK can repair crashed PFL extent range"
# Remove "cache" from the debug setting again (debug=-cache) now that the
# 18x tests are done; the command's stdout is discarded.
2899 $LCTL set_param debug=-cache > /dev/null
# Test 19a: with lfsck_verify_pfid enabled on OST0000, reads must be refused
# while fail_loc 0x1619 is set on the local node.
# Flow: write a0, a1 (PFL file) and a2 with lfsck_verify_pfid=0, switch it to
# 1, set fail_loc=0x1619 locally, and require that "cat a0" and "cat a1" fail.
2902 check_mount_and_prep
2903 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# Verification is off while the files are created.
2905 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2906 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2908 echo "foo1" > $DIR/$tdir/a0
2909 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
2910 error "(0) Fail to create PFL $DIR/$tdir/a1"
2911 echo "foo2" > $DIR/$tdir/a1
2912 echo "guard" > $DIR/$tdir/a2
2913 cancel_lru_locks osc
2915 echo "Inject failure, then client will offer wrong parent FID when read"
2916 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2917 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
# Unlike the other injections, this fail_loc is set with a plain local $LCTL
# call (no do_facet/do_nodes).
2919 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2920 $LCTL set_param fail_loc=0x1619
# A successful cat means the read was not refused, so it is an error.
2922 echo "Read RPC with wrong parent FID should be denied"
2923 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
2924 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
2925 $LCTL set_param fail_loc=0
2927 run_test 19a "OST-object inconsistency self detect"
# Test 19b: OST-objects written while fail_loc 0x1611 is set on the OSTs must
# be repaired by the OST itself once lfsck_verify_pfid is enabled and the
# files are read.
# Flow: write f0 and f1 (PFL file) with lfsck_verify_pfid=0 and 0x1611 armed,
# expect the "repaired" counter to be 0, enable lfsck_verify_pfid, read both
# files, then expect the "repaired" counter to be 2 (one per file).
2930 check_mount_and_prep
2931 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2933 echo "Inject failure stub to make the OST-object to back point to"
2934 echo "non-exist MDT-object"
2936 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2937 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
# Both files are written while fail_loc 0x1611 is armed on the OST nodes.
2939 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2940 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
2941 echo "foo1" > $DIR/$tdir/f0
2942 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
2943 error "(0) Fail to create PFL $DIR/$tdir/f1"
2944 echo "foo2" > $DIR/$tdir/f1
2945 cancel_lru_locks osc
2946 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Baseline: with verification disabled the "repaired" line of
# lfsck_verify_pfid must read 0.
2948 do_facet ost1 $LCTL set_param -n \
2949 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2950 echo "Nothing should be fixed since self detect and repair is disabled"
2951 local repaired=$(do_facet ost1 $LCTL get_param -n \
2952 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2953 awk '/^repaired/ { print $2 }')
2954 [ $repaired -eq 0 ] ||
2955 error "(1) Expected 0 repaired, but got $repaired"
2957 echo "Read RPC with right parent FID should be accepted,"
2958 echo "and cause parent FID on OST to be fixed"
2960 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2961 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2963 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
2964 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
# Two files were read, so two repairs are expected; the message now states
# the same number the check uses (it previously said 1).
2966 repaired=$(do_facet ost1 $LCTL get_param -n \
2967 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2968 awk '/^repaired/ { print $2 }')
2969 [ $repaired -eq 2 ] ||
2970 error "(3) Expected 2 repaired, but got $repaired"
2972 run_test 19b "OST-object inconsistency self repair"
# Expected outputs of "$LFS getstripe -L" that the 20a/20b checks compare
# against: one for a recovered file whose layout has a LOV EA hole, one for
# a file without a hole.
2974 PATTERN_WITH_HOLE="40000001"
2975 PATTERN_WITHOUT_HOLE="raid0"
# Test 20a, setup: needs at least 2 OSTs. Creates directory a1 on MDT index 0
# with a 3-stripe (more than 2 OSTs) or 2-stripe default layout, writes the
# test files and records their FIDs.
2978 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2981 echo "The target MDT-object and some of its OST-object are lost."
2982 echo "The LFSCK should find out the left OST-objects and re-create"
2983 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2984 echo "with the partial OST-objects (LOV EA hole)."
2986 echo "New client can access the file with LOV EA hole via normal"
2987 echo "system tools or commands without crash the system."
2989 echo "For old client, even though it cannot access the file with"
2990 echo "LOV EA hole, it should not cause the system crash."
2993 check_mount_and_prep
2994 $LFS mkdir -i 0 $DIR/$tdir/a1
# NOTE(review): bcount is used below but its assignment is not visible in
# this excerpt; presumably it is set per branch here - confirm.
2995 if [ $OSTCOUNT -gt 2 ]; then
2996 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
2999 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
3003 # 256 blocks on the stripe0.
3004 # 1 block on the stripe1 for 2 OSTs case.
3005 # 256 blocks on the stripe1 for other cases.
3006 # 1 block on the stripe2 if OSTs > 2
3007 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
3008 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
3009 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# The FIDs are kept because the recovered files are later looked up as
# .lustre/lost+found/MDT0000/<fid>-R-0.
3011 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
3012 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
3013 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
3016 $LFS getstripe $DIR/$tdir/a1/f0
3018 $LFS getstripe $DIR/$tdir/a1/f1
3020 $LFS getstripe $DIR/$tdir/a1/f2
# A fourth file is only created when there are more than 2 OSTs.
3022 if [ $OSTCOUNT -gt 2 ]; then
3023 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
3024 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
3026 $LFS getstripe $DIR/$tdir/a1/f3
3029 cancel_lru_locks osc
# Test 20a, failure injection: each file is removed while a fail_loc is set
# on mds1; fail_val is changed between removals (1 for f2, 2 for f3) to pick
# which OST-object is treated as lost, per the echo messages below.
3031 echo "Inject failure..."
3032 echo "To simulate f0 lost MDT-object"
3033 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3034 do_facet mds1 $LCTL set_param fail_loc=0x1616
3035 rm -f $DIR/$tdir/a1/f0
3037 echo "To simulate f1 lost MDT-object and OST-object0"
3038 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3039 do_facet mds1 $LCTL set_param fail_loc=0x161a
3040 rm -f $DIR/$tdir/a1/f1
3042 echo "To simulate f2 lost MDT-object and OST-object1"
3043 do_facet mds1 $LCTL set_param fail_val=1
3044 rm -f $DIR/$tdir/a1/f2
3046 if [ $OSTCOUNT -gt 2 ]; then
3047 echo "To simulate f3 lost MDT-object and OST-object2"
3048 do_facet mds1 $LCTL set_param fail_val=2
3049 rm -f $DIR/$tdir/a1/f3
# Unmount the client and clear the injection before LFSCK is started.
3052 umount_client $MOUNT
3055 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# Test 20a, LFSCK run: start layout LFSCK with "-r -o", wait for every MDT to
# report status "completed", check every OST reports "completed", then check
# the repaired_orphan counter on mds1.
3057 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3058 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3060 for k in $(seq $MDSCOUNT); do
3061 # The LFSCK status query interval is 30 seconds. For the case
3062 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3063 # time to guarantee the status sync up.
3064 wait_update_facet mds${k} "$LCTL get_param -n \
3065 mdd.$(facet_svc mds${k}).lfsck_layout |
3066 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3067 error "(2) MDS${k} is not the expected 'completed'"
# OST status is read once, without waiting.
3070 for k in $(seq $OSTCOUNT); do
3071 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3072 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3073 awk '/^status/ { print $2 }')
3074 [ "$cur_status" == "completed" ] ||
3075 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected orphan repairs: 9 with more than 2 OSTs, otherwise 4.
3078 local repaired=$(do_facet mds1 $LCTL get_param -n \
3079 mdd.$(facet_svc mds1).lfsck_layout |
3080 awk '/^repaired_orphan/ { print $2 }')
3081 if [ $OSTCOUNT -gt 2 ]; then
3082 [ $repaired -eq 9 ] ||
3083 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3085 [ $repaired -eq 4 ] ||
3086 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
# Test 20a, f0 checks: remount the client and verify the recovered f0 under
# lost+found has no hole pattern, the full stripe count and size, and can be
# read, appended to, chowned, touched and unlinked.
3089 mount_client $MOUNT || error "(5.0) Fail to start client!"
# NOTE(review): LOV_PATTERN_F_HOLE is assigned here but not read in the
# visible part of the file.
3091 LOV_PATTERN_F_HOLE=0x40000000
3094 # ${fid0}-R-0 is the old f0
3096 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3097 echo "Check $name, which is the old f0"
3099 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
3101 local pattern=$($LFS getstripe -L $name)
3102 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3103 error "(5.2) NOT expect pattern flag hole, but got $pattern"
3105 local stripes=$($LFS getstripe -c $name)
3106 if [ $OSTCOUNT -gt 2 ]; then
3107 [ $stripes -eq 3 ] ||
3108 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3110 [ $stripes -eq 2 ] ||
3111 error "(5.3.2) expect the stripe count is 2, but got $stripes"
# Size is taken from the "Size:" field of stat output.
3114 local size=$(stat $name | awk '/Size:/ { print $2 }')
3115 [ $size -eq $((4096 * $bcount)) ] ||
3116 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
3118 cat $name > /dev/null || error "(5.5) cannot read $name"
3120 echo "dummy" >> $name || error "(5.6) cannot write $name"
3122 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3124 touch $name || error "(5.8) cannot touch $name"
3126 rm -f $name || error "(5.9) cannot unlink $name"
# Test 20a, f1 checks: the recovered f1 must report the hole pattern. A plain
# read, a write at offset 0 and an append must fail; a write at block 300,
# chown, touch and unlink must succeed.
3129 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3131 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3132 if [ $OSTCOUNT -gt 2 ]; then
3133 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3135 echo "Check $name, it contains the old f1's stripe1"
3138 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3140 pattern=$($LFS getstripe -L $name)
3141 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3142 error "(6.2) expect pattern flag hole, but got $pattern"
3144 stripes=$($LFS getstripe -c $name)
3145 if [ $OSTCOUNT -gt 2 ]; then
3146 [ $stripes -eq 3 ] ||
3147 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3149 [ $stripes -eq 2 ] ||
3150 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3153 size=$(stat $name | awk '/Size:/ { print $2 }')
3154 [ $size -eq $((4096 * $bcount)) ] ||
3155 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
3157 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# dd with conv=sync,noerror keeps going past read errors; the number of
# "Input/output error" lines in its output is counted.
3159 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3160 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3163 [ $failures -eq 256 ] ||
3164 error "(6.6) expect 256 IO failures, but get $failures"
3166 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3167 [ $size -eq $((4096 * $bcount)) ] ||
3168 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
3170 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3171 error "(6.8) write to the LOV EA hole should fail"
3173 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3174 error "(6.9) write to normal stripe should NOT fail"
3176 echo "foo" >> $name && error "(6.10) append write $name should fail"
3178 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3180 touch $name || error "(6.12) cannot touch $name"
3182 rm -f $name || error "(6.13) cannot unlink $name"
# Test 20a, f2 checks: the recovered f2 must report the hole pattern. The
# expected stripe count, size and the offset of the failing write differ
# between the "more than 2 OSTs" branch and the other branch.
3185 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3187 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3188 if [ $OSTCOUNT -gt 2 ]; then
3189 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3191 echo "Check $name, it contains the old f2's stripe0"
3194 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3196 pattern=$($LFS getstripe -L $name)
3197 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3198 error "(7.2) expect pattern flag hole, but got $pattern"
3200 stripes=$($LFS getstripe -c $name)
3201 size=$(stat $name | awk '/Size:/ { print $2 }')
# Branch for more than 2 OSTs: 3 stripes, full size, hole at block 300.
3202 if [ $OSTCOUNT -gt 2 ]; then
3203 [ $stripes -eq 3 ] ||
3204 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3206 [ $size -eq $((4096 * $bcount)) ] ||
3207 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3209 cat $name > /dev/null &&
3210 error "(7.5.1) normal read $name should fail"
3212 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3213 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3215 [ $failures -eq 256 ] ||
3216 error "(7.6) expect 256 IO failures, but get $failures"
3218 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3219 [ $size -eq $((4096 * $bcount)) ] ||
3220 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
3222 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3223 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3225 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3226 error "(7.8.1) write to normal stripe should NOT fail"
3228 echo "foo" >> $name &&
3229 error "(7.8.3) append write $name should fail"
3231 chown $RUNAS_ID:$RUNAS_GID $name ||
3232 error "(7.9.1) cannot chown on $name"
3234 touch $name || error "(7.10.1) cannot touch $name"
# Other branch (the separating keyword is not visible in this excerpt):
# 2 stripes, size of 256 blocks, failing write at block 256.
3236 [ $stripes -eq 2 ] ||
3237 error "(7.3.2) expect the stripe count is 2, but got $stripes"
3240 [ $size -eq $((4096 * (256 + 0))) ] ||
3241 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3243 cat $name > /dev/null &&
3244 error "(7.5.2) normal read $name should fail"
3246 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3247 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3248 [ $failures -eq 256 ] ||
3249 error "(7.6.2) expect 256 IO failures, but get $failures"
3252 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3253 [ $size -eq $((4096 * $bcount)) ] ||
3254 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
3256 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3257 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3259 chown $RUNAS_ID:$RUNAS_GID $name ||
3260 error "(7.9.2) cannot chown on $name"
3262 touch $name || error "(7.10.2) cannot touch $name"
3265 rm -f $name || error "(7.11) cannot unlink $name"
# Test 20a, f3 checks: only reached with more than 2 OSTs. The recovered f3
# must report the hole pattern and 3 stripes with a size of 512 blocks; a
# plain read and a write at block 512 must fail.
3267 [ $OSTCOUNT -le 2 ] && return
3270 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
3272 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3273 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3275 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3277 pattern=$($LFS getstripe -L $name)
3278 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3279 error "(8.2) expect pattern flag hole, but got $pattern"
3281 stripes=$($LFS getstripe -c $name)
3282 [ $stripes -eq 3 ] ||
3283 error "(8.3) expect the stripe count is 3, but got $stripes"
3285 size=$(stat $name | awk '/Size:/ { print $2 }')
3287 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3288 error "(8.4) expect the size $((4096 * 512)), but got $size"
3290 cat $name > /dev/null &&
3291 error "(8.5) normal read $name should fail"
# Count the read errors dd reports while copying past them.
3293 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3294 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3296 [ $failures -eq 256 ] ||
3297 error "(8.6) expect 256 IO failures, but get $failures"
3300 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3301 [ $size -eq $((4096 * $bcount)) ] ||
3302 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
3304 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3305 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3307 chown $RUNAS_ID:$RUNAS_GID $name ||
3308 error "(8.9) cannot chown on $name"
3310 touch $name || error "(8.10) cannot touch $name"
3312 rm -f $name || error "(8.11) cannot unlink $name"
3314 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# Test 20b, setup (PFL variant of 20a): needs at least 2 OSTs. Creates three
# PFL files, each with components [0, 2M) and [2M, EOF), both 2 stripes wide
# with 1M stripe size, writes 769 4KB blocks to each and records their FIDs.
3317 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3320 echo "The target MDT-object and some of its OST-object are lost."
3321 echo "The LFSCK should find out the left OST-objects and re-create"
3322 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3323 echo "with the partial OST-objects (LOV EA hole)."
3325 echo "New client can access the file with LOV EA hole via normal"
3326 echo "system tools or commands without crash the system - PFL case."
3329 check_mount_and_prep
3331 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3332 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3333 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3334 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3335 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3336 error "(2) Fail to create PFL file $DIR/$tdir/f2"
# 256 * 3 + 1 blocks of 4096 bytes: 3MB plus one block.
3338 local bcount=$((256 * 3 + 1))
3340 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3341 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3342 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
# The FIDs are used later to locate <fid>-R-0 under lost+found.
3344 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3345 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3346 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3349 $LFS getstripe $DIR/$tdir/f0
3351 $LFS getstripe $DIR/$tdir/f1
3353 $LFS getstripe $DIR/$tdir/f2
3355 cancel_lru_locks mdc
3356 cancel_lru_locks osc
# Test 20b, failure injection on $SINGLEMDS: fail_loc 0x1616 for f0, 0x161a
# for f1, then fail_val=1 for f2; finally both are reset to 0.
# NOTE(review): the commands that remove f0/f1/f2 between these settings are
# not visible in this excerpt - confirm against the full file.
3358 echo "Inject failure..."
3359 echo "To simulate f0 lost MDT-object"
3360 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3361 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3364 echo "To simulate the case of f1 lost MDT-object and "
3365 echo "the first OST-object in each PFL component"
3366 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3367 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3370 echo "To simulate the case of f2 lost MDT-object and "
3371 echo "the second OST-object in each PFL component"
3372 do_facet $SINGLEMDS $LCTL set_param fail_val=1
3377 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Test 20b, LFSCK run: start layout LFSCK with "-r -o", wait for every MDT to
# report "completed", check every OST reports "completed", then expect the
# repaired_orphan counter on mds1 to be 8.
3379 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3380 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3382 for k in $(seq $MDSCOUNT); do
3383 # The LFSCK status query interval is 30 seconds. For the case
3384 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3385 # time to guarantee the status sync up.
3386 wait_update_facet mds${k} "$LCTL get_param -n \
3387 mdd.$(facet_svc mds${k}).lfsck_layout |
3388 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3389 error "(4) MDS${k} is not the expected 'completed'"
# OST status is read once, without waiting.
3392 for k in $(seq $OSTCOUNT); do
3393 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3394 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3395 awk '/^status/ { print $2 }')
3396 [ "$cur_status" == "completed" ] ||
3397 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
3400 local repaired=$(do_facet mds1 $LCTL get_param -n \
3401 mdd.$(facet_svc mds1).lfsck_layout |
3402 awk '/^repaired_orphan/ { print $2 }')
3403 [ $repaired -eq 8 ] ||
3404 error "(6) Expect 8 fixed on mds1, but got: $repaired"
# Test 20b, f0 checks: the recovered f0 must have no hole pattern in either
# component (selected with -I1 / -I2), 2 stripes per component, extents
# [0, 2097152) and [2097152, EOF), the full size, and must be readable,
# appendable, chown-able, touchable and removable.
3407 # ${fid0}-R-0 is the old f0
3409 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3410 echo "Check $name, which is the old f0"
3412 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3414 local pattern=$($LFS getstripe -L -I1 $name)
3415 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3416 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3418 pattern=$($LFS getstripe -L -I2 $name)
3419 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3420 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3422 local stripes=$($LFS getstripe -c -I1 $name)
3423 [ $stripes -eq 2 ] ||
3424 error "(7.3.1) expect 2 stripes, but got $stripes"
3426 stripes=$($LFS getstripe -c -I2 $name)
3427 [ $stripes -eq 2 ] ||
3428 error "(7.3.2) expect 2 stripes, but got $stripes"
# Extent bounds are parsed from the lcme_extent.* lines of getstripe output.
3430 local e_start=$($LFS getstripe -I1 $name |
3431 awk '/lcme_extent.e_start:/ { print $2 }')
3432 [ $e_start -eq 0 ] ||
3433 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3435 local e_end=$($LFS getstripe -I1 $name |
3436 awk '/lcme_extent.e_end:/ { print $2 }')
3437 [ $e_end -eq 2097152 ] ||
3438 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3440 e_start=$($LFS getstripe -I2 $name |
3441 awk '/lcme_extent.e_start:/ { print $2 }')
3442 [ $e_start -eq 2097152 ] ||
3443 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
3445 e_end=$($LFS getstripe -I2 $name |
3446 awk '/lcme_extent.e_end:/ { print $2 }')
3447 [ "$e_end" = "EOF" ] ||
3448 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3450 local size=$(stat $name | awk '/Size:/ { print $2 }')
3451 [ $size -eq $((4096 * $bcount)) ] ||
3452 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
3454 cat $name > /dev/null || error "(7.7) cannot read $name"
3456 echo "dummy" >> $name || error "(7.8) cannot write $name"
3458 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3460 touch $name || error "(7.10) cannot touch $name"
3462 rm -f $name || error "(7.11) cannot unlink $name"
# Test 20b, f1 checks: both components of the recovered f1 must report the
# hole pattern, 2 stripes and the original extents. A plain read, a write at
# offset 0 and an append must fail; a write at block 300, chown, touch and
# unlink must succeed.
3465 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3467 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3468 echo "Check $name, it contains f1's second OST-object in each COMP"
3470 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3472 pattern=$($LFS getstripe -L -I1 $name)
3473 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3474 error "(8.2.1) expect pattern flag hole, but got $pattern"
3476 pattern=$($LFS getstripe -L -I2 $name)
3477 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3478 error "(8.2.2) expect pattern flag hole, but got $pattern"
# The COMP1 check is labelled (8.3.1) so it can be told apart from the COMP2
# check (8.3.2) when one of them fails.
3480 stripes=$($LFS getstripe -c -I1 $name)
3481 [ $stripes -eq 2 ] ||
3482 error "(8.3.1) expect 2 stripes, but got $stripes"
3484 stripes=$($LFS getstripe -c -I2 $name)
3485 [ $stripes -eq 2 ] ||
3486 error "(8.3.2) expect 2 stripes, but got $stripes"
3488 e_start=$($LFS getstripe -I1 $name |
3489 awk '/lcme_extent.e_start:/ { print $2 }')
3490 [ $e_start -eq 0 ] ||
3491 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3493 e_end=$($LFS getstripe -I1 $name |
3494 awk '/lcme_extent.e_end:/ { print $2 }')
3495 [ $e_end -eq 2097152 ] ||
3496 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3498 e_start=$($LFS getstripe -I2 $name |
3499 awk '/lcme_extent.e_start:/ { print $2 }')
3500 [ $e_start -eq 2097152 ] ||
3501 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3503 e_end=$($LFS getstripe -I2 $name |
3504 awk '/lcme_extent.e_end:/ { print $2 }')
3505 [ "$e_end" = "EOF" ] ||
3506 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3508 size=$(stat $name | awk '/Size:/ { print $2 }')
3509 [ $size -eq $((4096 * $bcount)) ] ||
3510 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
3512 cat $name > /dev/null && error "(8.7) normal read $name should fail"
# Count the "Input/output error" lines dd prints while copying past errors.
3514 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3515 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3517 # The first stripe in each COMP was lost
3518 [ $failures -eq 512 ] ||
3519 error "(8.8) expect 512 IO failures, but get $failures"
3521 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3522 [ $size -eq $((4096 * $bcount)) ] ||
3523 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
3525 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3526 error "(8.10) write to the LOV EA hole should fail"
3528 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3529 error "(8.11) write to normal stripe should NOT fail"
3531 echo "foo" >> $name && error "(8.12) append write $name should fail"
3533 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3535 touch $name || error "(8.14) cannot touch $name"
3537 rm -f $name || error "(8.15) cannot unlink $name"
# Test 20b, f2 checks: both components of the recovered f2 must report the
# hole pattern, 2 stripes and the original extents. A plain read, a write at
# block 300 and an append must fail; a write at offset 0, chown, touch and
# unlink must succeed.
3540 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3542 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3543 echo "Check $name, it contains f2's first stripe in each COMP"
3545 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3547 pattern=$($LFS getstripe -L -I1 $name)
3548 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3549 error "(9.2.1) expect pattern flag hole, but got $pattern"
3551 pattern=$($LFS getstripe -L -I2 $name)
3552 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3553 error "(9.2.2) expect pattern flag hole, but got $pattern"
# The COMP1 check is labelled (9.3.1) so it can be told apart from the COMP2
# check (9.3.2).
3555 stripes=$($LFS getstripe -c -I1 $name)
3556 [ $stripes -eq 2 ] ||
3557 error "(9.3.1) expect 2 stripes, but got $stripes"
3559 stripes=$($LFS getstripe -c -I2 $name)
3560 [ $stripes -eq 2 ] ||
3561 error "(9.3.2) expect 2 stripes, but got $stripes"
3563 e_start=$($LFS getstripe -I1 $name |
3564 awk '/lcme_extent.e_start:/ { print $2 }')
3565 [ $e_start -eq 0 ] ||
3566 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3568 e_end=$($LFS getstripe -I1 $name |
3569 awk '/lcme_extent.e_end:/ { print $2 }')
3570 [ $e_end -eq 2097152 ] ||
3571 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3573 e_start=$($LFS getstripe -I2 $name |
3574 awk '/lcme_extent.e_start:/ { print $2 }')
3575 [ $e_start -eq 2097152 ] ||
3576 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3578 e_end=$($LFS getstripe -I2 $name |
3579 awk '/lcme_extent.e_end:/ { print $2 }')
3580 [ "$e_end" = "EOF" ] ||
3581 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3583 size=$(stat $name | awk '/Size:/ { print $2 }')
3584 # The second stripe in COMP was lost, so we do not know there
3585 # have ever been some data before. 'stat' will regard it as
3586 # no data on the lost stripe.
# NOTE(review): the check below still expects the full written size
# (4096 * bcount); confirm whether the comment above is still accurate.
3588 [ $size -eq $((4096 * $bcount)) ] ||
3589 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3591 cat $name > /dev/null &&
3592 error "(9.7) normal read $name should fail"
# The error text now states 512, the value the check actually tests.
3594 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3595 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3596 [ $failures -eq 512 ] ||
3597 error "(9.8) expect 512 IO failures, but get $failures"
3599 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3600 # The second stripe in COMP was lost, so we do not know there
3601 # have ever been some data before. Since 'dd' skip failure,
3602 # it will regard the lost stripe contains data.
3604 [ $size -eq $((4096 * $bcount)) ] ||
3605 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
3607 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3608 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3610 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3611 error "(9.11) write to normal stripe should NOT fail"
3613 echo "foo" >> $name &&
3614 error "(9.12) append write $name should fail"
3616 chown $RUNAS_ID:$RUNAS_GID $name ||
3617 error "(9.13) cannot chown on $name"
3619 touch $name || error "(9.14) cannot touch $name"
# Label corrected from (7.15) to (9.15) to match the rest of this section.
3621 rm -f $name || error "(9.15) cannot unlink $name"
3623 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# Test 21: start LFSCK on MDT0000 without a -t type (with "-s 1 -r"), check
# that both the namespace and the layout component report status
# "scanning-phase1", then stop LFSCK. Skipped when the MDS is older than
# 2.5.59.
3626 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
3627 skip "ignore the test if MDS is older than 2.5.59" && return
3629 check_mount_and_prep
3630 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
3632 echo "Start all LFSCK components by default (-s 1)"
3633 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3634 error "Fail to start LFSCK"
3636 echo "namespace LFSCK should be in 'scanning-phase1' status"
# The status is the second field of the line starting with "status".
3637 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3638 [ "$STATUS" == "scanning-phase1" ] ||
3639 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3641 echo "layout LFSCK should be in 'scanning-phase1' status"
3642 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3643 [ "$STATUS" == "scanning-phase1" ] ||
3644 error "Expect layout 'scanning-phase1', but got '$STATUS'"
3646 echo "Stop all LFSCK components by default"
3647 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3648 error "Fail to stop LFSCK"
3650 run_test 21 "run all LFSCK components by default"
# Test 22a: needs at least 2 MDTs. Creates foo/dummy on MDT0 while fail_loc
# 0x161e is set, removes the guard directory, runs namespace LFSCK with
# "-A -r" and expects unmatched_pairs_repaired to be 1 and "ls" on the dummy
# directory to succeed afterwards.
3653 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# The inner quotes around .. are escaped so they are printed; the unescaped
# form closed and reopened the string and the quotes were lost.
3656 echo "The parent_A references the child directory via some name entry,"
3657 echo "but the child directory back references another parent_B via its"
3658 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
3659 echo "LFSCK will repair the child directory's \"..\" name entry."
3662 check_mount_and_prep
3664 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3665 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3667 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3668 echo "The dummy's dotdot name entry references the guard."
3669 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3670 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3671 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3672 error "(3) Fail to mkdir on MDT0"
3673 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3675 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3677 echo "Trigger namespace LFSCK to repair unmatched pairs"
3678 $START_NAMESPACE -A -r ||
3679 error "(5) Fail to start LFSCK for namespace"
3681 wait_all_targets_blocked namespace completed 6
3683 local repaired=$($SHOW_NAMESPACE |
3684 awk '/^unmatched_pairs_repaired/ { print $2 }')
3685 [ $repaired -eq 1 ] ||
3686 error "(7) Fail to repair unmatched pairs: $repaired"
3688 echo "'ls' should success after namespace LFSCK repairing"
3689 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3690 error "(8) ls should success."
3692 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Test 22b: needs at least 2 MDTs. Creates foo/dummy on MDT0 while fail_loc
# 0x161e is set, checks that fid2path does not return the dummy's path, runs
# namespace LFSCK with "-A -r", expects unmatched_pairs_repaired to be 1 and
# fid2path to return the dummy's path afterwards.
3695 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# The inner quotes around .. are escaped so they are printed; the unescaped
# form closed and reopened the string and the quotes were lost.
3698 echo "The parent_A references the child directory via the name entry_B,"
3699 echo "but the child directory back references another parent_C via its"
3700 echo "\"..\" name entry. The parent_C exists, but there is no the name"
3701 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
3702 echo "the child directory's \"..\" name entry and its linkEA."
3705 check_mount_and_prep
3707 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3708 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3710 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3711 echo "and bad linkEA. The dummy's dotdot name entry references the"
3712 echo "guard. The dummy's linkEA references n non-exist name entry."
3713 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3714 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3715 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3716 error "(3) Fail to mkdir on MDT0"
3717 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3719 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3720 echo "fid2path should NOT work on the dummy's FID $dummyfid"
3721 local dummyname=$($LFS fid2path $DIR $dummyfid)
3722 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3723 error "(4) fid2path works unexpectedly."
3725 echo "Trigger namespace LFSCK to repair unmatched pairs"
3726 $START_NAMESPACE -A -r ||
3727 error "(5) Fail to start LFSCK for namespace"
3729 wait_all_targets_blocked namespace completed 6
3731 local repaired=$($SHOW_NAMESPACE |
3732 awk '/^unmatched_pairs_repaired/ { print $2 }')
3733 [ $repaired -eq 1 ] ||
3734 error "(7) Fail to repair unmatched pairs: $repaired"
3736 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
# dummyname is already declared local above, so it is only reassigned here.
3737 dummyname=$($LFS fid2path $DIR $dummyfid)
3738 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3739 error "(8) fid2path does not work"
3741 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a: needs at least 2 MDTs. Removes d0/d1 (created on MDT1) while
# fail_loc 0x1620 is set on mds2 so that "ls" on it fails, then runs
# namespace LFSCK twice: first with "-A -r" (ls must still fail), then with
# "-A -r -C" (ls must succeed). dangling_repaired must be 1 after each run.
3744 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3747 echo "The name entry is there, but the MDT-object for such name "
3748 echo "entry does not exist. The namespace LFSCK should find out "
3749 echo "and repair the inconsistency as required."
3752 check_mount_and_prep
3754 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3755 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3757 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3758 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
3759 do_facet mds2 $LCTL set_param fail_loc=0x1620
3760 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3761 do_facet mds2 $LCTL set_param fail_loc=0
3763 echo "'ls' should fail because of dangling name entry"
3764 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3766 echo "Trigger namespace LFSCK to find out dangling name entry"
3767 $START_NAMESPACE -A -r ||
3768 error "(5) Fail to start LFSCK for namespace"
3770 wait_all_targets_blocked namespace completed 6
3772 local repaired=$($SHOW_NAMESPACE |
3773 awk '/^dangling_repaired/ { print $2 }')
3774 [ $repaired -eq 1 ] ||
3775 error "(7) Fail to repair dangling name entry: $repaired"
3777 echo "'ls' should fail because not re-create MDT-object by default"
3778 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
# Second run adds -C; after it the entry is expected to be usable again.
3780 echo "Trigger namespace LFSCK again to repair dangling name entry"
3781 $START_NAMESPACE -A -r -C ||
3782 error "(9) Fail to start LFSCK for namespace"
3784 wait_all_targets_blocked namespace completed 10
3786 repaired=$($SHOW_NAMESPACE |
3787 awk '/^dangling_repaired/ { print $2 }')
3788 [ $repaired -eq 1 ] ||
3789 error "(11) Fail to repair dangling name entry: $repaired"
3791 echo "'ls' should success after namespace LFSCK repairing"
3792 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3794 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b, setup and injection: creates d0 with ten filler files plus f0 and
# f1, hard-links f0 as "foo" while fail_loc 0x1621 is set with fail_val set
# to f1's OID, then removes the fillers and f1.
3798 echo "The objectA has multiple hard links, one of them corresponding"
3799 echo "to the name entry_B. But there is something wrong for the name"
3800 echo "entry_B and cause entry_B to references non-exist object_C."
3801 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3802 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3803 echo "comes to the second-stage scanning, it will find that the"
3804 echo "former re-creating object_C is not proper, and will try to"
3805 echo "replace the object_C with the real object_A."
3808 check_mount_and_prep
3810 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3811 $LFS path2fid $DIR/$tdir/d0
3813 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3815 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3816 $LFS path2fid $DIR/$tdir/d0/f0
3818 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3819 $LFS path2fid $DIR/$tdir/d0/f1
# The first ':'-separated field of each FID is compared as the sequence.
3821 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3822 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
3824 if [ "$SEQ0" != "$SEQ1" ]; then
3825 # To guarantee that the f0 and f1 are in the same FID seq
3826 rm -f $DIR/$tdir/d0/f0 ||
3827 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3828 echo "dummy" > $DIR/$tdir/d0/f0 ||
3829 error "(3.2) Fail to touch on MDT0"
3830 $LFS path2fid $DIR/$tdir/d0/f0
# The second FID field (OID) of f1 is converted to decimal with printf %d
# before being used as fail_val.
3833 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3834 OID=$(printf %d $OID)
3836 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3837 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3838 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3839 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3840 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3842 # If there is creation after the dangling injection, it may re-use
3843 # the just released local object (inode) that is referenced by the
3844 # dangling name entry. It will fail the dangling injection.
3845 # So before deleting the target object for the dangling name entry,
3846 # remove some other objects to avoid the target object being reused
3847 # by some potential creations. LU-7429
3848 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3850 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3852 echo "'ls' should fail because of dangling name entry"
3853 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3854 error "(6) ls should fail."
3856 echo "Trigger namespace LFSCK to find out dangling name entry"
3857 $START_NAMESPACE -r -C ||
3858 error "(7) Fail to start LFSCK for namespace"
3860 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3861 mdd.${MDT_DEV}.lfsck_namespace |
3862 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3864 error "(8) unexpected status"
3867 local repaired=$($SHOW_NAMESPACE |
3868 awk '/^dangling_repaired/ { print $2 }')
3869 [ $repaired -eq 1 ] ||
3870 error "(9) Fail to repair dangling name entry: $repaired"
3872 repaired=$($SHOW_NAMESPACE |
3873 awk '/^multiple_linked_repaired/ { print $2 }')
3874 [ $repaired -eq 1 ] ||
3875 error "(10) Fail to drop the former created object: $repaired"
3877 local data=$(cat $DIR/$tdir/d0/foo)
3878 [ "$data" == "dummy" ] ||
3879 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3881 run_test 23b "LFSCK can repair dangling name entry (2)"
# test 23c: inject a dangling name entry (foo) on MDT0, start namespace LFSCK
# while fail_loc 0x1602 (OBD_FAIL_LFSCK_DELAY3) is set, append to foo once it
# shows up with size 0, then expect dangling_repaired == 1 and foo's content
# to differ from "dummy".
# NOTE(review): the function header, the closing braces of the `|| {` groups
# and the `fi` of the `if` below are not visible in this excerpt - confirm
# against the full file.
3885 echo "The objectA has multiple hard links, one of them corresponding"
3886 echo "to the name entry_B. But there is something wrong for the name"
3887 echo "entry_B and cause entry_B to references non-exist object_C."
3888 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3889 echo "as dangling, and re-create the lost object_C. And then others"
3890 echo "modified the re-created object_C. When the LFSCK comes to the"
3891 echo "second-stage scanning, it will find that the former re-creating"
3892 echo "object_C maybe wrong and try to replace the object_C with the"
3893 echo "real object_A. But because object_C has been modified, so the"
3894 echo "LFSCK cannot replace it."
3897 start_full_debug_logging
3899 check_mount_and_prep
3901 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3902 $LFS path2fid $DIR/$tdir/d0
3904 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3906 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3907 $LFS path2fid $DIR/$tdir/d0/f0
3909 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3910 $LFS path2fid $DIR/$tdir/d0/f1
# SEQ0/SEQ1 are the first ':'-separated field of each file's FID.
3912 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3913 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
3915 if [ "$SEQ0" != "$SEQ1" ]; then
3916 # To guarantee that the f0 and f1 are in the same FID seq
3917 rm -f $DIR/$tdir/d0/f0 ||
3918 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3919 echo "dummy" > $DIR/$tdir/d0/f0 ||
3920 error "(3.2) Fail to touch on MDT0"
3921 $LFS path2fid $DIR/$tdir/d0/f0
# OID is the second ':'-separated FID field of f1, reformatted with
# printf %d so it can be passed as fail_val.
3924 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3925 OID=$(printf %d $OID)
3927 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3928 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3929 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3930 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3931 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3933 # If there is creation after the dangling injection, it may re-use
3934 # the just released local object (inode) that is referenced by the
3935 # dangling name entry. It will fail the dangling injection.
3936 # So before deleting the target object for the dangling name entry,
3937 # remove some other objects to avoid the target object being reused
3938 # by some potential creations. LU-7429
3939 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3941 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3943 echo "'ls' should fail because of dangling name entry"
3944 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3945 error "(6) ls should fail."
# This fail_loc is set before LFSCK starts and is cleared again below,
# after foo has been appended to.
3947 #define OBD_FAIL_LFSCK_DELAY3 0x1602
3948 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3950 echo "Trigger namespace LFSCK to find out dangling name entry"
3951 $START_NAMESPACE -r -C ||
3952 error "(7) Fail to start LFSCK for namespace"
# Wait until stat reports "0" in the Size field for foo.
3954 wait_update_facet client "stat $DIR/$tdir/d0/foo |
3955 awk '/Size/ { print \\\$2 }'" "0" $LTIME || {
3956 stat $DIR/$tdir/d0/foo
3958 error "(8) unexpected size"
# Modify the object that now backs foo.
3961 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
3962 cancel_lru_locks osc
3964 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3965 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3966 mdd.${MDT_DEV}.lfsck_namespace |
3967 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3969 error "(10) unexpected status"
3972 stop_full_debug_logging
3974 local repaired=$($SHOW_NAMESPACE |
3975 awk '/^dangling_repaired/ { print $2 }')
3976 [ $repaired -eq 1 ] ||
3977 error "(11) Fail to repair dangling name entry: $repaired"
# foo must not read back as "dummy" (the content written to f0).
3979 local data=$(cat $DIR/$tdir/d0/foo)
3980 [ "$data" != "dummy" ] ||
3981 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
3983 run_test 23c "LFSCK can repair dangling name entry (3)"
# test 24: create a directory d0/dummy/foo under fail_loc 0x1622
# (OBD_FAIL_LFSCK_MUL_REF), remove its name entry, run namespace LFSCK on all
# targets and expect multiple_referenced_repaired == 1 plus an entry named
# "<cfid>-<pfid>-D-0" under .lustre/lost+found/MDT0000/.
# Skipped when MDSCOUNT is below 2.
# NOTE(review): the function header and the else/fi lines of the `if` below
# are not visible in this excerpt, so the two pfid assignments appear back to
# back here - confirm against the full file.
3986 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3989 echo "Two MDT-objects back reference the same name entry via their"
3990 echo "each own linkEA entry, but the name entry only references one"
3991 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
3992 echo "for the MDT-object that is not recognized. If such MDT-object"
3993 echo "has no other linkEA entry after the removing, then the LFSCK"
3994 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
3997 check_mount_and_prep
3999 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
# Tag "(1.1)" keeps this failure distinguishable from the d0 mkdir above.
4001 mkdir $DIR/$tdir/d0/guard || error "(1.1) Fail to mkdir guard"
4002 $LFS path2fid $DIR/$tdir/d0/guard
4004 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
4005 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid is the parent FID used to build the expected orphan name below.
4008 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
4009 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
4011 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
4014 touch $DIR/$tdir/d0/guard/foo ||
4015 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
4017 echo "Inject failure stub on MDT0 to simulate the case that"
4018 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
4019 echo "that references $DIR/$tdir/d0/guard/foo."
4020 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
4021 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
4022 echo "there with the same linkEA entry as another MDT-object"
4023 echo "$DIR/$tdir/d0/guard/foo has"
4025 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
4026 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
4027 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
4028 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
4029 $LFS path2fid $DIR/$tdir/d0/dummy/foo
# cfid is captured before the rmdir so it can be looked up in lost+found.
4030 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
4031 rmdir $DIR/$tdir/d0/dummy/foo ||
4032 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
4033 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4035 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4036 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4037 error "(6) stat successfully unexpectedly"
4039 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4040 $START_NAMESPACE -A -r ||
4041 error "(7) Fail to start LFSCK for namespace"
4043 wait_all_targets_blocked namespace completed 8
4045 local repaired=$($SHOW_NAMESPACE |
4046 awk '/^multiple_referenced_repaired/ { print $2 }')
4047 [ $repaired -eq 1 ] ||
4048 error "(9) Fail to repair multiple referenced name entry: $repaired"
4050 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4051 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4052 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4054 local cname="$cfid-$pfid-D-0"
4055 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4056 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4058 run_test 24 "LFSCK can repair multiple-referenced name entry"
# test 25: create d0/foo while fail_loc 0x1623 (OBD_FAIL_LFSCK_BAD_TYPE) is
# set, run namespace LFSCK and expect bad_file_type_repaired == 1 and a
# successful `ls` of d0. Skipped unless the MDS backend is ldiskfs.
# NOTE(review): the function header and the closing brace of the `|| {`
# group are not visible in this excerpt.
4061 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4062 skip "ldiskfs only test" && return
4065 echo "The file type in the name entry does not match the file type"
4066 echo "claimed by the referenced object. Then the LFSCK will update"
4067 echo "the file type in the name entry."
4070 check_mount_and_prep
4072 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4074 echo "Inject failure stub on MDT0 to simulate the case that"
4075 echo "the file type stored in the name entry is wrong."
4077 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
4078 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4079 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4080 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4082 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4083 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Poll the lfsck_namespace status on the MDS until it reads "completed".
4085 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4086 mdd.${MDT_DEV}.lfsck_namespace |
4087 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4089 error "(4) unexpected status"
4092 local repaired=$($SHOW_NAMESPACE |
4093 awk '/^bad_file_type_repaired/ { print $2 }')
4094 [ $repaired -eq 1 ] ||
4095 error "(5) Fail to repair bad file type in name entry: $repaired"
4097 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4099 run_test 25 "LFSCK can repair bad file type in the name entry"
# test 26a: unlink d0/foo while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY)
# is set, run namespace LFSCK on all targets and expect
# lost_dirent_repaired == 1, foo visible again, and foo's FID unchanged.
# NOTE(review): the function header is not visible in this excerpt.
4103 echo "The local name entry back referenced by the MDT-object is lost."
4104 echo "The namespace LFSCK will add the missing local name entry back"
4105 echo "to the normal namespace."
4108 check_mount_and_prep
4110 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4111 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# Remember foo's FID so it can be compared after the repair.
4112 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4114 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4115 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4117 echo "Inject failure stub on MDT0 to simulate the case that"
4118 echo "foo's name entry will be removed, but the foo's object"
4119 echo "and its linkEA are kept in the system."
4121 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4122 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4123 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4124 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4126 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4127 error "(5) 'ls' should fail"
# This test covers the local name entry (d0 and foo are both created on
# MDT index 0), so the progress message says "local".
4129 echo "Trigger namespace LFSCK to repair the missing local name entry"
4130 $START_NAMESPACE -r -A ||
4131 error "(6) Fail to start LFSCK for namespace"
4133 wait_all_targets_blocked namespace completed 7
4135 local repaired=$($SHOW_NAMESPACE |
4136 awk '/^lost_dirent_repaired/ { print $2 }')
4137 [ $repaired -eq 1 ] ||
4138 error "(8) Fail to repair lost dirent: $repaired"
4140 ls -ail $DIR/$tdir/d0/foo ||
4141 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
4143 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4144 [ "$foofid" == "$foofid2" ] ||
4145 error "(10) foo's FID changed: $foofid, $foofid2"
4147 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# test 26b: d0 is created with `-i 1` and d0/foo with `-i 0`; foo is removed
# while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set, then namespace
# LFSCK on all targets is expected to report lost_dirent_repaired == 1 and to
# leave foo reachable with the same FID. Skipped when MDSCOUNT is below 2.
# NOTE(review): the function header is not visible in this excerpt.
4150 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4153 echo "The remote name entry back referenced by the MDT-object is lost."
4154 echo "The namespace LFSCK will add the missing remote name entry back"
4155 echo "to the normal namespace."
4158 check_mount_and_prep
4160 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4161 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Remember foo's FID so it can be compared after the repair.
4162 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4164 echo "Inject failure stub on MDT0 to simulate the case that"
4165 echo "foo's name entry will be removed, but the foo's object"
4166 echo "and its linkEA are kept in the system."
4168 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4169 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4170 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4171 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4173 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4174 error "(4) 'ls' should fail"
4176 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4177 $START_NAMESPACE -r -A ||
4178 error "(5) Fail to start LFSCK for namespace"
4180 wait_all_targets_blocked namespace completed 6
4182 local repaired=$($SHOW_NAMESPACE |
4183 awk '/^lost_dirent_repaired/ { print $2 }')
4184 [ $repaired -eq 1 ] ||
4185 error "(7) Fail to repair lost dirent: $repaired"
4187 ls -ail $DIR/$tdir/d0/foo ||
4188 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
4190 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4191 [ "$foofid" == "$foofid2" ] ||
4192 error "(9) foo's FID changed: $foofid, $foofid2"
4194 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# test 27a: remove foo and its hard link while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set, remove the parent d0, run namespace
# LFSCK on all targets and expect lost_dirent_repaired == 1 plus a "*-P-*"
# entry under .lustre/lost+found/MDT0000/.
# NOTE(review): the function header is not visible in this excerpt.
4198 echo "The local parent referenced by the MDT-object linkEA is lost."
4199 echo "The namespace LFSCK will re-create the lost parent as orphan."
4202 check_mount_and_prep
4204 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4205 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4206 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4207 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4209 echo "Inject failure stub on MDT0 to simulate the case that"
4210 echo "foo's name entry will be removed, but the foo's object"
4211 echo "and its linkEA are kept in the system. And then remove"
4212 echo "another hard link and the parent directory."
4214 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4215 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4216 rm -f $DIR/$tdir/d0/foo ||
4217 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4218 rm -f $DIR/$tdir/d0/dummy ||
4219 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4220 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Tags "(5.1)" and "(5.2)" keep these two failures distinct from the
# "(5)" above and the "(6)" below.
4222 rm -rf $DIR/$tdir/d0 || error "(5.1) Fail to unlink the dir d0"
4223 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5.2) 'ls' should fail"
4225 echo "Trigger namespace LFSCK to repair the lost parent"
4226 $START_NAMESPACE -r -A ||
4227 error "(6) Fail to start LFSCK for namespace"
4229 wait_all_targets_blocked namespace completed 7
4231 local repaired=$($SHOW_NAMESPACE |
4232 awk '/^lost_dirent_repaired/ { print $2 }')
4233 [ $repaired -eq 1 ] ||
4234 error "(8) Fail to repair lost dirent: $repaired"
4236 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4237 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4238 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4240 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so the shell passes it to find unexpanded instead of
# globbing it against the current working directory.
4242 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-P-*")
4243 [ ! -z "$cname" ] ||
4244 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4246 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# test 27b: d0 is created with `-i 1` and d0/foo with `-i 0`; foo is removed
# while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set, then d0 is
# removed. Namespace LFSCK on all targets is expected to report
# lost_dirent_repaired == 1 and leave a "*-P-*" entry under
# .lustre/lost+found/MDT0001/. Skipped when MDSCOUNT is below 2.
# NOTE(review): the function header is not visible in this excerpt.
4249 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4252 echo "The remote parent referenced by the MDT-object linkEA is lost."
4253 echo "The namespace LFSCK will re-create the lost parent as orphan."
4256 check_mount_and_prep
4258 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4259 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
4261 $LFS path2fid $DIR/$tdir/d0
4263 echo "Inject failure stub on MDT0 to simulate the case that"
4264 echo "foo's name entry will be removed, but the foo's object"
4265 echo "and its linkEA are kept in the system. And then remove"
4266 echo "the parent directory."
4268 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4269 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4270 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4271 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4273 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4274 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
4276 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4277 $START_NAMESPACE -r -A ||
4278 error "(6) Fail to start LFSCK for namespace"
4280 wait_all_targets_blocked namespace completed 7
4282 local repaired=$($SHOW_NAMESPACE |
4283 awk '/^lost_dirent_repaired/ { print $2 }')
4284 [ $repaired -eq 1 ] ||
4285 error "(8) Fail to repair lost dirent: $repaired"
4287 ls -ail $MOUNT/.lustre/lost+found/
4289 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4290 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4291 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4293 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The pattern is quoted so the shell passes it to find unexpanded instead of
# globbing it against the current working directory.
4295 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-P-*")
4296 [ ! -z "$cname" ] ||
4297 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4299 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# test 28: remove d1/a1 and d2/a2 while fail_loc 0x1624 is set on mds1 and
# mds2, then run namespace LFSCK twice. The first run happens with fail_loc
# 0x161c (OBD_FAIL_LFSCK_BAD_NETWORK) set on mds2 and expects status
# "partial" on mds1 / "completed" on mds2 with lost_dirent_repaired 0 / 1.
# The second run expects lost_dirent_repaired 1 on mds1 and 0 on mds2.
# Skipped when MDSCOUNT is below 2.
# NOTE(review): the function header, the closing braces of the `|| {` groups
# and the end of the introductory echo text are not visible in this excerpt.
4302 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4305 echo "The target name entry is lost. The LFSCK should insert the"
4306 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4307 echo "the MDT (on which the orphan MDT-object resides) has ever"
4308 echo "failed to respond some name entry verification during the"
4309 echo "first stage-scanning, then the LFSCK should skip to handle"
4310 echo "orphan MDT-object on this MDT. But other MDTs should not"
4314 check_mount_and_prep
# Build two cross-MDT trees: d1 uses index 0 with children on index 1, and
# d2 uses index 1 with children on index 0.
4315 $LFS mkdir -i 0 $DIR/$tdir/d1
4316 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4317 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
4319 $LFS mkdir -i 1 $DIR/$tdir/d2
4320 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4321 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4323 echo "Inject failure stub on MDT0 to simulate the case that"
4324 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4325 echo "and its linkEA are kept in the system. And the case that"
4326 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4327 echo "and its linkEA are kept in the system."
4329 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4330 do_facet mds1 $LCTL set_param fail_loc=0x1624
4331 do_facet mds2 $LCTL set_param fail_loc=0x1624
4332 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4333 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4334 do_facet mds1 $LCTL set_param fail_loc=0
4335 do_facet mds2 $LCTL set_param fail_loc=0
4337 cancel_lru_locks mdc
4338 cancel_lru_locks osc
4340 echo "Inject failure, to simulate the MDT0 fail to handle"
4341 echo "MDT1 LFSCK request during the first-stage scanning."
4342 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
4343 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4345 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4346 $START_NAMESPACE -r -A ||
4347 error "(3) Fail to start LFSCK for namespace"
# First run: mds1 is expected to end as "partial", mds2 as "completed".
4349 wait_update_facet mds1 "$LCTL get_param -n \
4350 mdd.$(facet_svc mds1).lfsck_namespace |
4351 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4352 error "(4) mds1 is not the expected 'partial'"
4355 wait_update_facet mds2 "$LCTL get_param -n \
4356 mdd.$(facet_svc mds2).lfsck_namespace |
4357 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4358 error "(5) mds2 is not the expected 'completed'"
4361 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
4363 local repaired=$(do_facet mds1 $LCTL get_param -n \
4364 mdd.$(facet_svc mds1).lfsck_namespace |
4365 awk '/^lost_dirent_repaired/ { print $2 }')
4366 [ $repaired -eq 0 ] ||
4367 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4369 repaired=$(do_facet mds2 $LCTL get_param -n \
4370 mdd.$(facet_svc mds2).lfsck_namespace |
4371 awk '/^lost_dirent_repaired/ { print $2 }')
4372 [ $repaired -eq 1 ] ||
4373 error "(7) Expect 1 fixed on mds2, but got: $repaired"
# Second run, with the fail_loc on mds2 cleared: the expected counts swap.
4375 echo "Trigger namespace LFSCK on all devices again to cleanup"
4376 $START_NAMESPACE -r -A ||
4377 error "(8) Fail to start LFSCK for namespace"
4379 wait_all_targets_blocked namespace completed 9
4381 local repaired=$(do_facet mds1 $LCTL get_param -n \
4382 mdd.$(facet_svc mds1).lfsck_namespace |
4383 awk '/^lost_dirent_repaired/ { print $2 }')
4384 [ $repaired -eq 1 ] ||
4385 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4387 repaired=$(do_facet mds2 $LCTL get_param -n \
4388 mdd.$(facet_svc mds2).lfsck_namespace |
4389 awk '/^lost_dirent_repaired/ { print $2 }')
4390 [ $repaired -eq 0 ] ||
4391 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4393 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# test 29a (registration is commented out at the end of this block): hard
# link foo while fail_loc 0x1625 (OBD_FAIL_LFSCK_MORE_NLINK) is set, expect
# stat to report 3 links, run namespace LFSCK on all targets and expect
# nlinks_repaired == 1 and a link count of 2.
# NOTE(review): the function header is not visible in this excerpt.
4397 echo "The object's nlink attribute is larger than the object's known"
4398 echo "name entries count. The LFSCK will repair the object's nlink"
4399 echo "attribute to match the known name entries count"
4402 check_mount_and_prep
4404 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4405 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4407 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4408 echo "nlink attribute is larger than its name entries count."
4410 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
4411 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4412 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4413 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4414 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Check that the injection took effect: stat must report 3 hard links.
4416 cancel_lru_locks mdc
4417 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4418 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4420 echo "Trigger namespace LFSCK to repair the nlink count"
4421 $START_NAMESPACE -r -A ||
4422 error "(5) Fail to start LFSCK for namespace"
4424 wait_all_targets_blocked namespace completed 6
4426 local repaired=$($SHOW_NAMESPACE |
4427 awk '/^nlinks_repaired/ { print $2 }')
4428 [ $repaired -eq 1 ] ||
4429 error "(7) Fail to repair nlink count: $repaired"
# After the repair, stat must report 2 hard links (foo and h1).
4431 cancel_lru_locks mdc
4432 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4433 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4435 # Disable 29a, we only allow nlink to be updated if the known linkEA
4436 # entries is larger than nlink count.
4438 #run_test 29a "LFSCK can repair bad nlink count (1)"
# test 29b: hard link foo while fail_loc 0x1626 (OBD_FAIL_LFSCK_LESS_NLINK)
# is set, expect stat to report 1 link, run namespace LFSCK on all targets
# and expect nlinks_repaired == 1 and a link count of 2.
# NOTE(review): the function header is not visible in this excerpt.
4442 echo "The object's nlink attribute is smaller than the object's known"
4443 echo "name entries count. The LFSCK will repair the object's nlink"
4444 echo "attribute to match the known name entries count"
4447 check_mount_and_prep
4449 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4450 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4452 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4453 echo "nlink attribute is smaller than its name entries count."
4455 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
4456 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4457 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4458 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4459 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Check that the injection took effect: stat must report 1 hard link.
4461 cancel_lru_locks mdc
4462 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4463 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4465 echo "Trigger namespace LFSCK to repair the nlink count"
4466 $START_NAMESPACE -r -A ||
4467 error "(5) Fail to start LFSCK for namespace"
4469 wait_all_targets_blocked namespace completed 6
4471 local repaired=$($SHOW_NAMESPACE |
4472 awk '/^nlinks_repaired/ { print $2 }')
4473 [ $repaired -eq 1 ] ||
4474 error "(7) Fail to repair nlink count: $repaired"
# After the repair, stat must report 2 hard links (foo and h1).
4476 cancel_lru_locks mdc
4477 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4478 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4480 run_test 29b "LFSCK can repair bad nlink count (2)"
# test 29c: create 150 hard links to guard/f0, check (when MDSCOUNT >= 2)
# that migrating guard leaves f0's FID unchanged both before and after
# removing 100 links, run namespace LFSCK on all targets and expect
# linkea_overflow_cleared == 1 and nlinks_repaired == 0; afterwards the
# migration is expected to change f0's FID.
# NOTE(review): the function header, the `fi` lines of the three
# `if [ $MDSCOUNT -ge 2 ]` blocks and the statement following the
# "sleep 3 seconds" comment are not visible in this excerpt.
4485 echo "The namespace LFSCK will create many hard links to the target"
4486 echo "file as to exceed the linkEA size limitation. Under such case"
4487 echo "the linkEA will be marked as overflow that will prevent the"
4488 echo "target file to be migrated. Then remove some hard links to"
4489 echo "make the left hard links to be held within the linkEA size"
4490 echo "limitation. But before the namespace LFSCK adding all the"
4491 echo "missed linkEA entries back, the overflow mark (timestamp)"
4492 echo "will not be cleared."
4495 check_mount_and_prep
4497 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
4498 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4499 error "(0.2) Fail to mkdir"
4500 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# oldfid is the reference FID for every "did the migration happen" check.
4501 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4503 # define MAX_LINKEA_SIZE 4096
4504 # sizeof(link_ea_header) = 24
4505 # sizeof(link_ea_entry) = 18
4506 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4507 # (sizeof(link_ea_entry) + name_length))
4508 # If the average name length is 12 bytes, then 150 hard links
4509 # is totally enough to overflow the linkEA
4510 echo "Create 150 hard links should succeed although the linkEA overflow"
4511 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4512 error "(2) Fail to hard link"
4514 cancel_lru_locks mdc
4515 if [ $MDSCOUNT -ge 2 ]; then
4516 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4517 error "(3.1) Migrate failure"
4519 echo "The object with linkEA overflow should NOT be migrated"
4520 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4521 [ "$newfid" == "$oldfid" ] ||
4522 error "(3.2) Migrate should fail: $newfid != $oldfid"
4525 # Remove 100 hard links, then the linkEA should have space
4526 # to hold the missed linkEA entries.
4527 echo "Remove 100 hard links to save space for the missed linkEA entries"
4528 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
4530 if [ $MDSCOUNT -ge 2 ]; then
4531 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4532 error "(5.1) Migrate failure"
4534 # The overflow timestamp is still there, so migration will fail.
4535 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4536 [ "$newfid" == "$oldfid" ] ||
4537 error "(5.2) Migrate should fail: $newfid != $oldfid"
4540 # sleep 3 seconds to guarantee that the overflow is recognized
4543 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4544 $START_NAMESPACE -r -A ||
4545 error "(6) Fail to start LFSCK for namespace"
4547 wait_all_targets_blocked namespace completed 7
4549 local repaired=$($SHOW_NAMESPACE |
4550 awk '/^linkea_overflow_cleared/ { print $2 }')
4551 [ $repaired -eq 1 ] ||
4552 error "(8) Fail to clear linkea overflow: $repaired"
4554 repaired=$($SHOW_NAMESPACE |
4555 awk '/^nlinks_repaired/ { print $2 }')
4556 [ $repaired -eq 0 ] ||
4557 error "(9) Unexpected nlink repaired: $repaired"
4559 if [ $MDSCOUNT -ge 2 ]; then
4560 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4561 error "(10.1) Migrate failure"
4563 # Migration should succeed after clear the overflow timestamp.
4564 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4565 [ "$newfid" != "$oldfid" ] ||
4566 error "(10.2) Migrate should succeed"
4568 ls -l $DIR/$tdir/foo > /dev/null ||
4569 error "(11) 'ls' failed after migration"
4572 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4573 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4575 run_test 29c "verify linkEA size limitation"
# test 30: create foo/d0 while fail_loc 0x161d (OBD_FAIL_LFSCK_NO_LINKEA) is
# set, remove d1, f1, d0 and f0 while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set, run e2fsck on the stopped MDT, then
# run namespace LFSCK on all targets. Expects local_lost_found_moved >= 4,
# foo/f0 reachable again, and a "*-*-D-*" entry under
# .lustre/lost+found/MDT0000/ that contains d1 and f1.
# Skipped unless the MDS backend is ldiskfs.
# NOTE(review): the function header is not visible in this excerpt.
4578 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4579 skip "ldiskfs only test" && return
4582 echo "The namespace LFSCK will move the orphans from backend"
4583 echo "/lost+found directory to normal client visible namespace"
4584 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
4587 check_mount_and_prep
4589 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
# The message names f0, the file this command actually creates.
4590 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
4592 echo "Inject failure stub on MDT0 to simulate the case that"
4593 echo "directory d0 has no linkEA entry, then the LFSCK will"
4594 echo "move it into .lustre/lost+found/MDTxxxx/ later."
4596 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4597 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4598 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4599 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4601 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4602 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4604 echo "Inject failure stub on MDT0 to simulate the case that the"
4605 echo "object's name entry will be removed, but not destroy the"
4606 echo "object. Then backend e2fsck will handle it as orphan and"
4607 echo "add them into the backend /lost+found directory."
4609 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4610 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4611 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4612 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4613 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4614 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4615 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The client is unmounted and the MDT stopped before e2fsck runs on the
# MDT device; the MDT is then restarted with $MOUNT_OPTS_NOSCRUB.
4617 umount_client $MOUNT || error "(10) Fail to stop client!"
4619 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
4622 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
4623 error "(12) Fail to run e2fsck"
4625 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
4626 error "(13) Fail to start MDT0"
4628 echo "Trigger namespace LFSCK to recover backend orphans"
4629 $START_NAMESPACE -r -A ||
4630 error "(14) Fail to start LFSCK for namespace"
4632 wait_all_targets_blocked namespace completed 15
4634 local repaired=$($SHOW_NAMESPACE |
4635 awk '/^local_lost_found_moved/ { print $2 }')
4636 [ $repaired -ge 4 ] ||
4637 error "(16) Fail to recover backend orphans: $repaired"
4639 mount_client $MOUNT || error "(17) Fail to start client!"
4641 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
4643 ls -ail $MOUNT/.lustre/lost+found/
4645 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4646 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4647 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4649 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so the shell passes it to find unexpanded instead of
# globbing it against the current working directory.
4651 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-*-D-*")
4652 [ ! -z "$cname" ] || error "(20) d0 is not recovered"
# Each message names the entry its stat actually checks.
4654 stat ${cname}/d1 || error "(21) d1 is not recovered"
4655 stat ${cname}/f1 || error "(22) f1 is not recovered"
4657 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test 31a: create $MDSCOUNT sub-directories in a striped directory while the
# client-side fail_loc 0x1628 (OBD_FAIL_LFSCK_BAD_NAME_HASH) is set with
# fail_val=0, run namespace LFSCK on all targets and expect
# name_hash_repaired >= 1; after a client remount every d$i must be
# stat-able and removable. Skipped when MDSCOUNT is below 2.
# NOTE(review): the function header, the `done` of the for loop and the end
# of the "Inject failure stub" echo text are not visible in this excerpt.
4660 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4663 echo "For the name entry under a striped directory, if the name"
4664 echo "hash does not match the shard, then the LFSCK will repair"
4665 echo "the bad name entry"
4668 check_mount_and_prep
4670 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4671 error "(1) Fail to create striped directory"
4673 echo "Inject failure stub on client to simulate the case that"
4674 echo "some name entry should be inserted into other non-first"
4675 echo "shard, but inserted into the first shard by wrong"
# set_param is run locally here (no do_facet), unlike the MDS-side stubs.
4677 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4678 $LCTL set_param fail_loc=0x1628 fail_val=0
4679 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4680 error "(2) Fail to create file under striped directory"
4681 $LCTL set_param fail_loc=0 fail_val=0
4683 echo "Trigger namespace LFSCK to repair bad name hash"
4684 $START_NAMESPACE -r -A ||
4685 error "(3) Fail to start LFSCK for namespace"
4687 wait_all_targets_blocked namespace completed 4
4689 local repaired=$($SHOW_NAMESPACE |
4690 awk '/^name_hash_repaired/ { print $2 }')
4691 [ $repaired -ge 1 ] ||
4692 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before verifying the entries.
4694 umount_client $MOUNT || error "(6) umount failed"
4695 mount_client $MOUNT || error "(7) mount failed"
4697 for ((i = 0; i < $MDSCOUNT; i++)); do
4698 stat $DIR/$tdir/striped_dir/d$i ||
4699 error "(8) Fail to stat d$i after LFSCK"
4700 rmdir $DIR/$tdir/striped_dir/d$i ||
4701 error "(9) Fail to unlink d$i after LFSCK"
4704 rmdir $DIR/$tdir/striped_dir ||
4705 error "(10) Fail to remove the striped directory after LFSCK"
4707 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test 31b: same flow as the fail_val=0 variant but with fail_val=1 for the
# client-side fail_loc 0x1628 (OBD_FAIL_LFSCK_BAD_NAME_HASH); the repair
# counter name_hash_repaired is read from mds2 and must be >= 1, and after a
# client remount every d$i must be stat-able and removable.
# Skipped when MDSCOUNT is below 2.
# NOTE(review): the function header, the `done` of the for loop and the end
# of the "Inject failure stub" echo text are not visible in this excerpt.
4710 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4713 echo "For the name entry under a striped directory, if the name"
4714 echo "hash does not match the shard, then the LFSCK will repair"
4715 echo "the bad name entry"
4718 check_mount_and_prep
4720 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4721 error "(1) Fail to create striped directory"
4723 echo "Inject failure stub on client to simulate the case that"
4724 echo "some name entry should be inserted into other non-second"
4725 echo "shard, but inserted into the secod shard by wrong"
# set_param is run locally here (no do_facet), unlike the MDS-side stubs.
4727 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4728 $LCTL set_param fail_loc=0x1628 fail_val=1
4729 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4730 error "(2) Fail to create file under striped directory"
4731 $LCTL set_param fail_loc=0 fail_val=0
4733 echo "Trigger namespace LFSCK to repair bad name hash"
4734 $START_NAMESPACE -r -A ||
4735 error "(3) Fail to start LFSCK for namespace"
4737 wait_all_targets_blocked namespace completed 4
# The counter is read from mds2's lfsck_namespace output.
4739 local repaired=$(do_facet mds2 $LCTL get_param -n \
4740 mdd.$(facet_svc mds2).lfsck_namespace |
4741 awk '/^name_hash_repaired/ { print $2 }')
4742 [ $repaired -ge 1 ] ||
4743 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before verifying the entries.
4745 umount_client $MOUNT || error "(6) umount failed"
4746 mount_client $MOUNT || error "(7) mount failed"
4748 for ((i = 0; i < $MDSCOUNT; i++)); do
4749 stat $DIR/$tdir/striped_dir/d$i ||
4750 error "(8) Fail to stat d$i after LFSCK"
4751 rmdir $DIR/$tdir/striped_dir/d$i ||
4752 error "(9) Fail to unlink d$i after LFSCK"
4755 rmdir $DIR/$tdir/striped_dir ||
4756 error "(10) Fail to remove the striped directory after LFSCK"
4758 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test_31c: create a striped directory while fail_loc 0x1629 is set on
# $SINGLEMDS, run namespace LFSCK, and expect exactly one entry counted
# under striped_dirs_repaired; afterwards the directory must list as empty
# and be removable.
# NOTE(review): the "test_31c() {" header and closing "}" are reconstructed
# (the body uses local/return); the function name is inferred from the
# "run_test 31c" line below - confirm.
test_31c() {
	[ "$MDSCOUNT" -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If nobody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should re-generate the master LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to re-generate master LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	local repaired
	repaired=$($SHOW_NAMESPACE |
		awk '/^striped_dirs_repaired/ { print $2 }')
	# Quoted so that an empty value fails the test cleanly.
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate master LMV EA: $repaired"

	# Remount the client before inspecting the directory.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	local empty
	empty=$(ls $DIR/$tdir/striped_dir/)
	[ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"

	rmdir $DIR/$tdir/striped_dir ||
		error "(8) Fail to remove the striped directory after LFSCK"
}
run_test 31c "Re-generate the lost master LMV EA for striped directory"
# test_31d: create a striped directory while fail_loc 0x1629 is set on
# $SINGLEMDS, then create a file in it before running namespace LFSCK.
# The run must report striped_dirs_repaired as 0, the existing file must
# still stat, a new create must fail, and after "chattr -i" the directory
# is passed to rmdir.
# NOTE(review): the "test_31d() {" header and closing "}" are reconstructed
# (the body uses local/return); the function name is inferred from the
# "run_test 31d" line below - confirm.
test_31d() {
	[ "$MDSCOUNT" -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If somebody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should NOT re-generate the master LMV EA, instead, it should"
	echo "change the broken striped dirctory as read-only to prevent"
	echo "further damage"

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0

	# Remount the client, then modify the broken directory.
	umount_client $MOUNT || error "(2) umount failed"
	mount_client $MOUNT || error "(3) mount failed"

	touch $DIR/$tdir/striped_dir/dummy ||
		error "(4) Fail to touch under broken striped directory"

	echo "Trigger namespace LFSCK to find out the inconsistency"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired
	repaired=$($SHOW_NAMESPACE |
		awk '/^striped_dirs_repaired/ { print $2 }')
	# Quoted without a default: an empty value must still fail here.
	[ "$repaired" -eq 0 ] ||
		error "(7) Re-generate master LMV EA unexpected: $repaired"

	stat $DIR/$tdir/striped_dir/dummy ||
		error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"

	# A successful create here is the failure case.
	touch $DIR/$tdir/striped_dir/foo &&
		error "(9) The broken striped directory should be read-only"

	chattr -i $DIR/$tdir/striped_dir ||
		error "(10) Fail to chattr on the broken striped directory"

	rmdir $DIR/$tdir/striped_dir ||
		error "(11) Fail to remove the striped directory after LFSCK"
}
run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test_31e: create a striped directory while fail_loc 0x162a (fail_val=0)
# is set on $SINGLEMDS, run namespace LFSCK, and expect exactly one entry
# counted under striped_shards_repaired on $SINGLEMDS.
# NOTE(review): the "test_31e() {" header and closing "}" are reconstructed
# (the body uses local/return); the function name is inferred from the
# "run_test 31e" line below - confirm.
test_31e() {
	[ "$MDSCOUNT" -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on the same MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	local repaired
	repaired=$($SHOW_NAMESPACE |
		awk '/^striped_shards_repaired/ { print $2 }')
	# Quoted so that an empty value fails the test cleanly.
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir $DIR/$tdir/striped_dir ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test_31f: same flow as the fail_val=0 variant, but fail_loc 0x162a is set
# with fail_val=1 and the striped_shards_repaired counter is read from
# mds2's lfsck_namespace parameter; exactly one repair is expected.
# NOTE(review): the "test_31f() {" header and closing "}" are reconstructed
# (the body uses local/return); the function name is inferred from the
# "run_test 31f" line below - confirm.
test_31f() {
	[ "$MDSCOUNT" -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on different MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	# The repair counter is queried on mds2, not on $SINGLEMDS.
	local repaired
	repaired=$(do_facet mds2 $LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^striped_shards_repaired/ { print $2 }')
	# Quoted so that an empty value fails the test cleanly.
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir $DIR/$tdir/striped_dir ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test_31g: create a striped directory while fail_loc 0x162b (fail_val=0)
# is set on $SINGLEMDS, run namespace LFSCK, and expect exactly one entry
# counted under striped_shards_repaired.  After a client remount, a file
# must be creatable and removable in the directory.
# NOTE(review): the "test_31g() {" header and closing "}" are reconstructed
# (the body uses local/return); the function name is inferred from the
# "run_test 31g" line below - confirm.
test_31g() {
	[ "$MDSCOUNT" -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "For some reason, the stripe index in the slave LMV EA is"
	echo "corrupted. The LFSCK should repair the slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave LMV EA on the first shard of the striped directory"
	echo "claims the same index as the second shard claims"

	#define OBD_FAIL_LFSCK_BAD_SLAVE_LMV	0x162b
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	local repaired
	repaired=$($SHOW_NAMESPACE |
		awk '/^striped_shards_repaired/ { print $2 }')
	# Quoted so that an empty value fails the test cleanly.
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair slave LMV EA: $repaired"

	# Remount the client before using the repaired directory.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch $DIR/$tdir/striped_dir/foo ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f $DIR/$tdir/striped_dir/foo ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir $DIR/$tdir/striped_dir ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31g "Repair the corrupted slave LMV EA"
# test_31h: create a striped directory while fail_loc 0x162c (fail_val=0)
# is set on $SINGLEMDS, run namespace LFSCK, and expect exactly one entry
# counted under dirent_repaired.  After a client remount, a file must be
# creatable and removable in the directory.
# NOTE(review): the "test_31h() {" header and closing "}" are reconstructed
# (the body uses local/return); the function name is inferred from the
# "run_test 31h" line below - confirm.
test_31h() {
	[ "$MDSCOUNT" -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "For some reason, the shard's name entry in the striped"
	echo "directory may be corrupted. The LFSCK should repair the"
	echo "bad shard's name entry."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "first shard's name entry in the striped directory claims"
	echo "the same index as the second shard's name entry claims."

	#define OBD_FAIL_LFSCK_BAD_SLAVE_NAME	0x162c
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the shard's name entry"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	local repaired
	repaired=$($SHOW_NAMESPACE |
		awk '/^dirent_repaired/ { print $2 }')
	# Quoted so that an empty value fails the test cleanly.
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair shard's name entry: $repaired"

	# Remount the client before using the repaired directory.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch $DIR/$tdir/striped_dir/foo ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f $DIR/$tdir/striped_dir/foo ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir $DIR/$tdir/striped_dir ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31h "Repair the corrupted shard's name entry"
# test_32: with the client unmounted and fail_loc 0x162d (fail_val=3) set on
# $SINGLEMDS, start layout LFSCK and require it to report
# "scanning-phase1"; then stop ost1, clear the fail_loc and require
# $STOP_LFSCK to succeed.
# NOTE(review): the "test_32() {" header and closing "}" are reconstructed
# (the body uses local); the function name is inferred from the
# "run_test 32" line below.  The step numbering starts at (2), so a
# preparatory line may be missing from this body - confirm against the
# authoritative copy.
test_32() {
	umount_client $MOUNT

	#define OBD_FAIL_LFSCK_ASSISTANT_DIRECT	0x162d
	do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
	$START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"

	local STATUS
	STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
	[ "$STATUS" == "scanning-phase1" ] ||
		error "(3) Expect 'scanning-phase1', but got '$STATUS'"

	stop ost1 > /dev/null || error "(4) Fail to stop OST1!"

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0

	$STOP_LFSCK || error "(5) Fail to stop LFSCK!"
}
run_test 32 "stop LFSCK when some OST failed"
# test_33: start layout LFSCK with "--dryrun -o -r" and namespace LFSCK with
# "-e abort -A -r", and check that the "param" line reported by each matches
# the expected flag list.
# NOTE(review): the "test_33() {" header and closing "}" are reconstructed
# (the body uses local); the function name is inferred from the
# "run_test 33" line below.  Any preparatory line that preceded the first
# statement is not visible here - confirm against the authoritative copy.
test_33() {
	$START_LAYOUT --dryrun -o -r ||
		error "(1) Fail to start layout LFSCK"
	wait_all_targets_blocked layout completed 2

	local PARAMS
	PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
	[ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
		error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"

	$START_NAMESPACE -e abort -A -r ||
		error "(4) Fail to start namespace LFSCK"
	wait_all_targets_blocked namespace completed 5

	PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
	[ "$PARAMS" == "failout,all_targets" ] ||
		error "(6) Expect 'failout,all_targets', got '$PARAMS'"
}
run_test 33 "check LFSCK paramters"
# test_34: on a ZFS MDT with at least two MDTs, create a remote directory
# while fail_loc 0x1630 is set on $SINGLEMDS, then run namespace LFSCK
# twice: the first run must report dirent_repaired as 1, the second as 0.
# NOTE(review): the "test_34() {" header, the closing "}" of the function
# and of both "|| {" groups are reconstructed; the function name is
# inferred from the "run_test 34" line below.  Other short lines (for
# example inside the "|| {" groups) may be missing - confirm against the
# authoritative copy.
test_34() {
	[ "$MDSCOUNT" -lt 2 ] && skip "needs >= 2 MDTs" && return
	# Quoted so that empty output cannot break the comparison.
	[ "$(facet_fstype $SINGLEMDS)" != zfs ] &&
		skip "Only valid for ZFS backend" && return

	#define OBD_FAIL_LFSCK_NO_AGENTOBJ	0x1630
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
	$LFS mkdir -i 1 $DIR/$tdir/dummy ||
		error "(1) Fail to create $DIR/$tdir/dummy"

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0
	$START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(3) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		awk '/^dirent_repaired/ { print $2 }')
	# Quoted so that an empty value fails the test cleanly.
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair the lost agent object: $repaired"

	# Second pass: nothing should be left to repair.
	$START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(6) unexpected status"
	}

	repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
	# Quoted without a default: an empty value must still fail here.
	[ "$repaired" -eq 0 ] ||
		error "(7) Unexpected repairing: $repaired"
}
run_test 34 "LFSCK can rebuild the lost agent object"
# test_35: create a remote directory while fail_loc 0x1631 is set on mds2,
# then run namespace LFSCK on all MDTs twice (with a setupall in between):
# mds2 must report agent_entries_repaired as 1 after the first run and 0
# after the second.
# NOTE(review): the "test_35() {" header and closing "}" are reconstructed
# (the body uses local/return); the function name is inferred from the
# "run_test 35" line below.  Other short lines may be missing from this
# body - confirm against the authoritative copy.
test_35() {
	[ "$MDSCOUNT" -lt 2 ] && skip "needs >= 2 MDTs" && return

	#define OBD_FAIL_LFSCK_NO_AGENTENT	0x1631
	do_facet mds2 $LCTL set_param fail_loc=0x1631
	$LFS mkdir -i 1 $DIR/$tdir/dummy ||
		error "(1) Fail to create $DIR/$tdir/dummy"

	do_facet mds2 $LCTL set_param fail_loc=0
	$START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
	# The message names MDS2 explicitly: the previous text interpolated
	# ${k}, which is never assigned in this function.
	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
		error "(3) MDS2 is not the expected 'completed'"

	local repaired
	repaired=$(do_facet mds2 $LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^agent_entries_repaired/ { print $2 }')
	# Quoted so that an empty value fails the test cleanly.
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair the lost agent entry: $repaired"

	# NOTE(review): the message announces a stopall, but only setupall is
	# visible here - confirm whether a "stopall" line was lost.
	echo "stopall to cleanup object cache"
	setupall > /dev/null

	# Second pass: nothing should be left to repair.
	$START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
		error "(6) MDS2 is not the expected 'completed'"

	repaired=$(do_facet mds2 $LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^agent_entries_repaired/ { print $2 }')
	# Quoted without a default: an empty value must still fail here.
	[ "$repaired" -eq 0 ] ||
		error "(7) Unexpected repairing: $repaired"
}
run_test 35 "LFSCK can rebuild the lost agent entry"
# restore MDS/OST size
# (the SAVED_* values are captured near the top of the script, before the
# sizes are changed for the test run)
MDSSIZE=${SAVED_MDSSIZE}
OSTSIZE=${SAVED_OSTSIZE}
OSTCOUNT=${SAVED_OSTCOUNT}

# cleanup the system at last
REFORMAT="yes" cleanup_and_setup_lustre

check_and_cleanup_lustre