3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 # Bug numbers for the excepted tests
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
23 require_dsh_mds || exit 0
27 SAVED_MDSSIZE=${MDSSIZE}
28 SAVED_OSTSIZE=${OSTSIZE}
29 SAVED_OSTCOUNT=${OSTCOUNT}
30 # use small MDS + OST size to speed formatting time
31 # do not make MDSSIZE/OSTSIZE too small, since they affect the default journal size
34 # there is no need for many OSTs; limit them to reduce the format/start/stop overhead
35 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
37 # build up a clean test environment.
41 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
42 skip "Need MDS version at least 2.3.60" && check_and_cleanup_lustre &&
45 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
46 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
48 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
49 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
51 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
52 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
54 # DNE does not support striped directory on zfs-based backend yet.
55 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
56 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Names of the first MDT and the first OST targets, derived from $FSNAME.
60 MDT_DEV="${FSNAME}-MDT0000"
61 OST_DEV="${FSNAME}-OST0000"
# MDT device name: the $SINGLEMDS facet name with every "mds" substring
# removed is passed to mdsdevname.
62 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# The variables below hold command strings. The tests expand them unquoted
# (e.g. "$START_NAMESPACE -r"), so extra options can be appended at the
# call site.
#
# Start an LFSCK of type "namespace" on $MDT_DEV via the $SINGLEMDS facet.
63 START_NAMESPACE="do_facet $SINGLEMDS \
64 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
# Start an LFSCK of type "layout" on $MDT_DEV via the $SINGLEMDS facet.
65 START_LAYOUT="do_facet $SINGLEMDS \
66 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
# Start an LFSCK of type "layout" on $OST_DEV via the ost1 facet.
67 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# Stop the LFSCK on $MDT_DEV.
68 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Print the lfsck_namespace / lfsck_layout parameters (values only, "-n");
# the tests pipe this output to awk to pick out fields such as "status".
69 SHOW_NAMESPACE="do_facet $SINGLEMDS \
70 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
71 SHOW_LAYOUT="do_facet $SINGLEMDS \
72 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
# Same for the layout LFSCK parameter of $OST_DEV, read via the ost1 facet.
73 SHOW_LAYOUT_ON_OST="do_facet ost1 \
74 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount options handed to "start" when (re)starting the MDS; the NOSCRUB
# variant additionally passes "noscrub".
75 MOUNT_OPTS_SCRUB="-o user_xattr"
76 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
85 echo "preparing... $nfiles * $ndirs files will be created $(date)."
86 if [ ! -z $igif ]; then
87 #define OBD_FAIL_FID_IGIF 0x1504
88 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
91 cp $LUSTRE/tests/*.sh $DIR/$tdir/
92 if [ $ndirs -gt 0 ]; then
93 createmany -d $DIR/$tdir/d $ndirs
94 createmany -m $DIR/$tdir/f $ndirs
95 if [ $nfiles -gt 0 ]; then
96 for ((i = 0; i < $ndirs; i++)); do
97 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
98 /dev/null || error "createmany $nfiles"
101 createmany -d $DIR/$tdir/e $ndirs
104 if [ ! -z $igif ]; then
105 touch $DIR/$tdir/dummy
106 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
109 echo "prepared $(date)."
112 run_e2fsck_on_mdt0() {
113 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
115 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
116 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
118 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
119 error "(2) Detected inconsistency on MDT0"
121 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
122 error "(3) Fail to start MDT0"
128 #define OBD_FAIL_LFSCK_DELAY1 0x1600
129 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
130 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
132 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
134 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
135 [ "$STATUS" == "scanning-phase1" ] ||
136 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
138 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
140 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
141 [ "$STATUS" == "stopped" ] ||
142 error "(6) Expect 'stopped', but got '$STATUS'"
144 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
146 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
147 [ "$STATUS" == "scanning-phase1" ] ||
148 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
150 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
151 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
152 mdd.${MDT_DEV}.lfsck_namespace |
153 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
155 error "(9) unexpected status"
158 local repaired=$($SHOW_NAMESPACE |
159 awk '/^updated_phase1/ { print $2 }')
160 [ $repaired -eq 0 ] ||
161 error "(10) Expect nothing to be repaired, but got: $repaired"
163 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
164 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
165 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
166 mdd.${MDT_DEV}.lfsck_namespace |
167 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
169 error "(12) unexpected status"
172 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
173 [ $((scanned1 + 1)) -eq $scanned2 ] ||
174 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
176 echo "stopall, should NOT crash LU-3649"
177 stopall || error "(14) Fail to stopall"
179 run_test 0 "Control LFSCK manually"
182 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
183 skip "OI Scrub not implemented for ZFS" && return
187 #define OBD_FAIL_FID_INDIR 0x1501
188 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
189 touch $DIR/$tdir/dummy
191 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
193 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
194 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
195 mdd.${MDT_DEV}.lfsck_namespace |
196 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
198 error "(4) unexpected status"
201 local repaired=$($SHOW_NAMESPACE |
202 awk '/^dirent_repaired/ { print $2 }')
203 # for interop with old server
204 [ -z "$repaired" ] &&
205 repaired=$($SHOW_NAMESPACE |
206 awk '/^updated_phase1/ { print $2 }')
208 [ $repaired -eq 1 ] ||
209 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
213 mount_client $MOUNT || error "(6) Fail to start client!"
215 #define OBD_FAIL_FID_LOOKUP 0x1505
216 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
217 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
219 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
221 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
225 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
226 skip "OI Scrub not implemented for ZFS" && return
230 #define OBD_FAIL_FID_INLMA 0x1502
231 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
232 touch $DIR/$tdir/dummy
234 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
236 #define OBD_FAIL_FID_NOLMA 0x1506
237 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
238 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
239 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
240 mdd.${MDT_DEV}.lfsck_namespace |
241 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
243 error "(4) unexpected status"
246 local repaired=$($SHOW_NAMESPACE |
247 awk '/^dirent_repaired/ { print $2 }')
248 # for interop with old server
249 [ -z "$repaired" ] &&
250 repaired=$($SHOW_NAMESPACE |
251 awk '/^updated_phase1/ { print $2 }')
253 [ $repaired -eq 1 ] ||
254 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
256 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
259 mount_client $MOUNT || error "(6) Fail to start client!"
261 #define OBD_FAIL_FID_LOOKUP 0x1505
262 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
263 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
267 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
272 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
273 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
274 touch $DIR/$tdir/dummy
276 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
278 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
279 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
280 mdd.${MDT_DEV}.lfsck_namespace |
281 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
283 error "(4) unexpected status"
286 local repaired=$($SHOW_NAMESPACE |
287 awk '/^linkea_repaired/ { print $2 }')
288 # for interop with old server
289 [ -z "$repaired" ] &&
290 repaired=$($SHOW_NAMESPACE |
291 awk '/^updated_phase2/ { print $2 }')
293 [ $repaired -eq 1 ] ||
294 error "(5) Fail to repair crashed linkEA: $repaired"
298 mount_client $MOUNT || error "(6) Fail to start client!"
300 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
301 error "(7) Fail to stat $DIR/$tdir/dummy"
303 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
304 local dummyname=$($LFS fid2path $DIR $dummyfid)
305 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
306 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
308 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
314 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
315 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
316 touch $DIR/$tdir/dummy
318 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
320 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
321 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
322 mdd.${MDT_DEV}.lfsck_namespace |
323 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
325 error "(4) unexpected status"
328 local repaired=$($SHOW_NAMESPACE |
329 awk '/^updated_phase2/ { print $2 }')
330 [ $repaired -eq 1 ] ||
331 error "(5) Fail to repair crashed linkEA: $repaired"
335 mount_client $MOUNT || error "(6) Fail to start client!"
337 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
338 error "(7) Fail to stat $DIR/$tdir/dummy"
340 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
341 local dummyname=$($LFS fid2path $DIR $dummyfid)
342 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
343 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
345 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
351 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
352 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
353 touch $DIR/$tdir/dummy
355 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
357 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
358 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
359 mdd.${MDT_DEV}.lfsck_namespace |
360 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
362 error "(4) unexpected status"
365 local repaired=$($SHOW_NAMESPACE |
366 awk '/^updated_phase2/ { print $2 }')
367 [ $repaired -eq 1 ] ||
368 error "(5) Fail to repair crashed linkEA: $repaired"
372 mount_client $MOUNT || error "(6) Fail to start client!"
374 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
375 error "(7) Fail to stat $DIR/$tdir/dummy"
377 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
378 local dummyname=$($LFS fid2path $DIR $dummyfid)
379 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
380 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
382 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
388 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
389 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
390 touch $DIR/$tdir/dummy
392 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
394 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
395 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
396 mdd.${MDT_DEV}.lfsck_namespace |
397 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
399 error "(4) unexpected status"
402 local repaired=$($SHOW_NAMESPACE |
403 awk '/^linkea_repaired/ { print $2 }')
404 [ $repaired -eq 1 ] ||
405 error "(5) Fail to repair crashed linkEA: $repaired"
409 mount_client $MOUNT || error "(6) Fail to start client!"
411 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
412 error "(7) Fail to stat $DIR/$tdir/dummy"
414 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
415 local dummyname=$($LFS fid2path $DIR $dummyfid)
416 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
417 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
419 run_test 2d "LFSCK can recover the missing linkEA entry"
423 [ $MDSCOUNT -lt 2 ] &&
424 skip "We need at least 2 MDSes for this test" && return
428 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
430 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
431 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
432 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
433 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
435 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
436 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
437 mdd.${MDT_DEV}.lfsck_namespace |
438 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
440 error "(4) unexpected status"
443 local repaired=$($SHOW_NAMESPACE |
444 awk '/^linkea_repaired/ { print $2 }')
445 [ $repaired -eq 1 ] ||
446 error "(5) Fail to repair crashed linkEA: $repaired"
448 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
449 local name=$($LFS fid2path $DIR $fid)
450 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
451 error "(6) Fail to repair linkEA: $fid $name"
453 run_test 2e "namespace LFSCK can verify remote object linkEA"
459 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
460 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
461 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
463 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
464 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
465 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
467 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
468 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
469 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
471 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
472 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
473 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
475 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
477 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
478 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
479 mdd.${MDT_DEV}.lfsck_namespace |
480 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
482 error "(10) unexpected status"
485 local checked=$($SHOW_NAMESPACE |
486 awk '/^checked_phase2/ { print $2 }')
487 [ $checked -ge 4 ] ||
488 error "(11) Fail to check multiple-linked object: $checked"
490 local repaired=$($SHOW_NAMESPACE |
491 awk '/^multiple_linked_repaired/ { print $2 }')
492 [ $repaired -ge 2 ] ||
493 error "(12) Fail to repair multiple-linked object: $repaired"
495 run_test 3 "LFSCK can verify multiple-linked objects"
499 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
500 skip "OI Scrub not implemented for ZFS" && return
503 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
504 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
506 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
507 echo "start $SINGLEMDS with disabling OI scrub"
508 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
509 error "(2) Fail to start MDS!"
511 #define OBD_FAIL_LFSCK_DELAY2 0x1601
512 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
513 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
514 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
515 mdd.${MDT_DEV}.lfsck_namespace |
516 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
518 error "(5) unexpected status"
521 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
522 [ "$STATUS" == "scanning-phase1" ] ||
523 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
525 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
526 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
527 mdd.${MDT_DEV}.lfsck_namespace |
528 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
530 error "(7) unexpected status"
533 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
534 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
536 local repaired=$($SHOW_NAMESPACE |
537 awk '/^dirent_repaired/ { print $2 }')
538 # for interop with old server
539 [ -z "$repaired" ] &&
540 repaired=$($SHOW_NAMESPACE |
541 awk '/^updated_phase1/ { print $2 }')
543 [ $repaired -ge 9 ] ||
544 error "(9) Fail to re-generate FID-in-dirent: $repaired"
548 mount_client $MOUNT || error "(10) Fail to start client!"
550 #define OBD_FAIL_FID_LOOKUP 0x1505
551 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
552 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
553 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
555 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
559 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
560 skip "OI Scrub not implemented for ZFS" && return
563 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
564 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
566 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
567 echo "start $SINGLEMDS with disabling OI scrub"
568 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
569 error "(2) Fail to start MDS!"
571 #define OBD_FAIL_LFSCK_DELAY2 0x1601
572 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
573 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
574 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
575 mdd.${MDT_DEV}.lfsck_namespace |
576 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
578 error "(5) unexpected status"
581 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
582 [ "$STATUS" == "scanning-phase1" ] ||
583 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
585 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
586 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
587 mdd.${MDT_DEV}.lfsck_namespace |
588 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
590 error "(7) unexpected status"
593 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
594 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
596 local repaired=$($SHOW_NAMESPACE |
597 awk '/^dirent_repaired/ { print $2 }')
598 # for interop with old server
599 [ -z "$repaired" ] &&
600 repaired=$($SHOW_NAMESPACE |
601 awk '/^updated_phase1/ { print $2 }')
603 [ $repaired -ge 2 ] ||
604 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
608 mount_client $MOUNT || error "(10) Fail to start client!"
610 #define OBD_FAIL_FID_LOOKUP 0x1505
611 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
612 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
614 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
616 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
617 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
618 local dummyname=$($LFS fid2path $DIR $dummyfid)
619 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
620 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
622 run_test 5 "LFSCK can handle IGIF object upgrading"
627 #define OBD_FAIL_LFSCK_DELAY1 0x1600
628 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
629 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
631 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
632 [ "$STATUS" == "scanning-phase1" ] ||
633 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
635 # Sleep 3 sec to guarantee at least one object is processed by LFSCK
637 # Fail the LFSCK to guarantee there is at least one checkpoint
638 #define OBD_FAIL_LFSCK_FATAL1 0x1608
639 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
640 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
641 mdd.${MDT_DEV}.lfsck_namespace |
642 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
644 error "(4) unexpected status"
647 local POS0=$($SHOW_NAMESPACE |
648 awk '/^last_checkpoint_position/ { print $2 }' |
651 #define OBD_FAIL_LFSCK_DELAY1 0x1600
652 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
653 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
655 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
656 [ "$STATUS" == "scanning-phase1" ] ||
657 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
659 local POS1=$($SHOW_NAMESPACE |
660 awk '/^latest_start_position/ { print $2 }' |
662 [[ $POS0 -lt $POS1 ]] ||
663 error "(7) Expect larger than: $POS0, but got $POS1"
665 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
666 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
667 mdd.${MDT_DEV}.lfsck_namespace |
668 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
670 error "(8) unexpected status"
673 run_test 6a "LFSCK resumes from last checkpoint (1)"
678 #define OBD_FAIL_LFSCK_DELAY2 0x1601
679 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
680 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
682 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
683 [ "$STATUS" == "scanning-phase1" ] ||
684 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
686 # Sleep 5 sec to guarantee that we are in the directory scanning
688 # Fail the LFSCK to guarantee there is at least one checkpoint
689 #define OBD_FAIL_LFSCK_FATAL2 0x1609
690 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
691 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
692 mdd.${MDT_DEV}.lfsck_namespace |
693 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
695 error "(4) unexpected status"
698 local O_POS0=$($SHOW_NAMESPACE |
699 awk '/^last_checkpoint_position/ { print $2 }' |
702 local D_POS0=$($SHOW_NAMESPACE |
703 awk '/^last_checkpoint_position/ { print $4 }')
705 #define OBD_FAIL_LFSCK_DELAY2 0x1601
706 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
707 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
709 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
710 [ "$STATUS" == "scanning-phase1" ] ||
711 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
713 local O_POS1=$($SHOW_NAMESPACE |
714 awk '/^latest_start_position/ { print $2 }' |
716 local D_POS1=$($SHOW_NAMESPACE |
717 awk '/^latest_start_position/ { print $4 }')
719 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
720 [[ $O_POS0 -lt $O_POS1 ]] ||
721 error "(7.1) $O_POS1 is not larger than $O_POS0"
723 [[ $D_POS0 -lt $D_POS1 ]] ||
724 error "(7.2) $D_POS1 is not larger than $D_POS0"
727 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
728 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
729 mdd.${MDT_DEV}.lfsck_namespace |
730 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
732 error "(8) unexpected status"
735 run_test 6b "LFSCK resumes from last checkpoint (2)"
742 #define OBD_FAIL_LFSCK_DELAY2 0x1601
743 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
744 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
746 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
747 [ "$STATUS" == "scanning-phase1" ] ||
748 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
750 # Sleep 3 sec to guarantee at least one object is processed by LFSCK
752 echo "stop $SINGLEMDS"
753 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
755 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
756 echo "start $SINGLEMDS"
757 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
758 error "(5) Fail to start MDS!"
760 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
761 mdd.${MDT_DEV}.lfsck_namespace |
762 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
764 error "(6) unexpected status"
767 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
773 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
774 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
775 for ((i = 0; i < 20; i++)); do
776 touch $DIR/$tdir/dummy${i}
779 #define OBD_FAIL_LFSCK_DELAY3 0x1602
780 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
781 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
782 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
783 mdd.${MDT_DEV}.lfsck_namespace |
784 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
786 error "(4) unexpected status"
790 echo "stop $SINGLEMDS"
791 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
793 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
794 echo "start $SINGLEMDS"
795 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
796 error "(6) Fail to start MDS!"
798 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
799 mdd.${MDT_DEV}.lfsck_namespace |
800 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
802 error "(7) unexpected status"
805 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
810 formatall > /dev/null
816 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
817 [ "$STATUS" == "init" ] ||
818 error "(2) Expect 'init', but got '$STATUS'"
820 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
821 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
822 mkdir $DIR/$tdir/crashed
824 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
825 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
826 for ((i = 0; i < 5; i++)); do
827 touch $DIR/$tdir/dummy${i}
830 umount_client $MOUNT || error "(3) Fail to stop client!"
832 #define OBD_FAIL_LFSCK_DELAY2 0x1601
833 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
834 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
836 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
837 [ "$STATUS" == "scanning-phase1" ] ||
838 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
840 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
842 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
843 [ "$STATUS" == "stopped" ] ||
844 error "(7) Expect 'stopped', but got '$STATUS'"
846 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
848 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
849 [ "$STATUS" == "scanning-phase1" ] ||
850 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
852 #define OBD_FAIL_LFSCK_FATAL2 0x1609
853 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
854 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
855 mdd.${MDT_DEV}.lfsck_namespace |
856 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
858 error "(10) unexpected status"
861 #define OBD_FAIL_LFSCK_DELAY1 0x1600
862 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
863 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
865 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
866 [ "$STATUS" == "scanning-phase1" ] ||
867 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
869 #define OBD_FAIL_LFSCK_CRASH 0x160a
870 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
873 echo "stop $SINGLEMDS"
874 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
876 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
877 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
879 echo "start $SINGLEMDS"
880 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
881 error "(14) Fail to start MDS!"
883 local timeout=$(max_recovery_time)
886 while [ $timer -lt $timeout ]; do
887 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
888 mdt.${MDT_DEV}.recovery_status |
889 awk '/^status/ { print \\\$2 }'")
890 [ "$STATUS" != "RECOVERING" ] && break;
895 [ $timer != $timeout ] ||
896 error "(14.1) recovery timeout"
898 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
899 [ "$STATUS" == "crashed" ] ||
900 error "(15) Expect 'crashed', but got '$STATUS'"
902 #define OBD_FAIL_LFSCK_DELAY2 0x1601
903 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
904 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
906 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
907 [ "$STATUS" == "scanning-phase1" ] ||
908 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
910 echo "stop $SINGLEMDS"
911 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
913 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
914 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
916 echo "start $SINGLEMDS"
917 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
918 error "(19) Fail to start MDS!"
921 while [ $timer -lt $timeout ]; do
922 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
923 mdt.${MDT_DEV}.recovery_status |
924 awk '/^status/ { print \\\$2 }'")
925 [ "$STATUS" != "RECOVERING" ] && break;
930 [ $timer != $timeout ] ||
931 error "(19.1) recovery timeout"
933 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
934 [ "$STATUS" == "paused" ] ||
935 error "(20) Expect 'paused', but got '$STATUS'"
937 #define OBD_FAIL_LFSCK_DELAY3 0x1602
938 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
940 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
941 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
942 mdd.${MDT_DEV}.lfsck_namespace |
943 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
945 error "(22) unexpected status"
948 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
949 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
950 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
952 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
953 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
954 mdd.${MDT_DEV}.lfsck_namespace |
955 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
957 error "(24) unexpected status"
960 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
961 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
963 run_test 8 "LFSCK state machine"
966 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
967 skip "Testing on UP system, the speed may be inaccurate."
971 [[ $server_version -ge $(version_code 2.7.50) ]] ||
972 { skip "Need MDS version >= 2.7.50"; return; }
975 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
976 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
977 createmany -o $DIR/$tdir/lfsck/f 5000
979 local BASE_SPEED1=100
981 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
984 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
985 [ "$STATUS" == "scanning-phase1" ] ||
986 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
988 local SPEED=$($SHOW_LAYOUT |
989 awk '/^average_speed_phase1/ { print $2 }')
991 # There may be some timing error; normally it should be less than 2 seconds.
992 # We allow an additional 20% scheduling error.
994 # MAX_MARGIN = 1.2 = 12 / 10
995 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
996 RUN_TIME1 * 12 / 10))
997 [ $SPEED -lt $MAX_SPEED ] ||
998 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
1000 # adjust speed limit
1001 local BASE_SPEED2=300
1003 do_facet $SINGLEMDS \
1004 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1007 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1008 # MIN_MARGIN = 0.8 = 8 / 10
1009 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1010 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1011 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
1012 [ $SPEED -gt $MIN_SPEED ] || {
1013 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1014 error_ignore LU-5624 \
1015 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1018 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1022 # MAX_MARGIN = 1.2 = 12 / 10
1023 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1024 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1025 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1026 [ $SPEED -lt $MAX_SPEED ] ||
1027 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
1029 do_facet $SINGLEMDS \
1030 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1032 wait_update_facet $SINGLEMDS \
1033 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1034 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1035 error "(7) Failed to get expected 'completed'"
1037 run_test 9a "LFSCK speed control (1)"
# Test 9b body: speed control for the namespace LFSCK (registered by the
# run_test line at the end of this block).
# NOTE(review): the embedded line numbers skip values, so short lines (function
# header, fi/done/else, closing braces, sleeps and the RUN_TIME1/RUN_TIME2/
# TIME_DIFF definitions) are presumably elided from this listing -- confirm
# against the full script before editing the logic.
# Skip when /proc/cpuinfo has no "processor ... : 1" entry (treated as a UP
# system by the skip message).
1040 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1041 skip "Testing on UP system, the speed may be inaccurate."
# Version gate; $server_version is set outside the visible lines.
1045 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1046 { skip "Need MDS version >= 2.7.50"; return; }
# Create 50 directories and 50 files, then 50 files in each directory, while
# fail_loc 0x1604 is set on the MDS.
1050 echo "Preparing another 50 * 50 files (with error) at $(date)."
1051 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1052 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1053 createmany -d $DIR/$tdir/d 50
1054 createmany -m $DIR/$tdir/f 50
1055 for ((i = 0; i < 50; i++)); do
1056 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# First run: start the namespace LFSCK with fail_loc 0x160c set and wait for
# its status to read "stopped" (last argument 10 is presumably a timeout in
# seconds -- confirm against wait_update_facet).
1059 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1060 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1061 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1062 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1063 mdd.${MDT_DEV}.lfsck_namespace |
1064 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1066 error "(5) unexpected status"
1069 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1070 echo "Prepared at $(date)."
# Second run: start again (no -r) with speed limit BASE_SPEED1; the scan is
# expected to be in scanning-phase2.
1072 local BASE_SPEED1=50
1074 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1077 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1078 [ "$STATUS" == "scanning-phase2" ] ||
1079 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1081 local SPEED=$($SHOW_NAMESPACE |
1082 awk '/^average_speed_phase2/ { print $2 }')
1083 # There may be time error, normally it should be less than 2 seconds.
1084 # We allow another 20% schedule error.
1086 # MAX_MARGIN = 1.2 = 12 / 10
# Upper bound while only BASE_SPEED1 applied (shell integer arithmetic, so
# each division truncates).
1087 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1088 RUN_TIME1 * 12 / 10))
1089 [ $SPEED -lt $MAX_SPEED ] ||
1090 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1092 # adjust speed limit
1093 local BASE_SPEED2=150
1095 do_facet $SINGLEMDS \
1096 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-read the average speed and bound it by the time-weighted mix of both
# limits, 20% below and 20% above.
1099 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1100 # MIN_MARGIN = 0.8 = 8 / 10
1101 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1102 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1103 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# On a non-ldiskfs MDS the too-slow case is reported through error_ignore
# (LU-5624); the (9.2) message presumably belongs to an elided else branch.
1104 [ $SPEED -gt $MIN_SPEED ] || {
1105 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1106 error_ignore LU-5624 \
1107 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1110 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1114 # MAX_MARGIN = 1.2 = 12 / 10
1115 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1116 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1117 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1118 [ $SPEED -lt $MAX_SPEED ] ||
1119 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
# Set lfsck_speed_limit to 0 (presumably "no limit") and wait for the scan to
# report "completed".
1121 do_facet $SINGLEMDS \
1122 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1123 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1124 mdd.${MDT_DEV}.lfsck_namespace |
1125 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1127 error "(11) unexpected status"
1130 run_test 9b "LFSCK speed control (2)"
# Test 10 body: ordinary file operations must keep working while the
# namespace LFSCK is in scanning-phase1.
# NOTE(review): the embedded line numbers skip values; the function header,
# done/closing braces and other short lines are presumably elided here.
# Only run when the MDS backend is ldiskfs.
1134 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1135 skip "lookup(..)/linkea on ZFS issue" && return
# Even-numbered entries (0, 2, ... 998) are created with fail_loc 0x1603 set.
1139 echo "Preparing more files with error at $(date)."
1140 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1141 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1143 for ((i = 0; i < 1000; i = $((i+2)))); do
1144 mkdir -p $DIR/$tdir/d${i}
1145 touch $DIR/$tdir/f${i}
1146 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd-numbered entries (1, 3, ... 999) are created with fail_loc 0x1604 set.
1149 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1150 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1152 for ((i = 1; i < 1000; i = $((i+2)))); do
1153 mkdir -p $DIR/$tdir/d${i}
1154 touch $DIR/$tdir/f${i}
1155 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1158 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1159 echo "Prepared at $(date)."
# Extra hard link to f200; f200 itself is unlinked further down (step 10).
1161 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1163 umount_client $MOUNT
1164 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the scan with -s 100 and check that it is in phase 1.
1166 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1169 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1170 [ "$STATUS" == "scanning-phase1" ] ||
1171 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Exercise list/create/mkdir/unlink/remove/rename/hardlink/symlink while the
# scan is running; each failure aborts the test with its own step number.
1173 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1175 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1177 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1179 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1181 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1183 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1185 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1187 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1188 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after all of the operations above.
1190 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1191 [ "$STATUS" == "scanning-phase1" ] ||
1192 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 and wait for the scan to report "completed".
1194 do_facet $SINGLEMDS \
1195 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1196 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1197 mdd.${MDT_DEV}.lfsck_namespace |
1198 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1200 error "(16) unexpected status"
1203 run_test 10 "System is available during LFSCK scanning"
# ost_remove_lastid: mount the given OST locally, delete the LAST_ID file and
# the d0/0 entry under O/<idx> on its mount point, then unmount it again.
# Uses $ost and $idx; the visible caller passes two arguments ("1 0"), so
# these are presumably assigned from $1/$2 on lines elided from this listing
# (the closing brace is also not shown) -- TODO confirm.
# Returns 1 if mount_fstype fails and 2 if unmount_fstype fails.
1206 ost_remove_lastid() {
1209 local rcmd="do_facet ost${ost}"
1211 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1213 # step 1: local mount
1214 mount_fstype ost${ost} || return 1
1215 # step 2: remove the specified LAST_ID
# rm -f: the exit status of this removal is not checked.
1216 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1218 unmount_fstype ost${ost} || return 2
# Test 11a body: remove LAST_ID on ost1 and verify that the layout LFSCK run
# on the OST rebuilds it.
# NOTE(review): embedded line numbers skip values (e.g. 1225-1228), so short
# lines, presumably including the step that stops ost1 before the removal,
# are elided from this listing -- confirm against the full script.
1222 check_mount_and_prep
1223 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1224 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Arguments "1 0": OST number 1, O/<idx> directory 0 (see the rm path in
# ost_remove_lastid).
1229 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1231 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1232 error "(2) Fail to start ost1"
# Set fail_loc 0x160e with fail_val=3 on ost1 before starting the scan;
# presumably this delays the LFSCK so the intermediate flag can be observed.
1234 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1235 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1237 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1238 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# The "flags" line of lfsck_layout must first read "crashed_lastid".
1240 wait_update_facet ost1 "$LCTL get_param -n \
1241 obdfilter.${OST_DEV}.lfsck_layout |
1242 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1244 error "(5) unexpected status"
# Clear the failure injection and wait for the scan to complete.
1247 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1249 wait_update_facet ost1 "$LCTL get_param -n \
1250 obdfilter.${OST_DEV}.lfsck_layout |
1251 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1253 error "(6) unexpected status"
# After completion the flags field must be empty again.
1256 echo "the LAST_ID(s) should have been rebuilt"
1257 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1258 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1260 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b body: make the on-disk LAST_ID on ost1 lag behind the in-memory
# value, then verify that the layout LFSCK on the OST rebuilds it.
# NOTE(review): embedded line numbers skip values; the function header,
# done/closing braces and other short lines are presumably elided here.
1263 check_mount_and_prep
1264 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1266 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1267 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1268 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than precreated_ost_obj_count reports.
1270 local count=$(precreated_ost_obj_count 0 0)
1272 createmany -o $DIR/$tdir/f $((count + 32))
# Read prealloc_last_seq from the MDS-side osp device, then pick the matching
# "<seq>:<id>" line from ost1's last_id; lastid1 is the part after the colon.
1274 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1275 local seq=$(do_facet mds1 $LCTL get_param -n \
1276 osp.${proc_path}.prealloc_last_seq)
1277 local lastid1=$(do_facet ost1 "lctl get_param -n \
1278 obdfilter.${ost1_svc}.last_id" | grep $seq |
1279 awk -F: '{ print $2 }')
1281 umount_client $MOUNT
1282 stop ost1 || error "(1) Fail to stop ost1"
# Restart ost1 with fail_loc 0x215 (OBD_FAIL_OST_ENOSPC) set.
1284 #define OBD_FAIL_OST_ENOSPC 0x215
1285 do_facet ost1 $LCTL set_param fail_loc=0x215
1287 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1288 error "(2) Fail to start ost1"
# Poll up to 60 times until last_id for $seq is readable again.
# NOTE(review): lastid2 has no visible "local" declaration and is unquoted in
# the -z test below; an empty value then yields "[ ! -z ]", which evaluates
# false, so the loop keeps polling.
1290 for ((i = 0; i < 60; i++)); do
1291 lastid2=$(do_facet ost1 "lctl get_param -n \
1292 obdfilter.${ost1_svc}.last_id" | grep $seq |
1293 awk -F: '{ print $2 }')
1294 [ ! -z $lastid2 ] && break;
1298 echo "the on-disk LAST_ID should be smaller than the expected one"
1299 [ $lastid1 -gt $lastid2 ] ||
1300 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1302 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1303 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1305 wait_update_facet ost1 "$LCTL get_param -n \
1306 obdfilter.${OST_DEV}.lfsck_layout |
1307 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1309 error "(6) unexpected status"
# Restart ost1 so that last_id is read back after a fresh start.
1312 stop ost1 || error "(7) Fail to stop ost1"
1314 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1315 error "(8) Fail to start ost1"
# The value for $seq must return to lastid1; on failure the raw last_id
# output is dumped before erroring out.
1317 echo "the on-disk LAST_ID should have been rebuilt"
1318 wait_update_facet ost1 "$LCTL get_param -n \
1319 obdfilter.${ost1_svc}.last_id | grep $seq |
1320 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1321 do_facet ost1 $LCTL get_param -n \
1322 obdfilter.${ost1_svc}.last_id
1323 error "(9) expect lastid1 $seq:$lastid1"
1326 do_facet ost1 $LCTL set_param fail_loc=0
1327 stopall || error "(10) Fail to stopall"
1329 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12 body: start and stop LFSCK on all targets with a single lctl
# command issued on mds1 (-A), first for the namespace type, then for layout.
# NOTE(review): embedded line numbers skip values; the function header and
# the "done" lines closing each loop are presumably elided from this listing.
1332 [ $MDSCOUNT -lt 2 ] &&
1333 skip "We need at least 2 MDSes for test_12" && return
1335 check_mount_and_prep
# One directory per MDT (index k-1), each populated with 100 files.
1336 for k in $(seq $MDSCOUNT); do
1337 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1338 createmany -o $DIR/$tdir/${k}/f 100 ||
1339 error "(0) Fail to create 100 files."
# --- namespace LFSCK ---
# "-s 1" presumably throttles the scan so it is still in phase 1 when the
# status is sampled below.
1342 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1343 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1344 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1346 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1347 for k in $(seq $MDSCOUNT); do
1348 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1349 mdd.$(facet_svc mds${k}).lfsck_namespace |
1350 awk '/^status/ { print $2 }')
1351 [ "$STATUS" == "scanning-phase1" ] ||
1352 error "(3) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1355 echo "Stop namespace LFSCK on all targets by single lctl command."
1356 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1357 error "(4) Fail to stop LFSCK on all devices!"
1359 echo "All the LFSCK targets should be in 'stopped' status."
1360 for k in $(seq $MDSCOUNT); do
1361 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1362 mdd.$(facet_svc mds${k}).lfsck_namespace |
1363 awk '/^status/ { print $2 }')
1364 [ "$STATUS" == "stopped" ] ||
1365 error "(5) MDS${k} Expect 'stopped', but got '$STATUS'"
1368 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1369 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1370 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1372 echo "All the LFSCK targets should be in 'completed' status."
1373 for k in $(seq $MDSCOUNT); do
1374 wait_update_facet mds${k} "$LCTL get_param -n \
1375 mdd.$(facet_svc mds${k}).lfsck_namespace |
1376 awk '/^status/ { print \\\$2 }'" "completed" 8 ||
1377 error "(7) MDS${k} is not the expected 'completed'"
# --- layout LFSCK, run between start/stop_full_debug_logging ---
1380 start_full_debug_logging
1382 echo "Start layout LFSCK on all targets by single command (-s 1)."
1383 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1384 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1386 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1387 for k in $(seq $MDSCOUNT); do
1388 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1389 mdd.$(facet_svc mds${k}).lfsck_layout |
1390 awk '/^status/ { print $2 }')
1391 [ "$STATUS" == "scanning-phase1" ] ||
1392 error "(9) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1395 echo "Stop layout LFSCK on all targets by single lctl command."
1396 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1397 error "(10) Fail to stop LFSCK on all devices!"
1399 echo "All the LFSCK targets should be in 'stopped' status."
1400 for k in $(seq $MDSCOUNT); do
1401 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1402 mdd.$(facet_svc mds${k}).lfsck_layout |
1403 awk '/^status/ { print $2 }')
1404 [ "$STATUS" == "stopped" ] ||
1405 error "(11) MDS${k} Expect 'stopped', but got '$STATUS'"
# For the layout type the OSTs are checked for "stopped" as well.
1408 for k in $(seq $OSTCOUNT); do
1409 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1410 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1411 awk '/^status/ { print $2 }')
1412 [ "$STATUS" == "stopped" ] ||
1413 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1416 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1417 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1418 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1420 echo "All the LFSCK targets should be in 'completed' status."
# The wait value 32 used below is slightly above the 30-second status query
# period described in the next comment.
1421 for k in $(seq $MDSCOUNT); do
1422 # The LFSCK status query internal is 30 seconds. For the case
1423 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1424 # time to guarantee the status sync up.
1425 wait_update_facet mds${k} "$LCTL get_param -n \
1426 mdd.$(facet_svc mds${k}).lfsck_layout |
1427 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1428 error "(14) MDS${k} is not the expected 'completed'"
1431 stop_full_debug_logging
1433 run_test 12 "single command to trigger LFSCK on all devices"
# Test 13 body: create files with a bad lmm_oi via fail_loc 0x160f and check
# that the layout LFSCK reports all 32 of them as repaired.
# NOTE(review): embedded line numbers skip values; the function header and
# closing braces are presumably elided from this listing.
1437 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1438 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1439 echo "MDT-object FID."
1442 check_mount_and_prep
# The fail_loc is active only while the 32 files are created.
1444 echo "Inject failure stub to simulate bad lmm_oi"
1445 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1446 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1447 createmany -o $DIR/$tdir/f 32
1448 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1450 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1451 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1453 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1454 mdd.${MDT_DEV}.lfsck_layout |
1455 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1457 error "(2) unexpected status"
# repaired_others must equal the number of files created above.
1460 local repaired=$($SHOW_LAYOUT |
1461 awk '/^repaired_others/ { print $2 }')
1462 [ $repaired -eq 32 ] ||
1463 error "(3) Fail to repair crashed lmm_oi: $repaired"
1465 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14 body: create MDT-objects whose OST-objects are missing (dangling
# references), check that a default layout LFSCK only counts them, and that a
# second run with -c makes the file usable again.
# NOTE(review): embedded line numbers skip values; the function header,
# closing braces and other short lines are presumably elided here.
1469 echo "The OST-object referenced by the MDT-object should be there;"
1470 echo "otherwise, the LFSCK should re-create the missing OST-object."
1473 check_mount_and_prep
1474 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# With fail_loc 0x1610 set on ost1, create count+31 files plus "guard".
1476 echo "Inject failure stub to simulate dangling referenced MDT-object"
1477 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1478 do_facet ost1 $LCTL set_param fail_loc=0x1610
1479 local count=$(precreated_ost_obj_count 0 0)
1481 createmany -o $DIR/$tdir/f $((count + 31))
1482 touch $DIR/$tdir/guard
1483 do_facet ost1 $LCTL set_param fail_loc=0
1485 start_full_debug_logging
1487 # exhaust other pre-created dangling cases
1488 count=$(precreated_ost_obj_count 0 0)
1489 createmany -o $DIR/$tdir/a $count ||
1490 error "(0) Fail to create $count files."
# Inverted check: a successful ls is the failure case here.
1492 echo "'ls' should fail because of dangling referenced MDT-object"
1493 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: no -c option.
1495 echo "Trigger layout LFSCK to find out dangling reference"
1496 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1498 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1499 mdd.${MDT_DEV}.lfsck_layout |
1500 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1502 error "(3) unexpected status"
# At least 32 dangling references are expected in the counter.
1505 local repaired=$($SHOW_LAYOUT |
1506 awk '/^repaired_dangling/ { print $2 }')
1507 [ $repaired -ge 32 ] ||
1508 error "(4) Fail to repair dangling reference: $repaired"
1510 echo "'stat' should fail because of not repair dangling by default"
1511 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second pass: same scan with -c added.
1513 echo "Trigger layout LFSCK to repair dangling reference"
1514 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1516 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1517 mdd.${MDT_DEV}.lfsck_layout |
1518 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1520 error "(7) unexpected status"
1523 # There may be some async LFSCK updates in processing, wait for
1524 # a while until the target reparation has been done. LU-4970.
# Poll stat on the client until the Size field of "guard" reads 0; on timeout
# the stat output is printed before the error.
1526 echo "'stat' should success after layout LFSCK repairing"
1527 wait_update_facet client "stat $DIR/$tdir/guard |
1528 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1529 stat $DIR/$tdir/guard
1531 error "(8) unexpected size"
1534 repaired=$($SHOW_LAYOUT |
1535 awk '/^repaired_dangling/ { print $2 }')
1536 [ $repaired -ge 32 ] ||
1537 error "(9) Fail to repair dangling reference: $repaired"
1539 stop_full_debug_logging
1541 run_test 14 "LFSCK can repair MDT-object with dangling reference"
# Test 15a body: write one file while fail_loc 0x1611 is set on ost1, then
# check that the layout LFSCK reports exactly one repaired unmatched pair.
# NOTE(review): embedded line numbers skip values; the function header and
# closing braces are presumably elided from this listing.
1545 echo "If the OST-object referenced by the MDT-object back points"
1546 echo "to some non-exist MDT-object, then the LFSCK should repair"
1547 echo "the OST-object to back point to the right MDT-object."
1550 check_mount_and_prep
1551 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1553 echo "Inject failure stub to make the OST-object to back point to"
1554 echo "non-exist MDT-object."
1555 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
# The injection covers the 1MB write and the osc lock cancel that follows it.
1557 do_facet ost1 $LCTL set_param fail_loc=0x1611
1558 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1559 cancel_lru_locks osc
1560 do_facet ost1 $LCTL set_param fail_loc=0
1562 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1563 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1565 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1566 mdd.${MDT_DEV}.lfsck_layout |
1567 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1569 error "(2) unexpected status"
# Only f0 was written under the injection, so exactly one repair is expected.
1572 local repaired=$($SHOW_LAYOUT |
1573 awk '/^repaired_unmatched_pair/ { print $2 }')
1574 [ $repaired -eq 1 ] ||
1575 error "(3) Fail to repair unmatched pair: $repaired"
1577 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b body: like 15a, but a "guard" file is written first without any
# injection and f0 is then written with fail_loc 0x1612 set on ost1; the
# layout LFSCK must report exactly one repaired unmatched pair.
# NOTE(review): embedded line numbers skip values; the function header and
# closing braces are presumably elided from this listing.
1581 echo "If the OST-object referenced by the MDT-object back points"
1582 echo "to other MDT-object that doesn't recognize the OST-object,"
1583 echo "then the LFSCK should repair it to back point to the right"
1584 echo "MDT-object (the first one)."
1587 check_mount_and_prep
1588 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# "guard" is created before the injection is enabled.
1589 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1590 cancel_lru_locks osc
1592 echo "Inject failure stub to make the OST-object to back point to"
1593 echo "other MDT-object"
1595 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1596 do_facet ost1 $LCTL set_param fail_loc=0x1612
1597 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1598 cancel_lru_locks osc
1599 do_facet ost1 $LCTL set_param fail_loc=0
1601 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1602 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1604 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1605 mdd.${MDT_DEV}.lfsck_layout |
1606 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1608 error "(2) unexpected status"
1611 local repaired=$($SHOW_LAYOUT |
1612 awk '/^repaired_unmatched_pair/ { print $2 }')
1613 [ $repaired -eq 1 ] ||
1614 error "(3) Fail to repair unmatched pair: $repaired"
1616 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c body: race a delayed directory migration (MDT1 -> MDT0) against a
# layout LFSCK on all targets; the scan must repair one unmatched pair and
# must not report any multiple-referenced repair.
# NOTE(review): embedded line numbers skip values; the function header,
# "done" and other short lines (possibly a sleep/wait around the background
# migrate) are presumably elided from this listing.
1619 [ $MDSCOUNT -lt 2 ] &&
1620 skip "We need at least 2 MDSes for this test" && return
# Skipped on MDS versions at or above 2.7.55 (LU-6437).
1622 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1623 skip "Skip the test after 2.7.55 see LU-6437" && return
1626 echo "According to current metadata migration implementation,"
1627 echo "before the old MDT-object is removed, both the new MDT-object"
1628 echo "and old MDT-object will reference the same LOV layout. Then if"
1629 echo "the layout LFSCK finds the new MDT-object by race, it will"
1630 echo "regard related OST-object(s) as multiple referenced case, and"
1631 echo "will try to create new OST-object(s) for the new MDT-object."
1632 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1633 echo "MDT-object before confirm the multiple referenced case."
# Directory a1 lives on MDT index 1, with one 1MB file striped on OST 0.
1636 check_mount_and_prep
1637 $LFS mkdir -i 1 $DIR/$tdir/a1
1638 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1639 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1640 cancel_lru_locks osc
1642 echo "Inject failure stub on MDT1 to delay the migration"
1644 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1645 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# The migration runs in the background so the LFSCK can be started meanwhile.
1646 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1647 $LFS migrate -m 0 $DIR/$tdir/a1 &
1650 echo "Trigger layout LFSCK to race with the migration"
1651 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
# $LTIME is defined outside the visible lines.
1653 for k in $(seq $MDSCOUNT); do
1654 # The LFSCK status query internal is 30 seconds. For the case
1655 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1656 # time to guarantee the status sync up.
1657 wait_update_facet mds${k} "$LCTL get_param -n \
1658 mdd.$(facet_svc mds${k}).lfsck_layout |
1659 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1660 error "(2) MDS${k} is not the expected 'completed'"
1663 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Counters are read through $SHOW_LAYOUT.
1664 local repaired=$($SHOW_LAYOUT |
1665 awk '/^repaired_unmatched_pair/ { print $2 }')
1666 [ $repaired -eq 1 ] ||
1667 error "(3) Fail to repair unmatched pair: $repaired"
1669 repaired=$($SHOW_LAYOUT |
1670 awk '/^repaired_multiple_referenced/ { print $2 }')
1671 [ $repaired -eq 0 ] ||
1672 error "(4) Unexpectedly repaird multiple references: $repaired"
1674 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16 body: chown a file while fail_loc 0x1613 is set on the MDS, then
# check that the layout LFSCK reports exactly one repaired inconsistent owner.
# NOTE(review): embedded line numbers skip values; the function header and
# closing braces are presumably elided from this listing.
1678 echo "If the OST-object's owner information does not match the owner"
1679 echo "information stored in the MDT-object, then the LFSCK trust the"
1680 echo "MDT-object and update the OST-object's owner information."
1683 check_mount_and_prep
1684 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1685 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1686 cancel_lru_locks osc
# The injection is active only for the chown to uid 1 / gid 1.
1688 echo "Inject failure stub to skip OST-object owner changing"
1689 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1690 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1691 chown 1.1 $DIR/$tdir/f0
1692 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1694 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1697 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1699 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1700 mdd.${MDT_DEV}.lfsck_layout |
1701 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1703 error "(2) unexpected status"
# One file was affected, so exactly one repair is expected.
1706 local repaired=$($SHOW_LAYOUT |
1707 awk '/^repaired_inconsistent_owner/ { print $2 }')
1708 [ $repaired -eq 1 ] ||
1709 error "(3) Fail to repair inconsistent owner: $repaired"
1711 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17 body: make two files (guard and f0) share the same OST-object via
# fail_loc 0x1614, then check that the layout LFSCK separates them again.
# NOTE(review): embedded line numbers skip values (e.g. the echo after line
# 1717 and the closing braces), so some short lines are presumably elided.
1715 echo "If more than one MDT-objects reference the same OST-object,"
1716 echo "and the OST-object only recognizes one MDT-object, then the"
1717 echo "LFSCK should create new OST-objects for such non-recognized"
1721 check_mount_and_prep
1722 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1724 echo "Inject failure stub to make two MDT-objects to refernce"
1725 echo "the OST-object"
1727 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1728 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# guard gets 1MB of data; f0 is then created empty under the same injection.
1730 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1731 cancel_lru_locks osc
1733 createmany -o $DIR/$tdir/f 1
1735 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1737 cancel_lru_locks mdc
1738 cancel_lru_locks osc
# f0 was never written, yet it must show guard's 1MB size ("ls -l" field 5).
1740 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1741 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1742 [ $size -eq 1048576 ] ||
1743 error "(1) f0 (wrong) size should be 1048576, but got $size"
1745 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1748 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1750 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1751 mdd.${MDT_DEV}.lfsck_layout |
1752 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1754 error "(3) unexpected status"
1757 local repaired=$($SHOW_LAYOUT |
1758 awk '/^repaired_multiple_referenced/ { print $2 }')
1759 [ $repaired -eq 1 ] ||
1760 error "(4) Fail to repair multiple references: $repaired"
# After repair, writing 2MB to f0 must leave guard at its original 1MB.
1762 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1763 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1764 error "(5) Fail to write f0."
1765 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1766 [ $size -eq 1048576 ] ||
1767 error "(6) guard size should be 1048576, but got $size"
1769 run_test 17 "LFSCK can repair multiple references"
# Top-level statement: add "cache" to the local lctl debug setting before the
# test 18 series; output is discarded.
1771 $LCTL set_param debug=+cache > /dev/null
# Test 18a body: drop the layout EA of files via fail_loc 0x1615 and check
# that the layout LFSCK (-o) restores it from the orphan OST-objects.
# NOTE(review): embedded line numbers skip values; the function header,
# fi/done lines and other short lines (e.g. between 1807 and 1813) are
# presumably elided from this listing.
1775 echo "The target MDT-object is there, but related stripe information"
1776 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1777 echo "layout EA entries."
# a1/f1: directory on MDT index 0, one stripe on OST 0, 2MB of data.
1780 check_mount_and_prep
1781 $LFS mkdir -i 0 $DIR/$tdir/a1
1782 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1783 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# "ls -il" prefixes the inode number, so the size is field 6.
1785 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1787 $LFS path2fid $DIR/$tdir/a1/f1
1788 $LFS getstripe $DIR/$tdir/a1/f1
# With at least two MDTs: a2/f2 on MDT index 1, two stripes, also 2MB.
1790 if [ $MDSCOUNT -ge 2 ]; then
1791 $LFS mkdir -i 1 $DIR/$tdir/a2
1792 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1793 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1794 $LFS path2fid $DIR/$tdir/a2/f2
1795 $LFS getstripe $DIR/$tdir/a2/f2
1798 cancel_lru_locks osc
# The chown is the operation performed while the fail_loc is set.
1800 echo "Inject failure, to make the MDT-object lost its layout EA"
1801 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1802 do_facet mds1 $LCTL set_param fail_loc=0x1615
1803 chown 1.1 $DIR/$tdir/a1/f1
1805 if [ $MDSCOUNT -ge 2 ]; then
1806 do_facet mds2 $LCTL set_param fail_loc=0x1615
1807 chown 1.1 $DIR/$tdir/a2/f2
1813 do_facet mds1 $LCTL set_param fail_loc=0
1814 if [ $MDSCOUNT -ge 2 ]; then
1815 do_facet mds2 $LCTL set_param fail_loc=0
1818 cancel_lru_locks mdc
1819 cancel_lru_locks osc
# Before the scan the reported sizes must differ from the saved 2MB size.
1821 echo "The file size should be incorrect since layout EA is lost"
1822 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1823 [ "$cur_size" != "$saved_size" ] ||
1824 error "(1) Expect incorrect file1 size"
1826 if [ $MDSCOUNT -ge 2 ]; then
1827 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1828 [ "$cur_size" != "$saved_size" ] ||
1829 error "(2) Expect incorrect file2 size"
1832 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1833 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
# $LTIME is defined outside the visible lines.
1835 for k in $(seq $MDSCOUNT); do
1836 # The LFSCK status query internal is 30 seconds. For the case
1837 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1838 # time to guarantee the status sync up.
1839 wait_update_facet mds${k} "$LCTL get_param -n \
1840 mdd.$(facet_svc mds${k}).lfsck_layout |
1841 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1842 error "(4) MDS${k} is not the expected 'completed'"
# OST status is sampled once, without waiting.
1845 for k in $(seq $OSTCOUNT); do
1846 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1847 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1848 awk '/^status/ { print $2 }')
1849 [ "$cur_status" == "completed" ] ||
1850 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counts: 1 on mds1 (f1 has one stripe) and 2 on
# mds2 (f2 has two stripes).
1853 local repaired=$(do_facet mds1 $LCTL get_param -n \
1854 mdd.$(facet_svc mds1).lfsck_layout |
1855 awk '/^repaired_orphan/ { print $2 }')
1856 [ $repaired -eq 1 ] ||
1857 error "(6.1) Expect 1 fixed on mds1, but got: $repaired"
1859 if [ $MDSCOUNT -ge 2 ]; then
1860 repaired=$(do_facet mds2 $LCTL get_param -n \
1861 mdd.$(facet_svc mds2).lfsck_layout |
1862 awk '/^repaired_orphan/ { print $2 }')
1863 [ $repaired -eq 2 ] ||
1864 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
1867 $LFS path2fid $DIR/$tdir/a1/f1
1868 $LFS getstripe $DIR/$tdir/a1/f1
1870 if [ $MDSCOUNT -ge 2 ]; then
1871 $LFS path2fid $DIR/$tdir/a2/f2
1872 $LFS getstripe $DIR/$tdir/a2/f2
# After the scan both files must report the saved size again.
1875 echo "The file size should be correct after layout LFSCK scanning"
1876 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1877 [ "$cur_size" == "$saved_size" ] ||
1878 error "(7) Expect file1 size $saved_size, but got $cur_size"
1880 if [ $MDSCOUNT -ge 2 ]; then
1881 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1882 [ "$cur_size" == "$saved_size" ] ||
1883 error "(8) Expect file2 size $saved_size, but got $cur_size"
1886 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test 18b body: remove files while fail_loc 0x1616 is set so that their
# OST-objects become orphans, run the layout LFSCK (-o), then move the
# re-created entries from .lustre/lost+found/MDTxxxx back into place and
# verify their size.
# NOTE(review): embedded line numbers skip values; the function header,
# fi/done lines and other short lines are presumably elided from this listing.
1890 echo "The target MDT-object is lost. The LFSCK should re-create the"
1891 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
1892 echo "can move it back to normal namespace manually."
1895 check_mount_and_prep
1896 $LFS mkdir -i 0 $DIR/$tdir/a1
1897 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1898 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# Size is field 6 of "ls -il"; the FID is kept to build the lost+found name.
1899 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1900 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
1902 $LFS getstripe $DIR/$tdir/a1/f1
# NOTE(review): fid2 has no "local" declaration in the visible lines.
1904 if [ $MDSCOUNT -ge 2 ]; then
1905 $LFS mkdir -i 1 $DIR/$tdir/a2
1906 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1907 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1908 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
1910 $LFS getstripe $DIR/$tdir/a2/f2
1913 cancel_lru_locks osc
# The rm is the operation performed while the fail_loc is set.
1915 echo "Inject failure, to simulate the case of missing the MDT-object"
1916 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1917 do_facet mds1 $LCTL set_param fail_loc=0x1616
1918 rm -f $DIR/$tdir/a1/f1
1920 if [ $MDSCOUNT -ge 2 ]; then
1921 do_facet mds2 $LCTL set_param fail_loc=0x1616
1922 rm -f $DIR/$tdir/a2/f2
1928 do_facet mds1 $LCTL set_param fail_loc=0
1929 if [ $MDSCOUNT -ge 2 ]; then
1930 do_facet mds2 $LCTL set_param fail_loc=0
1933 cancel_lru_locks mdc
1934 cancel_lru_locks osc
1936 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1937 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# $LTIME is defined outside the visible lines.
1939 for k in $(seq $MDSCOUNT); do
1940 # The LFSCK status query internal is 30 seconds. For the case
1941 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1942 # time to guarantee the status sync up.
1943 wait_update_facet mds${k} "$LCTL get_param -n \
1944 mdd.$(facet_svc mds${k}).lfsck_layout |
1945 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1946 error "(2) MDS${k} is not the expected 'completed'"
# OST status is sampled once, without waiting.
1949 for k in $(seq $OSTCOUNT); do
1950 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1951 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1952 awk '/^status/ { print $2 }')
1953 [ "$cur_status" == "completed" ] ||
1954 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counts: 1 on mds1 (f1 has one stripe) and 2 on
# mds2 (f2 has two stripes).
1957 local repaired=$(do_facet mds1 $LCTL get_param -n \
1958 mdd.$(facet_svc mds1).lfsck_layout |
1959 awk '/^repaired_orphan/ { print $2 }')
1960 [ $repaired -eq 1 ] ||
1961 error "(4.1) Expect 1 fixed on mds1, but got: $repaired"
1963 if [ $MDSCOUNT -ge 2 ]; then
1964 repaired=$(do_facet mds2 $LCTL get_param -n \
1965 mdd.$(facet_svc mds2).lfsck_layout |
1966 awk '/^repaired_orphan/ { print $2 }')
1967 [ $repaired -eq 2 ] ||
1968 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# The re-created entries are expected under lost+found/MDT<index> with the
# name "<original FID>-R-0".
1971 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
1972 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
1973 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
1975 if [ $MDSCOUNT -ge 2 ]; then
1976 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
1977 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
1980 $LFS path2fid $DIR/$tdir/a1/f1
1981 $LFS getstripe $DIR/$tdir/a1/f1
1983 if [ $MDSCOUNT -ge 2 ]; then
1984 $LFS path2fid $DIR/$tdir/a2/f2
1985 $LFS getstripe $DIR/$tdir/a2/f2
# Both files must report the size recorded before the removal.
1988 echo "The file size should be correct after layout LFSCK scanning"
1989 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1990 [ "$cur_size" == "$saved_size" ] ||
1991 error "(7) Expect file1 size $saved_size, but got $cur_size"
1993 if [ $MDSCOUNT -ge 2 ]; then
1994 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1995 [ "$cur_size" == "$saved_size" ] ||
1996 error "(8) Expect file2 size $saved_size, but got $cur_size"
1999 run_test 18b "Find out orphan OST-object and repair it (2)"
# Body of sanity-lfsck test 18c; it is registered by the run_test line that
# closes this block.
# NOTE(review): the statement numbers below have gaps, so some original lines
# (function header, block terminators, short assignments) are not shown here.
#
# Scenario, as announced by the echo lines: the MDT-object is lost and the
# OST-object carries no parent FID; layout LFSCK is expected to re-create the
# MDT-object under .lustre/lost+found/MDTxxxx.
2003 echo "The target MDT-object is lost, and the OST-object FID is missing."
2004 echo "The LFSCK should re-create the MDT-object with new FID under the "
2005 echo "directory .lustre/lost+found/MDTxxxx."
# Set up directory a1 (mkdir -i 0) with the layout -c 1 -i 0 -S 1M.
2008 check_mount_and_prep
2009 $LFS mkdir -i 0 $DIR/$tdir/a1
2010 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2012 echo "Inject failure, to simulate the case of missing parent FID"
2013 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2014 do_facet ost1 $LCTL set_param fail_loc=0x1617
# Write 2 x 1M of zeros while fail_loc 0x1617 is set on ost1.
2016 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2017 $LFS getstripe $DIR/$tdir/a1/f1
# With at least two MDTs, build the same file under a2 (mkdir -i 1).
2019 if [ $MDSCOUNT -ge 2 ]; then
2020 $LFS mkdir -i 1 $DIR/$tdir/a2
2021 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a2
2022 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2023 $LFS getstripe $DIR/$tdir/a2/f2
2026 cancel_lru_locks osc
2028 echo "Inject failure, to simulate the case of missing the MDT-object"
2029 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2030 do_facet mds1 $LCTL set_param fail_loc=0x1616
# Remove f1 while fail_loc 0x1616 is active on mds1.
2031 rm -f $DIR/$tdir/a1/f1
2033 if [ $MDSCOUNT -ge 2 ]; then
2034 do_facet mds2 $LCTL set_param fail_loc=0x1616
2035 rm -f $DIR/$tdir/a2/f2
# Clear the fail_loc on the MDS facets again.
2041 do_facet mds1 $LCTL set_param fail_loc=0
2042 if [ $MDSCOUNT -ge 2 ]; then
2043 do_facet mds2 $LCTL set_param fail_loc=0
2046 cancel_lru_locks mdc
2047 cancel_lru_locks osc
2049 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2050 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Every MDS must report status "completed" in its lfsck_layout parameter.
2052 for k in $(seq $MDSCOUNT); do
2053 # The LFSCK status query interval is 30 seconds. For the case
2054 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2055 # time to guarantee the status sync up.
2056 wait_update_facet mds${k} "$LCTL get_param -n \
2057 mdd.$(facet_svc mds${k}).lfsck_layout |
2058 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2059 error "(2) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be "completed".
2062 for k in $(seq $OSTCOUNT); do
2063 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2064 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2065 awk '/^status/ { print $2 }')
2066 [ "$cur_status" == "completed" ] ||
2067 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): $expected is compared below but its assignment is on lines
# not visible in this excerpt (presumably inside this MDSCOUNT branch).
2070 if [ $MDSCOUNT -ge 2 ]; then
# repaired_orphan on mds1 must equal $expected.
# NOTE(review): $repaired is unquoted; an empty value makes the test itself
# fail with a syntax error rather than a clean mismatch.
2076 local repaired=$(do_facet mds1 $LCTL get_param -n \
2077 mdd.$(facet_svc mds1).lfsck_layout |
2078 awk '/^repaired_orphan/ { print $2 }')
2079 [ $repaired -eq $expected ] ||
2080 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# mds2 is expected to have repaired nothing.
2082 if [ $MDSCOUNT -ge 2 ]; then
2083 repaired=$(do_facet mds2 $LCTL get_param -n \
2084 mdd.$(facet_svc mds2).lfsck_layout |
2085 awk '/^repaired_orphan/ { print $2 }')
2086 [ $repaired -eq 0 ] ||
2087 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2090 ls -ail $MOUNT/.lustre/lost+found/
2092 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
# NOTE(review): the -name pattern is unquoted, so the shell may expand it
# against the current directory before find sees it. No "local cname" is
# visible in this excerpt either.
2093 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2094 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name *-N-*)
2096 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2099 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2100 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2101 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# At least one *-N-* entry must exist under MDT0000 (same unquoted pattern).
2103 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-N-*)
2104 [ ! -z "$cname" ] ||
2105 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2107 run_test 18c "Find out orphan OST-object and repair it (3)"
# Body of sanity-lfsck test 18d; it is registered by the run_test line that
# closes this block.
# NOTE(review): the statement numbers below have gaps, so some original lines
# (function header, block terminators, stopall call) are not shown here.
#
# Scenario, as announced by the echo lines: the layout EA slot of the target
# MDT-object is taken by a newly created, never modified OST-object; LFSCK is
# expected to replace it with the orphan OST-object.
2111 echo "The target MDT-object layout EA slot is occpuied by some new"
2112 echo "created OST-object when repair dangling reference case. Such"
2113 echo "conflict OST-object has never been modified. Then when found"
2114 echo "the orphan OST-object, LFSCK will replace it with the orphan"
2118 check_mount_and_prep
# Create f1 and f2 under a1 and remember the size of f2, taken from the
# sixth column of "ls -il".
2120 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2121 echo "guard" > $DIR/$tdir/a1/f1
2122 echo "foo" > $DIR/$tdir/a1/f2
2123 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2124 $LFS path2fid $DIR/$tdir/a1/f1
2125 $LFS getstripe $DIR/$tdir/a1/f1
2126 $LFS path2fid $DIR/$tdir/a1/f2
2127 $LFS getstripe $DIR/$tdir/a1/f2
2128 cancel_lru_locks osc
2130 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2131 echo "to reference the same OST-object (which is f1's OST-obejct)."
2132 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2133 echo "dangling reference case, but f2's old OST-object is there."
# The chown on f2 and the removal of f1 both run while fail_loc 0x1618 is set.
2136 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2137 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2138 chown 1.1 $DIR/$tdir/a1/f2
2139 rm -f $DIR/$tdir/a1/f1
2142 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2144 echo "stopall to cleanup object cache"
2147 setupall > /dev/null
2149 echo "The file size should be incorrect since dangling referenced"
# Before the repair the size of f2 must differ from the saved one.
2150 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2151 [ "$cur_size" != "$saved_size" ] ||
2152 error "(1) Expect incorrect file2 size"
2154 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2155 do_facet $SINGLEMDS $LCTL set_param fail_val=5 fail_loc=0x1602
2157 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2158 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait until mds1 reports "scanning-phase2", then clear fail_val/fail_loc.
2160 wait_update_facet mds1 "$LCTL get_param -n \
2161 mdd.$(facet_svc mds1).lfsck_layout |
2162 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2163 error "(3.0) MDS1 is not the expected 'scanning-phase2'"
2165 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2167 for k in $(seq $MDSCOUNT); do
2168 # The LFSCK status query interval is 30 seconds. For the case
2169 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2170 # time to guarantee the status sync up.
2171 wait_update_facet mds${k} "$LCTL get_param -n \
2172 mdd.$(facet_svc mds${k}).lfsck_layout |
2173 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2174 error "(3) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be "completed".
2177 for k in $(seq $OSTCOUNT); do
2178 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2179 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2180 awk '/^status/ { print $2 }')
2181 [ "$cur_status" == "completed" ] ||
2182 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Exactly one orphan must have been repaired on $SINGLEMDS.
2185 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2186 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2187 awk '/^repaired_orphan/ { print $2 }')
2188 [ $repaired -eq 1 ] ||
2189 error "(5) Expect 1 orphan has been fixed, but got: $repaired"
2191 echo "The file size should be correct after layout LFSCK scanning"
2192 cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2193 [ "$cur_size" == "$saved_size" ] ||
2194 error "(6) Expect file2 size $saved_size, but got $cur_size"
2196 echo "The LFSCK should find back the original data."
# Print the content, FID and layout of f2 for the test log.
2197 cat $DIR/$tdir/a1/f2
2198 $LFS path2fid $DIR/$tdir/a1/f2
2199 $LFS getstripe $DIR/$tdir/a1/f2
2201 run_test 18d "Find out orphan OST-object and repair it (4)"
# Body of sanity-lfsck test 18e; it is registered by the run_test line that
# closes this block.
# NOTE(review): the statement numbers below have gaps, so some original lines
# (function header, block terminators, stopall/sync calls) are not shown here.
#
# Scenario, as announced by the echo lines: the layout EA slot of the target
# MDT-object is taken by a newly created OST-object that has since been
# modified; LFSCK is expected to keep the new data and create a separate file
# referencing the old orphan OST-object.
2205 echo "The target MDT-object layout EA slot is occpuied by some new"
2206 echo "created OST-object when repair dangling reference case. Such"
2207 echo "conflict OST-object has been modified by others. To keep the"
2208 echo "new data, the LFSCK will create a new file to refernece this"
2209 echo "old orphan OST-object."
2212 check_mount_and_prep
# Create f1 and f2 under a1 and remember the size of f2, taken from the
# sixth column of "ls -il".
2214 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2215 echo "guard" > $DIR/$tdir/a1/f1
2216 echo "foo" > $DIR/$tdir/a1/f2
2217 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2218 $LFS path2fid $DIR/$tdir/a1/f1
2219 $LFS getstripe $DIR/$tdir/a1/f1
2220 $LFS path2fid $DIR/$tdir/a1/f2
2221 $LFS getstripe $DIR/$tdir/a1/f2
2222 cancel_lru_locks osc
2224 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2225 echo "to reference the same OST-object (which is f1's OST-obejct)."
2226 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2227 echo "dangling reference case, but f2's old OST-object is there."
# The chown on f2 and the removal of f1 both run while fail_loc 0x1618 is set.
2230 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2231 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2232 chown 1.1 $DIR/$tdir/a1/f2
2233 rm -f $DIR/$tdir/a1/f1
2236 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2238 echo "stopall to cleanup object cache"
2241 setupall > /dev/null
2243 echo "The file size should be incorrect since dangling referenced"
# Before the repair the size of f2 must differ from the saved one.
2244 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2245 [ "$cur_size" != "$saved_size" ] ||
2246 error "(1) Expect incorrect file2 size"
2248 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2249 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
# Full debug logging stays on until stop_full_debug_logging further down.
2251 start_full_debug_logging
2253 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2254 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait until mds1 reports "scanning-phase2" before f2 is modified below.
2256 wait_update_facet mds1 "$LCTL get_param -n \
2257 mdd.$(facet_svc mds1).lfsck_layout |
2258 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2259 error "(3) MDS1 is not the expected 'scanning-phase2'"
2261 # to guarantee all updates are synced.
2265 echo "Write new data to f2 to modify the new created OST-object."
2266 echo "dummy" >> $DIR/$tdir/a1/f2
2268 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2270 for k in $(seq $MDSCOUNT); do
2271 # The LFSCK status query interval is 30 seconds. For the case
2272 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2273 # time to guarantee the status sync up.
2274 wait_update_facet mds${k} "$LCTL get_param -n \
2275 mdd.$(facet_svc mds${k}).lfsck_layout |
2276 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2277 error "(4) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be "completed".
2280 for k in $(seq $OSTCOUNT); do
2281 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2282 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2283 awk '/^status/ { print $2 }')
2284 [ "$cur_status" == "completed" ] ||
2285 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2288 stop_full_debug_logging
# Exactly one orphan must have been repaired on $SINGLEMDS.
2290 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2291 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2292 awk '/^repaired_orphan/ { print $2 }')
2293 [ $repaired -eq 1 ] ||
2294 error "(6) Expect 1 orphan has been fixed, but got: $repaired"
2296 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2297 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2298 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# NOTE(review): the -name pattern is unquoted, so the shell may expand it
# against the current directory before find sees it. No "local cname" is
# visible in this excerpt, and $cname is later used unquoted, which assumes
# find printed exactly one path.
2300 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-C-*)
2301 [ ! -z "$cname" ] ||
2302 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2304 echo "The stub file should keep the original f2 data"
# The stub must have the size f2 had before the failure injection.
2305 cur_size=$(ls -il $cname | awk '{ print $6 }')
2306 [ "$cur_size" == "$saved_size" ] ||
2307 error "(9) Expect file2 size $saved_size, but got $cur_size"
2310 $LFS path2fid $cname
2311 $LFS getstripe $cname
2313 echo "The f2 should contains new data."
# Print the content, FID and layout of f2 for the test log.
2314 cat $DIR/$tdir/a1/f2
2315 $LFS path2fid $DIR/$tdir/a1/f2
2316 $LFS getstripe $DIR/$tdir/a1/f2
2318 run_test 18e "Find out orphan OST-object and repair it (5)"
# Body of sanity-lfsck test 18f; it is registered by the run_test line that
# closes this block.
# NOTE(review): the statement numbers below have gaps, so some original lines
# (function header, block terminators) are not shown here.
#
# Skipped unless at least two OSTs are configured.
2321 [ $OSTCOUNT -lt 2 ] &&
2322 skip "The test needs at least 2 OSTs" && return
2325 echo "The target MDT-object is lost. The LFSCK should re-create the"
2326 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2327 echo "to verify some OST-object(s) during the first stage-scanning,"
2328 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2329 echo "should not be affected."
# a1 (layout -c 1) holds guard and f1; a2 (layout -c 2) holds f2. Both
# directories are created with mkdir -i 0.
2332 check_mount_and_prep
2333 $LFS mkdir -i 0 $DIR/$tdir/a1
2334 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2335 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2336 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2337 $LFS mkdir -i 0 $DIR/$tdir/a2
2338 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2339 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2340 $LFS getstripe $DIR/$tdir/a1/f1
2341 $LFS getstripe $DIR/$tdir/a2/f2
# With at least two MDTs, mirror the setup in a3/a4 (mkdir -i 1).
2343 if [ $MDSCOUNT -ge 2 ]; then
2344 $LFS mkdir -i 1 $DIR/$tdir/a3
2345 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a3
2346 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2347 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2348 $LFS mkdir -i 1 $DIR/$tdir/a4
2349 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2350 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2351 $LFS getstripe $DIR/$tdir/a3/f3
2352 $LFS getstripe $DIR/$tdir/a4/f4
2355 cancel_lru_locks osc
2357 echo "Inject failure, to simulate the case of missing the MDT-object"
2358 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2359 do_facet mds1 $LCTL set_param fail_loc=0x1616
# Remove f1 and f2 while fail_loc 0x1616 is active on mds1.
2360 rm -f $DIR/$tdir/a1/f1
2361 rm -f $DIR/$tdir/a2/f2
2363 if [ $MDSCOUNT -ge 2 ]; then
2364 do_facet mds2 $LCTL set_param fail_loc=0x1616
2365 rm -f $DIR/$tdir/a3/f3
2366 rm -f $DIR/$tdir/a4/f4
# Clear the fail_loc on the MDS facets again.
2372 do_facet mds1 $LCTL set_param fail_loc=0
2373 if [ $MDSCOUNT -ge 2 ]; then
2374 do_facet mds2 $LCTL set_param fail_loc=0
2377 cancel_lru_locks mdc
2378 cancel_lru_locks osc
2380 echo "Inject failure, to simulate the OST0 fail to handle"
2381 echo "MDT0 LFSCK request during the first-stage scanning."
2382 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2383 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2385 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2386 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# First run: every MDS is expected to end in status "partial".
2388 for k in $(seq $MDSCOUNT); do
2389 # The LFSCK status query interval is 30 seconds. For the case
2390 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2391 # time to guarantee the status sync up.
2392 wait_update_facet mds${k} "$LCTL get_param -n \
2393 mdd.$(facet_svc mds${k}).lfsck_layout |
2394 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2395 error "(2) MDS${k} is not the expected 'partial'"
# ost1 is expected to end "partial" as well ...
2398 wait_update_facet ost1 "$LCTL get_param -n \
2399 obdfilter.$(facet_svc ost1).lfsck_layout |
2400 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2401 error "(3) OST1 is not the expected 'partial'"
# ... while ost2 must reach "completed".
2404 wait_update_facet ost2 "$LCTL get_param -n \
2405 obdfilter.$(facet_svc ost2).lfsck_layout |
2406 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2407 error "(4) OST2 is not the expected 'completed'"
2410 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the first run each MDS must report exactly one repaired orphan.
# NOTE(review): the messages print "mds{1}" / "mds{2}" literally; the braces
# are not a variable expansion.
2412 local repaired=$(do_facet mds1 $LCTL get_param -n \
2413 mdd.$(facet_svc mds1).lfsck_layout |
2414 awk '/^repaired_orphan/ { print $2 }')
2415 [ $repaired -eq 1 ] ||
2416 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2418 if [ $MDSCOUNT -ge 2 ]; then
2419 repaired=$(do_facet mds2 $LCTL get_param -n \
2420 mdd.$(facet_svc mds2).lfsck_layout |
2421 awk '/^repaired_orphan/ { print $2 }')
2422 [ $repaired -eq 1 ] ||
2423 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
2426 echo "Trigger layout LFSCK on all devices again to cleanup"
2427 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
# Second run, without failure injection: every MDS must reach "completed".
2429 for k in $(seq $MDSCOUNT); do
2430 # The LFSCK status query interval is 30 seconds. For the case
2431 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2432 # time to guarantee the status sync up.
2433 wait_update_facet mds${k} "$LCTL get_param -n \
2434 mdd.$(facet_svc mds${k}).lfsck_layout |
2435 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2436 error "(8) MDS${k} is not the expected 'completed'"
# NOTE(review): no "local" declaration of cur_status is visible in this
# test body.
2439 for k in $(seq $OSTCOUNT); do
2440 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2441 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2442 awk '/^status/ { print $2 }')
2443 [ "$cur_status" == "completed" ] ||
2444 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# After the second run each MDS must report two repaired orphans.
# NOTE(review): "local repaired" is declared a second time here.
2448 local repaired=$(do_facet mds1 $LCTL get_param -n \
2449 mdd.$(facet_svc mds1).lfsck_layout |
2450 awk '/^repaired_orphan/ { print $2 }')
2451 [ $repaired -eq 2 ] ||
2452 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2454 if [ $MDSCOUNT -ge 2 ]; then
2455 repaired=$(do_facet mds2 $LCTL get_param -n \
2456 mdd.$(facet_svc mds2).lfsck_layout |
2457 awk '/^repaired_orphan/ { print $2 }')
2458 [ $repaired -eq 2 ] ||
2459 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2462 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Top-level statement between tests 18f and 19a: passes debug=-cache to
# "$LCTL set_param" on the local node and discards the command output.
# NOTE(review): presumably this drops the "cache" flag from the debug mask;
# confirm against the lctl documentation.
2464 $LCTL set_param debug=-cache > /dev/null
# Body of sanity-lfsck test 19a; it is registered by the run_test line that
# closes this block.
# NOTE(review): the statement numbers below have gaps, so some original lines
# (function header, closing brace, possibly others) are not shown here.
2467 check_mount_and_prep
2468 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# Create two small files, then drop the client OSC locks.
2470 echo "foo" > $DIR/$tdir/a0
2471 echo "guard" > $DIR/$tdir/a1
2472 cancel_lru_locks osc
2474 echo "Inject failure, then client will offer wrong parent FID when read"
# Set lfsck_verify_pfid to 1 on OST0000; the fail_loc below is set locally
# (no do_facet), i.e. on the node running this script.
2475 do_facet ost1 $LCTL set_param -n \
2476 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2477 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2478 $LCTL set_param fail_loc=0x1619
2480 echo "Read RPC with wrong parent FID should be denied"
# A successful cat is the failure case here.
2481 cat $DIR/$tdir/a0 && error "(3) Read should be denied!"
2482 $LCTL set_param fail_loc=0
2484 run_test 19a "OST-object inconsistency self detect"
# Body of sanity-lfsck test 19b; it is registered by the run_test line that
# closes this block.
# NOTE(review): the statement numbers below have gaps, so some original lines
# (function header, closing brace, possibly others) are not shown here.
2487 check_mount_and_prep
2488 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2490 echo "Inject failure stub to make the OST-object to back point to"
2491 echo "non-exist MDT-object"
# f0 is written while fail_loc 0x1611 is set on ost1; the fail_loc is
# cleared again after the OSC locks are dropped.
2493 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2494 do_facet ost1 $LCTL set_param fail_loc=0x1611
2495 echo "foo" > $DIR/$tdir/f0
2496 cancel_lru_locks osc
2497 do_facet ost1 $LCTL set_param fail_loc=0
2499 echo "Nothing should be fixed since self detect and repair is disabled"
# The "repaired" counter in lfsck_verify_pfid on OST0000 must still be 0.
2500 local repaired=$(do_facet ost1 $LCTL get_param -n \
2501 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2502 awk '/^repaired/ { print $2 }')
2503 [ $repaired -eq 0 ] ||
2504 error "(1) Expected 0 repaired, but got $repaired"
2506 echo "Read RPC with right parent FID should be accepted,"
2507 echo "and cause parent FID on OST to be fixed"
# Set lfsck_verify_pfid to 1, then read f0; the read has to succeed.
2509 do_facet ost1 $LCTL set_param -n \
2510 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2511 cat $DIR/$tdir/f0 || error "(2) Read should not be denied!"
# After the read the "repaired" counter must be exactly 1.
2513 repaired=$(do_facet ost1 $LCTL get_param -n \
2514 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2515 awk '/^repaired/ { print $2 }')
2516 [ $repaired -eq 1 ] ||
2517 error "(3) Expected 1 repaired, but got $repaired"
2519 run_test 19b "OST-object inconsistency self repair"
# Body of sanity-lfsck test 20; it is registered by the run_test line that
# closes this block.
# NOTE(review): the statement numbers below have gaps, so some original lines
# (function header, else/fi terminators, the bcount assignment) are not shown
# here.
#
# Skipped unless at least two OSTs are configured.
2522 [ $OSTCOUNT -lt 2 ] &&
2523 skip "The test needs at least 2 OSTs" && return
2526 echo "The target MDT-object and some of its OST-object are lost."
2527 echo "The LFSCK should find out the left OST-objects and re-create"
2528 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2529 echo "with the partial OST-objects (LOV EA hole)."
2531 echo "New client can access the file with LOV EA hole via normal"
2532 echo "system tools or commands without crash the system."
2534 echo "For old client, even though it cannot access the file with"
2535 echo "LOV EA hole, it should not cause the system crash."
2538 check_mount_and_prep
2539 $LFS mkdir -i 0 $DIR/$tdir/a1
# Layout -c 3 with more than two OSTs, otherwise -c 2 (the else keyword is
# on a line not shown here).
2540 if [ $OSTCOUNT -gt 2 ]; then
2541 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
2544 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
# NOTE(review): $bcount is used below but assigned on lines not visible in
# this excerpt.
2548 # 256 blocks on the stripe0.
2549 # 1 block on the stripe1 for 2 OSTs case.
2550 # 256 blocks on the stripe1 for other cases.
2551 # 1 block on the stripe2 if OSTs > 2
2552 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2553 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2554 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# Remember the FIDs; they form the lost+found names checked later.
2556 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2557 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2558 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2561 $LFS getstripe $DIR/$tdir/a1/f0
2563 $LFS getstripe $DIR/$tdir/a1/f1
2565 $LFS getstripe $DIR/$tdir/a1/f2
# NOTE(review): no "local" declaration of fid3 is visible in this excerpt.
2567 if [ $OSTCOUNT -gt 2 ]; then
2568 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
2569 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2571 $LFS getstripe $DIR/$tdir/a1/f3
2574 cancel_lru_locks osc
2576 echo "Inject failure..."
2577 echo "To simulate f0 lost MDT-object"
2578 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2579 do_facet mds1 $LCTL set_param fail_loc=0x1616
2580 rm -f $DIR/$tdir/a1/f0
2582 echo "To simulate f1 lost MDT-object and OST-object0"
2583 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
2584 do_facet mds1 $LCTL set_param fail_loc=0x161a
2585 rm -f $DIR/$tdir/a1/f1
2587 echo "To simulate f2 lost MDT-object and OST-object1"
# fail_loc stays at 0x161a; only fail_val changes for f2 (and f3).
2588 do_facet mds1 $LCTL set_param fail_val=1
2589 rm -f $DIR/$tdir/a1/f2
2591 if [ $OSTCOUNT -gt 2 ]; then
2592 echo "To simulate f3 lost MDT-object and OST-object2"
2593 do_facet mds1 $LCTL set_param fail_val=2
2594 rm -f $DIR/$tdir/a1/f3
# The client is unmounted here and mounted again after the LFSCK run.
2597 umount_client $MOUNT
2600 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2602 echo "Inject failure to slow down the LFSCK on OST0"
2603 #define OBD_FAIL_LFSCK_DELAY5 0x161b
2604 do_facet ost1 $LCTL set_param fail_loc=0x161b
2606 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2607 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2610 do_facet ost1 $LCTL set_param fail_loc=0
# NOTE(review): the last argument is the literal 32 here, not $LTIME as in
# the neighbouring tests; presumably a timeout in seconds.
2612 for k in $(seq $MDSCOUNT); do
2613 # The LFSCK status query interval is 30 seconds. For the case
2614 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2615 # time to guarantee the status sync up.
2616 wait_update_facet mds${k} "$LCTL get_param -n \
2617 mdd.$(facet_svc mds${k}).lfsck_layout |
2618 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2619 error "(2) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be "completed".
2622 for k in $(seq $OSTCOUNT); do
2623 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2624 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2625 awk '/^status/ { print $2 }')
2626 [ "$cur_status" == "completed" ] ||
2627 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan on mds1: 9 with more than two OSTs, otherwise 4.
2630 local repaired=$(do_facet mds1 $LCTL get_param -n \
2631 mdd.$(facet_svc mds1).lfsck_layout |
2632 awk '/^repaired_orphan/ { print $2 }')
2633 if [ $OSTCOUNT -gt 2 ]; then
2634 [ $repaired -eq 9 ] ||
2635 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
2637 [ $repaired -eq 4 ] ||
2638 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
2641 mount_client $MOUNT || error "(5.0) Fail to start client!"
# Bit tested against the value printed by "lfs getstripe -L".
# NOTE(review): assigned without "local" in the visible lines.
2643 LOV_PATTERN_F_HOLE=0x40000000
# --- Check 5: ${fid0}-R-0, expected to behave like a normal file ---
2646 # ${fid0}-R-0 is the old f0
2648 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
2649 echo "Check $name, which is the old f0"
2651 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
# The getstripe -L output is prefixed with 0x and evaluated as a number.
2653 local pattern=0x$($LFS getstripe -L $name)
2654 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2655 error "(5.2) NOT expect pattern flag hole, but got $pattern"
2657 local stripes=$($LFS getstripe -c $name)
2658 if [ $OSTCOUNT -gt 2 ]; then
2659 [ $stripes -eq 3 ] ||
2660 error "(5.3.1) expect the stripe count is 3, but got $stripes"
2662 [ $stripes -eq 2 ] ||
2663 error "(5.3.2) expect the stripe count is 2, but got $stripes"
# The size must equal what dd wrote: 4096 * bcount bytes.
2666 local size=$(stat $name | awk '/Size:/ { print $2 }')
2667 [ $size -eq $((4096 * $bcount)) ] ||
2668 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
2670 cat $name > /dev/null || error "(5.5) cannot read $name"
2672 echo "dummy" >> $name || error "(5.6) cannot write $name"
2674 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
2676 touch $name || error "(5.8) cannot touch $name"
2678 rm -f $name || error "(5.9) cannot unlink $name"
# --- Check 6: ${fid1}-R-0, expected to carry the hole flag ---
2681 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
2683 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2684 if [ $OSTCOUNT -gt 2 ]; then
2685 echo "Check $name, it contains the old f1's stripe1 and stripe2"
2687 echo "Check $name, it contains the old f1's stripe1"
2690 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
2692 pattern=0x$($LFS getstripe -L $name)
2693 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2694 error "(6.2) expect pattern flag hole, but got $pattern"
2696 stripes=$($LFS getstripe -c $name)
2697 if [ $OSTCOUNT -gt 2 ]; then
2698 [ $stripes -eq 3 ] ||
2699 error "(6.3.1) expect the stripe count is 3, but got $stripes"
2701 [ $stripes -eq 2 ] ||
2702 error "(6.3.2) expect the stripe count is 2, but got $stripes"
2705 size=$(stat $name | awk '/Size:/ { print $2 }')
2706 [ $size -eq $((4096 * $bcount)) ] ||
2707 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
# A plain sequential read is expected to fail.
2709 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# Copy with conv=sync,noerror and count the "Input/output error" lines that
# dd prints; 256 are expected.
2711 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2712 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2715 [ $failures -eq 256 ] ||
2716 error "(6.6) expect 256 IO failures, but get $failures"
2718 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2719 [ $size -eq $((4096 * $bcount)) ] ||
2720 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Writing the first 4096-byte block must fail; writing at seek=300 must work.
2722 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
2723 error "(6.8) write to the LOV EA hole should fail"
2725 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
2726 error "(6.9) write to normal stripe should NOT fail"
2728 echo "foo" >> $name && error "(6.10) append write $name should fail"
2730 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
2732 touch $name || error "(6.12) cannot touch $name"
2734 rm -f $name || error "(6.13) cannot unlink $name"
# --- Check 7: ${fid2}-R-0, expectations depend on OSTCOUNT ---
2737 # ${fid2}-R-0 contains the old f2's stripe0 (and stripe2 if OSTs > 2)
2739 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
2740 if [ $OSTCOUNT -gt 2 ]; then
2741 echo "Check $name, it contains the old f2's stripe0 and stripe2"
2743 echo "Check $name, it contains the old f2's stripe0"
2746 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
2748 pattern=0x$($LFS getstripe -L $name)
2749 stripes=$($LFS getstripe -c $name)
2750 size=$(stat $name | awk '/Size:/ { print $2 }')
# More than two OSTs: hole flag set, 3 stripes, and the write results are
# the reverse of check 6 (seek=300 fails, first block succeeds).
2751 if [ $OSTCOUNT -gt 2 ]; then
2752 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2753 error "(7.2.1) expect pattern flag hole, but got $pattern"
2755 [ $stripes -eq 3 ] ||
2756 error "(7.3.1) expect the stripe count is 3, but got $stripes"
2758 [ $size -eq $((4096 * $bcount)) ] ||
2759 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
2761 cat $name > /dev/null &&
2762 error "(7.5.1) normal read $name should fail"
2764 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2765 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2767 [ $failures -eq 256 ] ||
2768 error "(7.6) expect 256 IO failures, but get $failures"
2770 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2771 [ $size -eq $((4096 * $bcount)) ] ||
2772 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
2774 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
2775 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
2777 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
2778 error "(7.8.1) write to normal stripe should NOT fail"
2780 echo "foo" >> $name &&
2781 error "(7.8.3) append write $name should fail"
2783 chown $RUNAS_ID:$RUNAS_GID $name ||
2784 error "(7.9.1) cannot chown on $name"
2786 touch $name || error "(7.10.1) cannot touch $name"
# The x.2 checks below expect no hole flag, one stripe and 4096 * 256 bytes;
# judging by the message numbering they form the alternative branch for
# OSTCOUNT of 2 or fewer (the else keyword is on a line not shown here).
2788 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2789 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
2791 [ $stripes -eq 1 ] ||
2792 error "(7.3.2) expect the stripe count is 1, but got $stripes"
2795 [ $size -eq $((4096 * (256 + 0))) ] ||
2796 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
2798 cat $name > /dev/null || error "(7.5.2) cannot read $name"
2800 echo "dummy" >> $name || error "(7.8.2) cannot write $name"
2802 chown $RUNAS_ID:$RUNAS_GID $name ||
2803 error "(7.9.2) cannot chown on $name"
2805 touch $name || error "(7.10.2) cannot touch $name"
2808 rm -f $name || error "(7.11) cannot unlink $name"
# The remaining checks only apply when f3 was created (more than two OSTs).
2810 [ $OSTCOUNT -le 2 ] && return
# --- Check 8: ${fid3}-R-0, expected to behave like a normal file ---
2813 # ${fid3}-R-0 should contain the old f3's stripe0 and stripe1
2815 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2816 echo "Check $name, which contains the old f3's stripe0 and stripe1"
2818 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
2820 pattern=0x$($LFS getstripe -L $name)
2821 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2822 error "(8.2) NOT expect pattern flag hole, but got $pattern"
2824 stripes=$($LFS getstripe -c $name)
2825 # LFSCK does not know the old f3 had 3 stripes.
2826 # It only tries to find as much as possible.
2827 # The stripe count depends on the last stripe's offset.
2828 [ $stripes -eq 2 ] ||
2829 error "(8.3) expect the stripe count is 2, but got $stripes"
2831 size=$(stat $name | awk '/Size:/ { print $2 }')
2833 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
2834 error "(8.4) expect the size $((4096 * 512)), but got $size"
2836 cat $name > /dev/null || error "(8.5) cannot read $name"
2838 echo "dummy" >> $name || error "(8.6) cannot write $name"
2840 chown $RUNAS_ID:$RUNAS_GID $name ||
2841 error "(8.7) cannot chown on $name"
2843 touch $name || error "(8.8) cannot touch $name"
2845 rm -f $name || error "(8.9) cannot unlink $name"
2847 run_test 20 "Handle the orphan with dummy LOV EA slot properly"
# Body of sanity-lfsck test 21; it is registered by the run_test line that
# closes this block.
# NOTE(review): the statement numbers below have gaps, so some original lines
# (function header, closing brace) are not shown here.
#
# Skipped when the MDS version code is below 2.5.59.
2850 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
2851 skip "ignore the test if MDS is older than 2.5.59" && return
2853 check_mount_and_prep
2854 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
2856 echo "Start all LFSCK components by default (-s 1)"
# lfsck_start is called without -t, so no component type is selected
# explicitly.
2857 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
2858 error "Fail to start LFSCK"
2860 echo "namespace LFSCK should be in 'scanning-phase1' status"
# Status is sampled once, right after the start; there is no wait loop.
2861 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
2862 [ "$STATUS" == "scanning-phase1" ] ||
2863 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
2865 echo "layout LFSCK should be in 'scanning-phase1' status"
2866 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
2867 [ "$STATUS" == "scanning-phase1" ] ||
2868 error "Expect layout 'scanning-phase1', but got '$STATUS'"
2870 echo "Stop all LFSCK components by default"
2871 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
2872 error "Fail to stop LFSCK"
2874 run_test 21 "run all LFSCK components by default"
# Body of sanity-lfsck test 22a; it is registered by the run_test line that
# closes this block.
# NOTE(review): the statement numbers below have gaps, so some original lines
# (function header, block terminators) are not shown here.
#
# Skipped unless at least two MDSes are configured.
2877 [ $MDSCOUNT -lt 2 ] &&
2878 skip "We need at least 2 MDSes for this test" && return
# NOTE(review): the inner double quotes in two of the echo lines below end
# and restart the string, so the two dots are printed without quotes.
2881 echo "The parent_A references the child directory via some name entry,"
2882 echo "but the child directory back references another parent_B via its"
2883 echo "".." name entry. The parent_B does not exist. Then the namespace"
2884 echo "LFSCK will repair the child directory's ".." name entry."
2887 check_mount_and_prep
2889 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2890 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2892 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2893 echo "The dummy's dotdot name entry references the guard."
# foo/dummy is created with mkdir -i 0 while fail_loc 0x161e is set.
2894 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2895 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2896 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2897 error "(3) Fail to mkdir on MDT0"
2898 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Remove the guard directory that, per the echo above, dummy points back to.
2900 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
2902 echo "Trigger namespace LFSCK to repair unmatched pairs"
2903 $START_NAMESPACE -A -r ||
2904 error "(5) Fail to start LFSCK for namespace"
# Wait for the namespace LFSCK on $SINGLEMDS to report "completed"; the last
# argument is the literal 32 (presumably a timeout in seconds).
2906 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2907 mdd.${MDT_DEV}.lfsck_namespace |
2908 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2910 error "(6) unexpected status"
# Exactly one unmatched pair must have been repaired.
2913 local repaired=$($SHOW_NAMESPACE |
2914 awk '/^unmatched_pairs_repaired/ { print $2 }')
2915 [ $repaired -eq 1 ] ||
2916 error "(7) Fail to repair unmatched pairs: $repaired"
2918 echo "'ls' should success after namespace LFSCK repairing"
2919 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
2920 error "(8) ls should success."
2922 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Body of sanity-lfsck test 22b; it is registered by the run_test line that
# closes this block.
# NOTE(review): the statement numbers below have gaps, so some original lines
# (function header, block terminators) are not shown here.
#
# Skipped unless at least two MDSes are configured.
2925 [ $MDSCOUNT -lt 2 ] &&
2926 skip "We need at least 2 MDSes for this test" && return
# NOTE(review): the inner double quotes in two of the echo lines below end
# and restart the string, so the two dots are printed without quotes.
2929 echo "The parent_A references the child directory via the name entry_B,"
2930 echo "but the child directory back references another parent_C via its"
2931 echo "".." name entry. The parent_C exists, but there is no the name"
2932 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
2933 echo "the child directory's ".." name entry and its linkEA."
2936 check_mount_and_prep
2938 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2939 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2941 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2942 echo "and bad linkEA. The dummy's dotdot name entry references the"
2943 echo "guard. The dummy's linkEA references n non-exist name entry."
# foo/dummy is created with mkdir -i 0 while fail_loc 0x161e is set.
2944 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2945 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2946 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2947 error "(3) Fail to mkdir on MDT0"
2948 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Before the repair, fid2path must NOT resolve the FID to foo/dummy.
2950 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
2951 echo "fid2path should NOT work on the dummy's FID $dummyfid"
2952 local dummyname=$($LFS fid2path $DIR $dummyfid)
2953 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
2954 error "(4) fid2path works unexpectedly."
2956 echo "Trigger namespace LFSCK to repair unmatched pairs"
2957 $START_NAMESPACE -A -r ||
2958 error "(5) Fail to start LFSCK for namespace"
# Wait for the namespace LFSCK on $SINGLEMDS to report "completed"; the last
# argument is the literal 32 (presumably a timeout in seconds).
2960 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2961 mdd.${MDT_DEV}.lfsck_namespace |
2962 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2964 error "(6) unexpected status"
# Exactly one unmatched pair must have been repaired.
2967 local repaired=$($SHOW_NAMESPACE |
2968 awk '/^unmatched_pairs_repaired/ { print $2 }')
2969 [ $repaired -eq 1 ] ||
2970 error "(7) Fail to repair unmatched pairs: $repaired"
2972 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
# After the repair the same FID must resolve to foo/dummy.
# NOTE(review): "local dummyname" is declared a second time here.
2973 local dummyname=$($LFS fid2path $DIR $dummyfid)
2974 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
2975 error "(8) fid2path does not work"
2977 run_test 22b "LFSCK can repair unmatched pairs (2)"
# test 23a: remove d0/d1 while fail_loc 0x1620 is set on mds2 so that a
# dangling name entry is left behind, then run namespace LFSCK twice (the
# second time with -C) and check dangling_repaired and the 'ls' result after
# each run.
2980 [ $MDSCOUNT -lt 2 ] &&
2981 skip "We need at least 2 MDSes for this test" && return
2984 echo "The name entry is there, but the MDT-object for such name "
2985 echo "entry does not exist. The namespace LFSCK should find out "
2986 echo "and repair the inconsistency as required."
2989 check_mount_and_prep
2991 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
2992 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
2994 echo "Inject failure stub on MDT1 to simulate dangling name entry"
2995 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
2996 do_facet mds2 $LCTL set_param fail_loc=0x1620
2997 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
2998 do_facet mds2 $LCTL set_param fail_loc=0
3000 echo "'ls' should fail because of dangling name entry"
3001 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
# First pass: started without -C.
3003 echo "Trigger namespace LFSCK to find out dangling name entry"
3004 $START_NAMESPACE -A -r ||
3005 error "(5) Fail to start LFSCK for namespace"
3007 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3008 mdd.${MDT_DEV}.lfsck_namespace |
3009 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3011 error "(6) unexpected status"
3014 local repaired=$($SHOW_NAMESPACE |
3015 awk '/^dangling_repaired/ { print $2 }')
3016 [ $repaired -eq 1 ] ||
3017 error "(7) Fail to repair dangling name entry: $repaired"
3019 echo "'ls' should fail because not re-create MDT-object by default"
3020 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
# Second pass adds -C; afterwards 'ls' on d0/d1 is expected to succeed.
# NOTE(review): presumably -C asks LFSCK to re-create the missing MDT-object;
# confirm against the lfsck_start documentation.
3022 echo "Trigger namespace LFSCK again to repair dangling name entry"
3023 $START_NAMESPACE -A -r -C ||
3024 error "(9) Fail to start LFSCK for namespace"
3026 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3027 mdd.${MDT_DEV}.lfsck_namespace |
3028 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3030 error "(10) unexpected status"
3033 repaired=$($SHOW_NAMESPACE |
3034 awk '/^dangling_repaired/ { print $2 }')
3035 [ $repaired -eq 1 ] ||
3036 error "(11) Fail to repair dangling name entry: $repaired"
3038 echo "'ls' should success after namespace LFSCK repairing"
3039 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3041 run_test 23a "LFSCK can repair dangling name entry (1)"
# test 23b: hard-link d0/f0 to d0/foo while fail_loc 0x1621 is set on
# $SINGLEMDS, remove d0/f1, run namespace LFSCK with -r -C, then require
# dangling_repaired == 1, multiple_linked_repaired == 1, and that d0/foo
# reads back "dummy" (the content written to f0).
3045 echo "The objectA has multiple hard links, one of them corresponding"
3046 echo "to the name entry_B. But there is something wrong for the name"
3047 echo "entry_B and cause entry_B to references non-exist object_C."
3048 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3049 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3050 echo "comes to the second-stage scanning, it will find that the"
3051 echo "former re-creating object_C is not proper, and will try to"
3052 echo "replace the object_C with the real object_A."
3055 check_mount_and_prep
3057 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3058 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3059 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3061 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3062 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3063 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
3064 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3065 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3067 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3069 echo "'ls' should fail because of dangling name entry"
3070 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3071 error "(6) ls should fail."
3073 echo "Trigger namespace LFSCK to find out dangling name entry"
3074 $START_NAMESPACE -r -C ||
3075 error "(7) Fail to start LFSCK for namespace"
3077 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3078 mdd.${MDT_DEV}.lfsck_namespace |
3079 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3081 error "(8) unexpected status"
# Two counters are checked from the same LFSCK run.
3084 local repaired=$($SHOW_NAMESPACE |
3085 awk '/^dangling_repaired/ { print $2 }')
3086 [ $repaired -eq 1 ] ||
3087 error "(9) Fail to repair dangling name entry: $repaired"
3089 repaired=$($SHOW_NAMESPACE |
3090 awk '/^multiple_linked_repaired/ { print $2 }')
3091 [ $repaired -eq 1 ] ||
3092 error "(10) Fail to drop the former created object: $repaired"
# foo must expose f0's content again.
3094 local data=$(cat $DIR/$tdir/d0/foo)
3095 [ "$data" == "dummy" ] ||
3096 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3098 run_test 23b "LFSCK can repair dangling name entry (2)"
# test 23c: same injection as 23b (fail_loc 0x1621 during the hard link), but
# LFSCK is started with fail_loc 0x1602 / fail_val=10 set on $SINGLEMDS. The
# test waits for d0/foo to report Size 0, appends to it, clears fail_loc, then
# requires dangling_repaired == 1 and that foo's content is NOT "dummy".
3102 echo "The objectA has multiple hard links, one of them corresponding"
3103 echo "to the name entry_B. But there is something wrong for the name"
3104 echo "entry_B and cause entry_B to references non-exist object_C."
3105 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3106 echo "as dangling, and re-create the lost object_C. And then others"
3107 echo "modified the re-created object_C. When the LFSCK comes to the"
3108 echo "second-stage scanning, it will find that the former re-creating"
3109 echo "object_C maybe wrong and try to replace the object_C with the"
3110 echo "real object_A. But because object_C has been modified, so the"
3111 echo "LFSCK cannot replace it."
3114 check_mount_and_prep
3116 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3117 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3118 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3120 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3121 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3122 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
3123 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3124 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3126 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3128 echo "'ls' should fail because of dangling name entry"
3129 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3130 error "(6) ls should fail."
3132 #define OBD_FAIL_LFSCK_DELAY3 0x1602
3133 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3135 echo "Trigger namespace LFSCK to find out dangling name entry"
3136 $START_NAMESPACE -r -C ||
3137 error "(7) Fail to start LFSCK for namespace"
# Wait until foo is visible from the client with Size 0. On timeout, stat the
# file that was being polled; the original stat'ed $DIR/$tdir/guard, which
# this test never creates, so the diagnostic was useless.
3139 wait_update_facet client "stat $DIR/$tdir/d0/foo |
3140 awk '/Size/ { print \\\$2 }'" "0" 32 || {
3141 stat $DIR/$tdir/d0/foo
3143 error "(8) unexpected size"
# Modify foo while the delay injection is still set.
3146 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
3147 cancel_lru_locks osc
3149 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3150 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3151 mdd.${MDT_DEV}.lfsck_namespace |
3152 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3154 error "(10) unexpected status"
3157 local repaired=$($SHOW_NAMESPACE |
3158 awk '/^dangling_repaired/ { print $2 }')
3159 [ $repaired -eq 1 ] ||
3160 error "(11) Fail to repair dangling name entry: $repaired"
3162 local data=$(cat $DIR/$tdir/d0/foo)
3163 [ "$data" != "dummy" ] ||
3164 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
3166 run_test 23c "LFSCK can repair dangling name entry (3)"
# test 24: create and remove d0/dummy/foo while fail_loc 0x1622 is set on
# $SINGLEMDS, run namespace LFSCK, then require
# multiple_referenced_repaired == 1 and an entry named "$cfid-$pfid-D-0" under
# .lustre/lost+found/MDT0000/.
3169 [ $MDSCOUNT -lt 2 ] &&
3170 skip "We need at least 2 MDSes for this test" && return
3173 echo "Two MDT-objects back reference the same name entry via their"
3174 echo "each own linkEA entry, but the name entry only references one"
3175 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
3176 echo "for the MDT-object that is not recognized. If such MDT-object"
3177 echo "has no other linkEA entry after the removing, then the LFSCK"
3178 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
3181 check_mount_and_prep
3183 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
# The bare path2fid calls below only print the FIDs to the test output.
3185 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
3186 $LFS path2fid $DIR/$tdir/d0/guard
3188 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
3189 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid is the parent FID used to build the expected orphan name below.
# NOTE(review): the else/fi of this 'if' are not visible in this listing;
# presumably guard's FID is used on non-ldiskfs backends and dummy's FID
# otherwise; confirm against the full file.
3192 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
3193 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
3195 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
3198 touch $DIR/$tdir/d0/guard/foo ||
3199 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
3201 echo "Inject failure stub on MDT0 to simulate the case that"
3202 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
3203 echo "that references $DIR/$tdir/d0/guard/foo."
3204 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
3205 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
3206 echo "there with the same linkEA entry as another MDT-object"
3207 echo "$DIR/$tdir/d0/guard/foo has"
3209 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
3210 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
3211 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
3212 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
3213 $LFS path2fid $DIR/$tdir/d0/dummy/foo
# cfid is captured before the rmdir so it can be used in the orphan name.
3214 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
3215 rmdir $DIR/$tdir/d0/dummy/foo ||
3216 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
3217 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3219 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
3220 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
3221 error "(6) stat successfully unexpectedly"
3223 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
3224 $START_NAMESPACE -A -r ||
3225 error "(7) Fail to start LFSCK for namespace"
3227 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3228 mdd.${MDT_DEV}.lfsck_namespace |
3229 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3231 error "(8) unexpected status"
3234 local repaired=$($SHOW_NAMESPACE |
3235 awk '/^multiple_referenced_repaired/ { print $2 }')
3236 [ $repaired -eq 1 ] ||
3237 error "(9) Fail to repair multiple referenced name entry: $repaired"
3239 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3240 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3241 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Expected orphan name is built from the child and parent FIDs.
3243 local cname="$cfid-$pfid-D-0"
3244 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
3245 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
3247 run_test 24 "LFSCK can repair multiple-referenced name entry"
# test 25: ldiskfs only. Create d0/foo while fail_loc 0x1623 is set on
# $SINGLEMDS, run namespace LFSCK, then require bad_file_type_repaired == 1
# and that 'ls' on d0 succeeds.
3250 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3251 skip "Only support to inject failure on ldiskfs" && return
3254 echo "The file type in the name entry does not match the file type"
3255 echo "claimed by the referenced object. Then the LFSCK will update"
3256 echo "the file type in the name entry."
3259 check_mount_and_prep
3261 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3263 echo "Inject failure stub on MDT0 to simulate the case that"
3264 echo "the file type stored in the name entry is wrong."
3266 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
3267 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
3268 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
3269 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3271 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
3272 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Poll the lfsck_namespace status on $SINGLEMDS until it reads "completed".
3274 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3275 mdd.${MDT_DEV}.lfsck_namespace |
3276 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3278 error "(4) unexpected status"
3281 local repaired=$($SHOW_NAMESPACE |
3282 awk '/^bad_file_type_repaired/ { print $2 }')
3283 [ $repaired -eq 1 ] ||
3284 error "(5) Fail to repair bad file type in name entry: $repaired"
3286 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
3288 run_test 25 "LFSCK can repair bad file type in the name entry"
# test 26a: unlink d0/foo while fail_loc 0x1624 is set on $SINGLEMDS (a hard
# link d0/dummy to the same file remains), run namespace LFSCK, then require
# lost_dirent_repaired == 1, that d0/foo is listable again, and that its FID
# is unchanged.
3292 echo "The local name entry back referenced by the MDT-object is lost."
3293 echo "The namespace LFSCK will add the missing local name entry back"
3294 echo "to the normal namespace."
3297 check_mount_and_prep
3299 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3300 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# Record foo's FID before the injection to compare after the repair.
3301 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3303 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3304 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3306 echo "Inject failure stub on MDT0 to simulate the case that"
3307 echo "foo's name entry will be removed, but the foo's object"
3308 echo "and its linkEA are kept in the system."
3310 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3311 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3312 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3313 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The original omitted 'error' here, so an unexpectedly successful 'ls' only
# tried to execute the message as a command and the test carried on.
3315 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 && error "(5) 'ls' should fail"
3317 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3318 $START_NAMESPACE -r -A ||
3319 error "(6) Fail to start LFSCK for namespace"
3321 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3322 mdd.${MDT_DEV}.lfsck_namespace |
3323 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3325 error "(7) unexpected status"
3328 local repaired=$($SHOW_NAMESPACE |
3329 awk '/^lost_dirent_repaired/ { print $2 }')
3330 [ $repaired -eq 1 ] ||
3331 error "(8) Fail to repair lost dirent: $repaired"
3333 ls -ail $DIR/$tdir/d0/foo ||
3334 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
3336 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3337 [ "$foofid" == "$foofid2" ] ||
3338 error "(10) foo's FID changed: $foofid, $foofid2"
3340 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# test 26b: d0 is created with -i 1 and d0/foo with -i 0. foo is removed with
# rmdir while fail_loc 0x1624 is set on $SINGLEMDS; namespace LFSCK must then
# report lost_dirent_repaired == 1, make d0/foo listable again, and keep its
# FID unchanged.
3343 [ $MDSCOUNT -lt 2 ] &&
3344 skip "We need at least 2 MDSes for this test" && return
3347 echo "The remote name entry back referenced by the MDT-object is lost."
3348 echo "The namespace LFSCK will add the missing remote name entry back"
3349 echo "to the normal namespace."
3352 check_mount_and_prep
3354 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3355 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Record foo's FID before the injection to compare after the repair.
3356 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3358 echo "Inject failure stub on MDT0 to simulate the case that"
3359 echo "foo's name entry will be removed, but the foo's object"
3360 echo "and its linkEA are kept in the system."
3362 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3363 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3364 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3365 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The original omitted 'error' here, so an unexpectedly successful 'ls' only
# tried to execute the message as a command and the test carried on.
3367 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 && error "(4) 'ls' should fail"
3369 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3370 $START_NAMESPACE -r -A ||
3371 error "(5) Fail to start LFSCK for namespace"
3373 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3374 mdd.${MDT_DEV}.lfsck_namespace |
3375 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3377 error "(6) unexpected status"
3380 local repaired=$($SHOW_NAMESPACE |
3381 awk '/^lost_dirent_repaired/ { print $2 }')
3382 [ $repaired -eq 1 ] ||
3383 error "(7) Fail to repair lost dirent: $repaired"
3385 ls -ail $DIR/$tdir/d0/foo ||
3386 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
3388 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3389 [ "$foofid" == "$foofid2" ] ||
3390 error "(9) foo's FID changed: $foofid, $foofid2"
3392 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# test 27a: unlink d0/foo and its hard link d0/dummy while fail_loc 0x1624 is
# set on $SINGLEMDS, remove d0 itself, run namespace LFSCK, then require
# lost_dirent_repaired == 1 and an entry matching *-P-* under
# .lustre/lost+found/MDT0000/.
3396 echo "The local parent referenced by the MDT-object linkEA is lost."
3397 echo "The namespace LFSCK will re-create the lost parent as orphan."
3400 check_mount_and_prep
3402 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3403 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3404 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3405 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3407 echo "Inject failure stub on MDT0 to simulate the case that"
3408 echo "foo's name entry will be removed, but the foo's object"
3409 echo "and its linkEA are kept in the system. And then remove"
3410 echo "another hard link and the parent directory."
3412 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3413 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3414 rm -f $DIR/$tdir/d0/foo ||
3415 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3416 rm -f $DIR/$tdir/d0/dummy ||
3417 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
3418 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3420 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
# The original omitted 'error' here, so an unexpectedly successful 'ls' only
# tried to execute the message as a command and the test carried on.
3421 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
3423 echo "Trigger namespace LFSCK to repair the lost parent"
3424 $START_NAMESPACE -r -A ||
3425 error "(6) Fail to start LFSCK for namespace"
3427 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3428 mdd.${MDT_DEV}.lfsck_namespace |
3429 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3431 error "(7) unexpected status"
3434 local repaired=$($SHOW_NAMESPACE |
3435 awk '/^lost_dirent_repaired/ { print $2 }')
3436 [ $repaired -eq 1 ] ||
3437 error "(8) Fail to repair lost dirent: $repaired"
3439 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3440 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3441 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3443 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# Quote the pattern so find receives it verbatim instead of the shell
# expanding it against the current directory first.
3445 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-P-*")
3446 [ ! -z "$cname" ] ||
3447 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
3449 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# test 27b: d0 is created with -i 1 and d0/foo with -i 0. foo is removed with
# rmdir while fail_loc 0x1624 is set on $SINGLEMDS, then d0 is removed.
# Namespace LFSCK must report lost_dirent_repaired == 1 and leave an entry
# matching *-P-* under .lustre/lost+found/MDT0001/.
3452 [ $MDSCOUNT -lt 2 ] &&
3453 skip "We need at least 2 MDSes for this test" && return
3456 echo "The remote parent referenced by the MDT-object linkEA is lost."
3457 echo "The namespace LFSCK will re-create the lost parent as orphan."
3460 check_mount_and_prep
3462 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3463 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Print d0's FID to the test output for reference.
3465 $LFS path2fid $DIR/$tdir/d0
3467 echo "Inject failure stub on MDT0 to simulate the case that"
3468 echo "foo's name entry will be removed, but the foo's object"
3469 echo "and its linkEA are kept in the system. And then remove"
3470 echo "the parent directory."
3472 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3473 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3474 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3475 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3477 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
# The original omitted 'error' here, so an unexpectedly successful 'ls' only
# tried to execute the message as a command and the test carried on.
3478 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
3480 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3481 $START_NAMESPACE -r -A ||
3482 error "(6) Fail to start LFSCK for namespace"
3484 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3485 mdd.${MDT_DEV}.lfsck_namespace |
3486 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3488 error "(7) unexpected status"
3491 local repaired=$($SHOW_NAMESPACE |
3492 awk '/^lost_dirent_repaired/ { print $2 }')
3493 [ $repaired -eq 1 ] ||
3494 error "(8) Fail to repair lost dirent: $repaired"
3496 ls -ail $MOUNT/.lustre/lost+found/
3498 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
3499 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
3500 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
3502 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# Quote the pattern so find receives it verbatim instead of the shell
# expanding it against the current directory first.
3504 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-P-*")
3505 [ ! -z "$cname" ] ||
3506 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
3508 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# test 28: remove d1/a1 and d2/a2 while fail_loc 0x1624 is set on mds1 and
# mds2, then run namespace LFSCK on all devices with fail_loc 0x161c set on
# mds2. The first run must end "partial" on mds1 and "completed" on mds2, with
# lost_dirent_repaired 0 on mds1 and 1 on mds2. After clearing fail_loc a
# second run must complete on every MDS, with lost_dirent_repaired 1 on mds1
# and 0 on mds2.
3511 [ $MDSCOUNT -lt 2 ] &&
3512 skip "The test needs at least 2 MDTs" && return
3515 echo "The target name entry is lost. The LFSCK should insert the"
3516 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
3517 echo "the MDT (on which the orphan MDT-object resides) has ever"
3518 echo "failed to respond some name entry verification during the"
3519 echo "first stage-scanning, then the LFSCK should skip to handle"
3520 echo "orphan MDT-object on this MDT. But other MDTs should not"
3524 check_mount_and_prep
# Build two cross-MDT trees: d1 (-i 0) holding a1/a2 (-i 1), and d2 (-i 1)
# holding a1/a2 (-i 0).
3525 $LFS mkdir -i 0 $DIR/$tdir/d1
3526 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
3527 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
3529 $LFS mkdir -i 1 $DIR/$tdir/d2
3530 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
3531 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
3533 echo "Inject failure stub on MDT0 to simulate the case that"
3534 echo "d1/a1's name entry will be removed, but the d1/a1's object"
3535 echo "and its linkEA are kept in the system. And the case that"
3536 echo "d2/a2's name entry will be removed, but the d2/a2's object"
3537 echo "and its linkEA are kept in the system."
3539 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3540 do_facet mds1 $LCTL set_param fail_loc=0x1624
3541 do_facet mds2 $LCTL set_param fail_loc=0x1624
3542 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
3543 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
3544 do_facet mds1 $LCTL set_param fail_loc=0
3545 do_facet mds2 $LCTL set_param fail_loc=0
3547 cancel_lru_locks mdc
3548 cancel_lru_locks osc
3550 echo "Inject failure, to simulate the MDT0 fail to handle"
3551 echo "MDT1 LFSCK request during the first-stage scanning."
3552 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
3553 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
3555 echo "Trigger namespace LFSCK on all devices to find out orphan object"
3556 $START_NAMESPACE -r -A ||
3557 error "(3) Fail to start LFSCK for namespace"
# With the injection active, mds1 is expected to finish as "partial" while
# mds2 finishes as "completed".
3559 wait_update_facet mds1 "$LCTL get_param -n \
3560 mdd.$(facet_svc mds1).lfsck_namespace |
3561 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
3562 error "(4) mds1 is not the expected 'partial'"
3565 wait_update_facet mds2 "$LCTL get_param -n \
3566 mdd.$(facet_svc mds2).lfsck_namespace |
3567 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3568 error "(5) mds2 is not the expected 'completed'"
3571 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
3573 local repaired=$(do_facet mds1 $LCTL get_param -n \
3574 mdd.$(facet_svc mds1).lfsck_namespace |
3575 awk '/^lost_dirent_repaired/ { print $2 }')
3576 [ $repaired -eq 0 ] ||
3577 error "(6) Expect 0 fixed on mds1, but got: $repaired"
3579 repaired=$(do_facet mds2 $LCTL get_param -n \
3580 mdd.$(facet_svc mds2).lfsck_namespace |
3581 awk '/^lost_dirent_repaired/ { print $2 }')
3582 [ $repaired -eq 1 ] ||
3583 error "(7) Expect 1 fixed on mds2, but got: $repaired"
# Second run, with no failure injection set.
3585 echo "Trigger namespace LFSCK on all devices again to cleanup"
3586 $START_NAMESPACE -r -A ||
3587 error "(8) Fail to start LFSCK for namespace"
3589 for k in $(seq $MDSCOUNT); do
3590 # The LFSCK status query internal is 30 seconds. For the case
3591 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3592 # time to guarantee the status sync up.
3593 wait_update_facet mds${k} "$LCTL get_param -n \
3594 mdd.$(facet_svc mds${k}).lfsck_namespace |
3595 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3596 error "(9) MDS${k} is not the expected 'completed'"
# The per-MDS counters are now the reverse of the first run.
3599 local repaired=$(do_facet mds1 $LCTL get_param -n \
3600 mdd.$(facet_svc mds1).lfsck_namespace |
3601 awk '/^lost_dirent_repaired/ { print $2 }')
3602 [ $repaired -eq 1 ] ||
3603 error "(10) Expect 1 fixed on mds1, but got: $repaired"
3605 repaired=$(do_facet mds2 $LCTL get_param -n \
3606 mdd.$(facet_svc mds2).lfsck_namespace |
3607 awk '/^lost_dirent_repaired/ { print $2 }')
3608 [ $repaired -eq 0 ] ||
3609 error "(11) Expect 0 fixed on mds2, but got: $repaired"
3611 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# test 29a: hard-link d0/foo to d0/h1 while fail_loc 0x1625 is set on
# $SINGLEMDS, confirm stat reports 3 links, run namespace LFSCK, then require
# nlinks_repaired == 1 and a link count of 2.
3615 echo "The object's nlink attribute is larger than the object's known"
3616 echo "name entries count. The LFSCK will repair the object's nlink"
3617 echo "attribute to match the known name entries count"
3620 check_mount_and_prep
3622 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3623 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3625 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3626 echo "nlink attribute is larger than its name entries count."
3628 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
3629 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
3630 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3631 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3632 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Verify the injection took effect: foo plus h1 is two names, but stat must
# report 3 links.
3634 cancel_lru_locks mdc
3635 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3636 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
3638 echo "Trigger namespace LFSCK to repair the nlink count"
3639 $START_NAMESPACE -r -A ||
3640 error "(5) Fail to start LFSCK for namespace"
3642 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3643 mdd.${MDT_DEV}.lfsck_namespace |
3644 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3646 error "(6) unexpected status"
3649 local repaired=$($SHOW_NAMESPACE |
3650 awk '/^nlinks_repaired/ { print $2 }')
3651 [ $repaired -eq 1 ] ||
3652 error "(7) Fail to repair nlink count: $repaired"
3654 cancel_lru_locks mdc
3655 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3656 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3658 run_test 29a "LFSCK can repair bad nlink count (1)"
# test 29b: hard-link d0/foo to d0/h1 while fail_loc 0x1626 is set on
# $SINGLEMDS, confirm stat reports 1 link, run namespace LFSCK, then require
# nlinks_repaired == 1 and a link count of 2.
3662 echo "The object's nlink attribute is smaller than the object's known"
3663 echo "name entries count. The LFSCK will repair the object's nlink"
3664 echo "attribute to match the known name entries count"
3667 check_mount_and_prep
3669 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3670 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3672 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3673 echo "nlink attribute is smaller than its name entries count."
3675 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
3676 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
3677 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3678 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3679 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Verify the injection took effect: foo plus h1 is two names, but stat must
# report 1 link.
3681 cancel_lru_locks mdc
3682 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3683 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
3685 echo "Trigger namespace LFSCK to repair the nlink count"
3686 $START_NAMESPACE -r -A ||
3687 error "(5) Fail to start LFSCK for namespace"
3689 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3690 mdd.${MDT_DEV}.lfsck_namespace |
3691 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3693 error "(6) unexpected status"
3696 local repaired=$($SHOW_NAMESPACE |
3697 awk '/^nlinks_repaired/ { print $2 }')
3698 [ $repaired -eq 1 ] ||
3699 error "(7) Fail to repair nlink count: $repaired"
3701 cancel_lru_locks mdc
3702 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3703 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3705 run_test 29b "LFSCK can repair bad nlink count (2)"
# test 29c: with fail_loc 0x1627 set on $SINGLEMDS, add a third name (h2) for
# d0/foo so that stat reports 3 links while fid2path lists only 2 paths. Run
# namespace LFSCK with the fail_loc still set and require that nothing is
# changed: nlinks_repaired == 0, link count 3, fid2path still 2 paths.
3709 echo "There are too many hard links to the object, and exceeds the"
3710 echo "object's linkEA limitation, as to NOT all the known name entries"
3711 echo "will be recorded in the linkEA. Under such case, LFSCK should"
3712 echo "skip the nlink verification for this object."
3715 check_mount_and_prep
3717 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3718 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3719 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3720 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3722 echo "Inject failure stub on MDT0 to simulate the case that"
3723 echo "foo's hard links exceed the object's linkEA limitation."
# fail_loc is deliberately left set here; it is cleared only after the LFSCK
# run below has completed.
3725 #define OBD_FAIL_LFSCK_LINKEA_OVERFLOW 0x1627
3726 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1627
3727 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h2 ||
3728 error "(4) Fail to hard link to $DIR/$tdir/d0/foo"
3730 cancel_lru_locks mdc
3732 local count1=$(stat --format=%h $DIR/$tdir/d0/foo)
3733 [ $count1 -eq 3 ] || error "(5) Stat failure: $count1"
3735 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3736 $LFS fid2path $DIR $foofid
3737 local count2=$($LFS fid2path $DIR $foofid | wc -l)
# The original omitted 'error' here, so a failed injection only tried to
# execute the message as a command and the test carried on.
3738 [ $count2 -eq 2 ] || error "(6) Fail to inject error: $count2"
3740 echo "Trigger namespace LFSCK to repair the nlink count"
3741 $START_NAMESPACE -r -A ||
3742 error "(7) Fail to start LFSCK for namespace"
3744 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3745 mdd.${MDT_DEV}.lfsck_namespace |
3746 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3748 error "(8) unexpected status"
3751 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3752 local repaired=$($SHOW_NAMESPACE |
3753 awk '/^nlinks_repaired/ { print $2 }')
3754 [ $repaired -eq 0 ] ||
3755 error "(9) Repair nlink count unexpcetedly: $repaired"
3757 cancel_lru_locks mdc
3759 count1=$(stat --format=%h $DIR/$tdir/d0/foo)
3760 [ $count1 -eq 3 ] || error "(10) Stat failure: $count1"
3762 count2=$($LFS fid2path $DIR $foofid | wc -l)
3763 [ $count2 -eq 2 ] ||
3764 error "(11) Repaired something unexpectedly: $count2"
3766 run_test 29c "Not verify nlink attr if hark links exceed linkEA limitation"
# test 30: ldiskfs only. Create foo/d0 with fail_loc 0x161d set, populate it,
# then remove d0/d1, d0/f1, d0 and foo/f0 with fail_loc 0x1624 set. Unmount
# the client, stop $SINGLEMDS, run e2fsck -y on the MDT device, restart the
# MDT and run namespace LFSCK. Require local_lost_found_moved >= 4, that
# foo/f0 is back, and that an entry matching *-*-D-* under
# .lustre/lost+found/MDT0000/ contains d1 and f1.
3769 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3770 skip "Only support backend /lost+found for ldiskfs" && return
3773 echo "The namespace LFSCK will move the orphans from backend"
3774 echo "/lost+found directory to normal client visible namespace"
3775 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
3778 check_mount_and_prep
3780 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
3781 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f1"
3783 echo "Inject failure stub on MDT0 to simulate the case that"
3784 echo "directory d0 has no linkEA entry, then the LFSCK will"
3785 echo "move it into .lustre/lost+found/MDTxxxx/ later."
3787 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
3788 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
3789 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
3790 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3792 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
3793 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
3795 echo "Inject failure stub on MDT0 to simulate the case that the"
3796 echo "object's name entry will be removed, but not destroy the"
3797 echo "object. Then backend e2fsck will handle it as orphan and"
3798 echo "add them into the backend /lost+found directory."
3800 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3801 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3802 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
3803 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
3804 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
3805 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
3806 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Take the client and the MDT offline so e2fsck can run on the MDT device.
3808 umount_client $MOUNT || error "(10) Fail to stop client!"
3810 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
3813 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
3814 error "(12) Fail to run e2fsck"
3816 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
3817 error "(13) Fail to start MDT0"
3819 echo "Trigger namespace LFSCK to recover backend orphans"
3820 $START_NAMESPACE -r -A ||
3821 error "(14) Fail to start LFSCK for namespace"
3823 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3824 mdd.${MDT_DEV}.lfsck_namespace |
3825 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3827 error "(15) unexpected status"
# Four objects were removed above (d1, f1, d0, f0), hence at least 4 moves.
3830 local repaired=$($SHOW_NAMESPACE |
3831 awk '/^local_lost_found_moved/ { print $2 }')
3832 [ $repaired -ge 4 ] ||
3833 error "(16) Fail to recover backend orphans: $repaired"
3835 mount_client $MOUNT || error "(17) Fail to start client!"
# The original omitted 'error' here, so a missing f0 only tried to execute
# the message as a command and the test carried on.
3837 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
3839 ls -ail $MOUNT/.lustre/lost+found/
3841 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
3842 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3843 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3845 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# Quote the pattern so find receives it verbatim instead of the shell
# expanding it against the current directory first.
3847 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-*-D-*")
3848 [ ! -z "$cname" ] || error "(20) d0 is not recovered"
3850 stat ${cname}/d1 || error "(21) d0 is not recovered"
3851 stat ${cname}/f1 || error "(22) f1 is not recovered"
3853 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test_31a: name entries created under a striped directory while the client
# has fail_loc 0x1628 (OBD_FAIL_LFSCK_BAD_NAME_HASH) with fail_val=0 set must
# be repaired by the namespace LFSCK.
#
# After "$START_NAMESPACE -r -A" completes, name_hash_repaired reported by
# $SHOW_NAMESPACE must be at least 1, and after a client remount every d<i>
# must be stat-able and removable, as must the striped directory itself.
#
# Skipped when fewer than 2 MDTs are configured.
test_31a() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-first"
	echo "shard, but inserted into the first shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=0
	createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state so that the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(4) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^name_hash_repaired/ { print $2 }')
	# A missing counter is treated as 0 instead of breaking the test
	# expression with an empty operand.
	[ "${repaired:-0}" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	# Remount the client before looking the entries up again.
	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	# Keep the loop index out of the global scope.
	local i
	for ((i = 0; i < $MDSCOUNT; i++)); do
		stat $DIR/$tdir/striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $DIR/$tdir/striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $DIR/$tdir/striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test_31b: same scenario as the bad-name-hash test with fail_val=0, but the
# client fail_loc 0x1628 (OBD_FAIL_LFSCK_BAD_NAME_HASH) is set with
# fail_val=1, and both the LFSCK status and the name_hash_repaired counter
# are read from facet mds2.
#
# After the LFSCK completes, name_hash_repaired must be at least 1, and after
# a client remount every d<i> must be stat-able and removable, as must the
# striped directory itself.
#
# Skipped when fewer than 2 MDTs are configured.
test_31b() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-second"
	echo "shard, but inserted into the secod shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=1
	createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(4) unexpected status"

	local repaired
	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^name_hash_repaired/ { print $2 }')
	# A missing counter is treated as 0 instead of breaking the test
	# expression with an empty operand.
	[ "${repaired:-0}" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	# Remount the client before looking the entries up again.
	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	# Keep the loop index out of the global scope.
	local i
	for ((i = 0; i < $MDSCOUNT; i++)); do
		stat $DIR/$tdir/striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $DIR/$tdir/striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $DIR/$tdir/striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test_31c: a striped directory is created while fail_loc 0x1629
# (OBD_FAIL_LFSCK_LOST_MASTER_LMV) is set on $SINGLEMDS, and nothing is
# created under it afterwards.
#
# After "$START_NAMESPACE -r -A" completes, striped_dirs_repaired reported
# by $SHOW_NAMESPACE must be exactly 1; after a client remount the directory
# must list as empty and be removable.
#
# Skipped when fewer than 2 MDTs are configured.
test_31c() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If nobody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should re-generate the master LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to re-generate master LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state so that the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_dirs_repaired/ { print $2 }')
	# A missing counter is treated as 0 instead of breaking the test
	# expression with an empty operand.
	[ "${repaired:-0}" -eq 1 ] ||
		error "(4) Fail to re-generate master LMV EA: $repaired"

	# Remount the client before listing the directory.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	local empty=$(ls $DIR/$tdir/striped_dir/)
	[ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"

	rmdir $DIR/$tdir/striped_dir ||
		error "(8) Fail to remove the striped directory after LFSCK"
}
run_test 31c "Re-generate the lost master LMV EA for striped directory"
# test_31d: a striped directory is created while fail_loc 0x1629
# (OBD_FAIL_LFSCK_LOST_MASTER_LMV) is set on $SINGLEMDS, and a file is then
# created under it after a client remount.
#
# After "$START_NAMESPACE -r -A" completes, striped_dirs_repaired reported
# by $SHOW_NAMESPACE must be 0, the file created earlier must still be
# stat-able, and creating another file must fail. The directory is then
# released with "chattr -i" and removed.
#
# Skipped when fewer than 2 MDTs are configured.
test_31d() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If somebody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should NOT re-generate the master LMV EA, instead, it should"
	echo "change the broken striped dirctory as read-only to prevent"
	echo "further damage"

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0

	umount_client $MOUNT || error "(2) umount failed"
	mount_client $MOUNT || error "(3) mount failed"

	touch $DIR/$tdir/striped_dir/dummy ||
		error "(4) Fail to touch under broken striped directory"

	echo "Trigger namespace LFSCK to find out the inconsistency"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state so that the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(6) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_dirs_repaired/ { print $2 }')
	# A missing counter is treated as 0 instead of breaking the test
	# expression with an empty operand.
	[ "${repaired:-0}" -eq 0 ] ||
		error "(7) Re-generate master LMV EA unexpected: $repaired"

	stat $DIR/$tdir/striped_dir/dummy ||
		error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"

	# Inverted check: a successful create is the failure case here.
	touch $DIR/$tdir/striped_dir/foo &&
		error "(9) The broken striped directory should be read-only"

	chattr -i $DIR/$tdir/striped_dir ||
		error "(10) Fail to chattr on the broken striped directory"

	rmdir $DIR/$tdir/striped_dir ||
		error "(11) Fail to remove the striped directory after LFSCK"
}
run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test_31e: a striped directory is created while fail_loc 0x162a
# (OBD_FAIL_LFSCK_LOST_SLAVE_LMV) with fail_val=0 is set on $SINGLEMDS.
#
# After "$START_NAMESPACE -r -A" completes, striped_shards_repaired reported
# by $SHOW_NAMESPACE must be exactly 1 and the striped directory must be
# removable.
#
# Skipped when fewer than 2 MDTs are configured.
test_31e() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on the same MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state so that the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_shards_repaired/ { print $2 }')
	# A missing counter is treated as 0 instead of breaking the test
	# expression with an empty operand.
	[ "${repaired:-0}" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir $DIR/$tdir/striped_dir ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test_31f: a striped directory is created while fail_loc 0x162a
# (OBD_FAIL_LFSCK_LOST_SLAVE_LMV) with fail_val=1 is set on $SINGLEMDS.
#
# Both the LFSCK status and the striped_shards_repaired counter are read
# from facet mds2. After the LFSCK completes the counter must be exactly 1
# and the striped directory must be removable.
#
# Skipped when fewer than 2 MDTs are configured.
test_31f() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on differnt MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(3) unexpected status"

	local repaired
	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^striped_shards_repaired/ { print $2 }')
	# A missing counter is treated as 0 instead of breaking the test
	# expression with an empty operand.
	[ "${repaired:-0}" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir $DIR/$tdir/striped_dir ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test_31g: a striped directory is created while fail_loc 0x162b
# (OBD_FAIL_LFSCK_BAD_SLAVE_LMV) with fail_val=0 is set on $SINGLEMDS.
#
# After "$START_NAMESPACE -r -A" completes, striped_shards_repaired reported
# by $SHOW_NAMESPACE must be exactly 1. After a client remount a file must
# be creatable and removable under the directory, and the directory itself
# must be removable.
#
# Skipped when fewer than 2 MDTs are configured.
test_31g() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the stripe index in the slave LMV EA is"
	echo "corrupted. The LFSCK should repair the slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave LMV EA on the first shard of the striped directory"
	echo "claims the same index as the second shard claims"

	#define OBD_FAIL_LFSCK_BAD_SLAVE_LMV	0x162b
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state so that the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_shards_repaired/ { print $2 }')
	# A missing counter is treated as 0 instead of breaking the test
	# expression with an empty operand.
	[ "${repaired:-0}" -eq 1 ] ||
		error "(4) Fail to repair slave LMV EA: $repaired"

	# Remount the client before using the directory again.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch $DIR/$tdir/striped_dir/foo ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f $DIR/$tdir/striped_dir/foo ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir $DIR/$tdir/striped_dir ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31g "Repair the corrupted slave LMV EA"
# test_31h: a striped directory is created while fail_loc 0x162c
# (OBD_FAIL_LFSCK_BAD_SLAVE_NAME) with fail_val=0 is set on $SINGLEMDS.
#
# After "$START_NAMESPACE -r -A" completes, dirent_repaired reported by
# $SHOW_NAMESPACE must be exactly 1. After a client remount a file must be
# creatable and removable under the directory, and the directory itself
# must be removable.
#
# Skipped when fewer than 2 MDTs are configured.
test_31h() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For some reason, the shard's name entry in the striped"
	echo "directory may be corrupted. The LFSCK should repair the"
	echo "bad shard's name entry."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "first shard's name entry in the striped directory claims"
	echo "the same index as the second shard's name entry claims."

	#define OBD_FAIL_LFSCK_BAD_SLAVE_NAME	0x162c
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the shard's name entry"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state so that the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^dirent_repaired/ { print $2 }')
	# A missing counter is treated as 0 instead of breaking the test
	# expression with an empty operand.
	[ "${repaired:-0}" -eq 1 ] ||
		error "(4) Fail to repair shard's name entry: $repaired"

	# Remount the client before using the directory again.
	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch $DIR/$tdir/striped_dir/foo ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f $DIR/$tdir/striped_dir/foo ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir $DIR/$tdir/striped_dir ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31h "Repair the corrupted shard's name entry"
# Put back the MDS/OST size and OST count that were saved at the top of the
# script before the smaller test-only values were applied.
MDSSIZE=$SAVED_MDSSIZE
OSTSIZE=$SAVED_OSTSIZE
OSTCOUNT=$SAVED_OSTCOUNT
4268 # cleanup the system at last