3 # Run selected tests by setting ONLY, or by passing them as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
10 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
11 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
12 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# Locate the Lustre tree: keep a caller-provided LUSTRE, otherwise use the
# parent directory of the directory containing this script.
14 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
# Source the shared test framework from that tree.
15 . $LUSTRE/tests/test-framework.sh
# Source the test configuration; CONFIG defaults to (and is set to)
# $LUSTRE/tests/cfg/$NAME.sh when it is unset or empty.
17 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Leave quietly (exit status 0) when require_dsh_mds does not succeed.
20 require_dsh_mds || exit 0
# Remember the configured sizes and OST count before this script adjusts
# them (OSTCOUNT is capped further down).
# NOTE(review): presumably restored at the end of the script - confirm.
22 SAVED_MDSSIZE=${MDSSIZE}
23 SAVED_OSTSIZE=${OSTSIZE}
24 SAVED_OSTCOUNT=${OSTCOUNT}
25 # Use small MDS and OST sizes to speed up formatting.
26 # Do not make MDSSIZE/OSTSIZE too small: they affect the default journal size.
29 # There is no need for many OSTs; cap the count to reduce the format/start/stop overhead.
30 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
32 # build up a clean test environment.
36 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
37 skip "Need MDS version at least 2.3.60" && check_and_cleanup_lustre &&
# Extend ALWAYS_EXCEPT according to the server versions under test:
# test 2c is excluded when the MDS version code is <= 2.4.90.
40 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
41 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
# Tests 11 through 21 are excluded when the ost1 version code is < 2.5.55.
43 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
44 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
# Add "lfsck" to the debug setting; output is discarded and a failure of
# set_param is deliberately ignored (|| true).
48 $LCTL set_param debug=+lfsck > /dev/null || true
# Target names built from the filesystem name: the MDT0000 and OST0000
# devices addressed by the LFSCK commands below.
50 MDT_DEV="${FSNAME}-MDT0000"
51 OST_DEV="${FSNAME}-OST0000"
# Device name for the MDS facet: mdsdevname is given $SINGLEMDS with every
# "mds" substring removed.
52 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# The variables below hold command lines as plain strings. The tests expand
# them unquoted so they word-split into a command, optionally followed by
# extra options (for example "$START_NAMESPACE -r" or "-s <speed>").
#
# Start LFSCK of type "namespace" on the MDT, run on the $SINGLEMDS facet.
53 START_NAMESPACE="do_facet $SINGLEMDS \
54 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
# Start LFSCK of type "layout" on the MDT, run on the $SINGLEMDS facet.
55 START_LAYOUT="do_facet $SINGLEMDS \
56 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
# Start LFSCK of type "layout" on the OST, run on the ost1 facet.
57 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# Stop LFSCK on the MDT.
58 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Print the mdd.<MDT>.lfsck_namespace parameter; the tests pipe this into
# awk to pick out fields such as status and flags.
59 SHOW_NAMESPACE="do_facet $SINGLEMDS \
60 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
# Print the mdd.<MDT>.lfsck_layout parameter.
61 SHOW_LAYOUT="do_facet $SINGLEMDS \
62 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
# Print the obdfilter.<OST>.lfsck_layout parameter on ost1.
63 SHOW_LAYOUT_ON_OST="do_facet ost1 \
64 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount options passed to "start" for server targets: the second form adds
# "noscrub" (the tests announce it as starting "with disabling OI scrub").
65 MOUNT_OPTS_SCRUB="-o user_xattr"
66 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
75 echo "preparing... $nfiles * $ndirs files will be created $(date)."
76 if [ ! -z $igif ]; then
77 #define OBD_FAIL_FID_IGIF 0x1504
78 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
81 cp $LUSTRE/tests/*.sh $DIR/$tdir/
82 if [ $ndirs -gt 0 ]; then
83 createmany -d $DIR/$tdir/d $ndirs
84 createmany -m $DIR/$tdir/f $ndirs
85 if [ $nfiles -gt 0 ]; then
86 for ((i = 0; i < $ndirs; i++)); do
87 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
88 /dev/null || error "createmany $nfiles"
91 createmany -d $DIR/$tdir/e $ndirs
94 if [ ! -z $igif ]; then
95 touch $DIR/$tdir/dummy
96 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
99 echo "prepared $(date)."
105 #define OBD_FAIL_LFSCK_DELAY1 0x1600
106 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
107 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
109 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
111 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
112 [ "$STATUS" == "scanning-phase1" ] ||
113 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
115 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
117 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
118 [ "$STATUS" == "stopped" ] ||
119 error "(6) Expect 'stopped', but got '$STATUS'"
121 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
123 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
124 [ "$STATUS" == "scanning-phase1" ] ||
125 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
127 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
128 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
129 mdd.${MDT_DEV}.lfsck_namespace |
130 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
132 error "(9) unexpected status"
135 local repaired=$($SHOW_NAMESPACE |
136 awk '/^updated_phase1/ { print $2 }')
137 [ $repaired -eq 0 ] ||
138 error "(10) Expect nothing to be repaired, but got: $repaired"
140 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
141 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
142 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
143 mdd.${MDT_DEV}.lfsck_namespace |
144 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
146 error "(12) unexpected status"
149 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
150 [ $((scanned1 + 1)) -eq $scanned2 ] ||
151 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
153 echo "stopall, should NOT crash LU-3649"
154 stopall || error "(14) Fail to stopall"
156 run_test 0 "Control LFSCK manually"
159 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
160 skip "OI Scrub not implemented for ZFS" && return
164 #define OBD_FAIL_FID_INDIR 0x1501
165 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
166 touch $DIR/$tdir/dummy
168 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
170 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
171 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
172 mdd.${MDT_DEV}.lfsck_namespace |
173 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
175 error "(4) unexpected status"
178 local repaired=$($SHOW_NAMESPACE |
179 awk '/^dirent_repaired/ { print $2 }')
180 # for interop with old server
181 [ -z "$repaired" ] &&
182 repaired=$($SHOW_NAMESPACE |
183 awk '/^updated_phase1/ { print $2 }')
185 [ $repaired -eq 1 ] ||
186 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
188 mount_client $MOUNT || error "(6) Fail to start client!"
190 #define OBD_FAIL_FID_LOOKUP 0x1505
191 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
192 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
194 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
196 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
200 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
201 skip "OI Scrub not implemented for ZFS" && return
205 #define OBD_FAIL_FID_INLMA 0x1502
206 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
207 touch $DIR/$tdir/dummy
209 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
211 #define OBD_FAIL_FID_NOLMA 0x1506
212 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
213 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
214 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
215 mdd.${MDT_DEV}.lfsck_namespace |
216 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
218 error "(4) unexpected status"
221 local repaired=$($SHOW_NAMESPACE |
222 awk '/^dirent_repaired/ { print $2 }')
223 # for interop with old server
224 [ -z "$repaired" ] &&
225 repaired=$($SHOW_NAMESPACE |
226 awk '/^updated_phase1/ { print $2 }')
228 [ $repaired -eq 1 ] ||
229 error "(5) Fail to repair missed FID-in-LMA: $repaired"
231 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
232 mount_client $MOUNT || error "(6) Fail to start client!"
234 #define OBD_FAIL_FID_LOOKUP 0x1505
235 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
236 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
238 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
240 run_test 1b "LFSCK can find out and repair missed FID-in-LMA"
245 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
246 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
247 touch $DIR/$tdir/dummy
249 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
251 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
252 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
253 mdd.${MDT_DEV}.lfsck_namespace |
254 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
256 error "(4) unexpected status"
259 local repaired=$($SHOW_NAMESPACE |
260 awk '/^linkea_repaired/ { print $2 }')
261 # for interop with old server
262 [ -z "$repaired" ] &&
263 repaired=$($SHOW_NAMESPACE |
264 awk '/^updated_phase1/ { print $2 }')
266 [ $repaired -eq 1 ] ||
267 error "(5) Fail to repair crashed linkEA: $repaired"
269 mount_client $MOUNT || error "(6) Fail to start client!"
271 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
272 error "(7) Fail to stat $DIR/$tdir/dummy"
274 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
275 local dummyname=$($LFS fid2path $DIR $dummyfid)
276 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
277 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
279 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
285 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
286 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
287 touch $DIR/$tdir/dummy
289 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
291 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
292 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
293 mdd.${MDT_DEV}.lfsck_namespace |
294 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
296 error "(4) unexpected status"
299 local repaired=$($SHOW_NAMESPACE |
300 awk '/^updated_phase2/ { print $2 }')
301 [ $repaired -eq 1 ] ||
302 error "(5) Fail to repair crashed linkEA: $repaired"
304 mount_client $MOUNT || error "(6) Fail to start client!"
306 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
307 error "(7) Fail to stat $DIR/$tdir/dummy"
309 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
310 local dummyname=$($LFS fid2path $DIR $dummyfid)
311 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
312 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
314 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
320 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
321 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
322 touch $DIR/$tdir/dummy
324 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
326 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
327 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
328 mdd.${MDT_DEV}.lfsck_namespace |
329 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
331 error "(4) unexpected status"
334 local repaired=$($SHOW_NAMESPACE |
335 awk '/^updated_phase2/ { print $2 }')
336 [ $repaired -eq 1 ] ||
337 error "(5) Fail to repair crashed linkEA: $repaired"
339 mount_client $MOUNT || error "(6) Fail to start client!"
341 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
342 error "(7) Fail to stat $DIR/$tdir/dummy"
344 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
345 local dummyname=$($LFS fid2path $DIR $dummyfid)
346 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
347 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
349 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
353 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
354 skip "OI Scrub not implemented for ZFS" && return
357 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
358 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
360 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
361 echo "start $SINGLEMDS with disabling OI scrub"
362 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
363 error "(2) Fail to start MDS!"
365 #define OBD_FAIL_LFSCK_DELAY2 0x1601
366 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
367 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
368 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
369 mdd.${MDT_DEV}.lfsck_namespace |
370 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
372 error "(5) unexpected status"
375 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
376 [ "$STATUS" == "scanning-phase1" ] ||
377 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
379 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
380 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
381 mdd.${MDT_DEV}.lfsck_namespace |
382 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
384 error "(7) unexpected status"
# After a completed scan the "flags" field must be empty. FLAGS is kept
# function-local so it does not leak into the other tests.
387 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
388 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
390 local repaired=$($SHOW_NAMESPACE |
391 awk '/^dirent_repaired/ { print $2 }')
392 # for interop with old server
393 [ -z "$repaired" ] &&
394 repaired=$($SHOW_NAMESPACE |
395 awk '/^updated_phase1/ { print $2 }')
397 [ $repaired -ge 9 ] ||
398 error "(9) Fail to re-generate FID-in-dirent: $repaired"
400 mount_client $MOUNT || error "(10) Fail to start client!"
402 #define OBD_FAIL_FID_LOOKUP 0x1505
403 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
404 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
405 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
407 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
411 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
412 skip "OI Scrub not implemented for ZFS" && return
415 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
416 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
418 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
419 echo "start $SINGLEMDS with disabling OI scrub"
420 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
421 error "(2) Fail to start MDS!"
423 #define OBD_FAIL_LFSCK_DELAY2 0x1601
424 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
425 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
426 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
427 mdd.${MDT_DEV}.lfsck_namespace |
428 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
430 error "(5) unexpected status"
433 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
434 [ "$STATUS" == "scanning-phase1" ] ||
435 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
437 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
438 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
439 mdd.${MDT_DEV}.lfsck_namespace |
440 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
442 error "(7) unexpected status"
# After a completed scan the "flags" field must be empty. FLAGS is kept
# function-local so it does not leak into the other tests.
445 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
446 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
448 local repaired=$($SHOW_NAMESPACE |
449 awk '/^dirent_repaired/ { print $2 }')
450 # for interop with old server
451 [ -z "$repaired" ] &&
452 repaired=$($SHOW_NAMESPACE |
453 awk '/^updated_phase1/ { print $2 }')
455 [ $repaired -ge 2 ] ||
456 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
458 mount_client $MOUNT || error "(10) Fail to start client!"
460 #define OBD_FAIL_FID_LOOKUP 0x1505
461 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
462 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
464 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
466 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
467 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
468 local dummyname=$($LFS fid2path $DIR $dummyfid)
469 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
470 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
472 run_test 5 "LFSCK can handle IGIF object upgrading"
477 #define OBD_FAIL_LFSCK_DELAY1 0x1600
478 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
479 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
481 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
482 [ "$STATUS" == "scanning-phase1" ] ||
483 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
485 # Sleep 3 sec to guarantee at least one object processed by LFSCK
487 # Fail the LFSCK to guarantee there is at least one checkpoint
488 #define OBD_FAIL_LFSCK_FATAL1 0x1608
489 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
490 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
491 mdd.${MDT_DEV}.lfsck_namespace |
492 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
494 error "(4) unexpected status"
497 local POS0=$($SHOW_NAMESPACE |
498 awk '/^last_checkpoint_position/ { print $2 }' |
501 #define OBD_FAIL_LFSCK_DELAY1 0x1600
502 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
503 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
505 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
506 [ "$STATUS" == "scanning-phase1" ] ||
507 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
509 local POS1=$($SHOW_NAMESPACE |
510 awk '/^latest_start_position/ { print $2 }' |
512 [[ $POS0 -lt $POS1 ]] ||
513 error "(7) Expect larger than: $POS0, but got $POS1"
515 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
516 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
517 mdd.${MDT_DEV}.lfsck_namespace |
518 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
520 error "(8) unexpected status"
523 run_test 6a "LFSCK resumes from last checkpoint (1)"
528 #define OBD_FAIL_LFSCK_DELAY2 0x1601
529 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
530 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
532 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
533 [ "$STATUS" == "scanning-phase1" ] ||
534 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
536 # Sleep 5 sec to guarantee that we are in the directory scanning
538 # Fail the LFSCK to guarantee there is at least one checkpoint
539 #define OBD_FAIL_LFSCK_FATAL2 0x1609
540 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
541 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
542 mdd.${MDT_DEV}.lfsck_namespace |
543 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
545 error "(4) unexpected status"
548 local O_POS0=$($SHOW_NAMESPACE |
549 awk '/^last_checkpoint_position/ { print $2 }' |
552 local D_POS0=$($SHOW_NAMESPACE |
553 awk '/^last_checkpoint_position/ { print $4 }')
555 #define OBD_FAIL_LFSCK_DELAY2 0x1601
556 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
557 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
559 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
560 [ "$STATUS" == "scanning-phase1" ] ||
561 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
563 local O_POS1=$($SHOW_NAMESPACE |
564 awk '/^latest_start_position/ { print $2 }' |
566 local D_POS1=$($SHOW_NAMESPACE |
567 awk '/^latest_start_position/ { print $4 }')
569 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
570 [[ $O_POS0 -lt $O_POS1 ]] ||
571 error "(7.1) $O_POS1 is not larger than $O_POS0"
573 [[ $D_POS0 -lt $D_POS1 ]] ||
574 error "(7.2) $D_POS1 is not larger than $D_POS0"
577 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
578 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
579 mdd.${MDT_DEV}.lfsck_namespace |
580 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
582 error "(8) unexpected status"
585 run_test 6b "LFSCK resumes from last checkpoint (2)"
592 #define OBD_FAIL_LFSCK_DELAY2 0x1601
593 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
594 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
596 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
597 [ "$STATUS" == "scanning-phase1" ] ||
598 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
600 # Sleep 3 sec to guarantee at least one object processed by LFSCK
602 echo "stop $SINGLEMDS"
603 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
605 echo "start $SINGLEMDS"
606 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
607 error "(5) Fail to start MDS!"
609 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
610 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
611 mdd.${MDT_DEV}.lfsck_namespace |
612 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
614 error "(6) unexpected status"
617 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
623 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
624 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
625 for ((i = 0; i < 20; i++)); do
626 touch $DIR/$tdir/dummy${i}
629 #define OBD_FAIL_LFSCK_DELAY3 0x1602
630 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
631 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
632 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
633 mdd.${MDT_DEV}.lfsck_namespace |
634 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
636 error "(4) unexpected status"
639 echo "stop $SINGLEMDS"
640 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
642 echo "start $SINGLEMDS"
643 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
644 error "(6) Fail to start MDS!"
646 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
647 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
648 mdd.${MDT_DEV}.lfsck_namespace |
649 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
651 error "(7) unexpected status"
654 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
659 formatall > /dev/null
665 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
666 [ "$STATUS" == "init" ] ||
667 error "(2) Expect 'init', but got '$STATUS'"
669 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
670 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
671 mkdir $DIR/$tdir/crashed
673 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
674 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
675 for ((i = 0; i < 5; i++)); do
676 touch $DIR/$tdir/dummy${i}
679 umount_client $MOUNT || error "(3) Fail to stop client!"
681 #define OBD_FAIL_LFSCK_DELAY2 0x1601
682 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
683 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
685 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
686 [ "$STATUS" == "scanning-phase1" ] ||
687 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
689 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
691 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
692 [ "$STATUS" == "stopped" ] ||
693 error "(7) Expect 'stopped', but got '$STATUS'"
695 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
697 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
698 [ "$STATUS" == "scanning-phase1" ] ||
699 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
701 #define OBD_FAIL_LFSCK_FATAL2 0x1609
702 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
703 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
704 mdd.${MDT_DEV}.lfsck_namespace |
705 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
707 error "(10) unexpected status"
710 #define OBD_FAIL_LFSCK_DELAY1 0x1600
711 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
712 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
714 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
715 [ "$STATUS" == "scanning-phase1" ] ||
716 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
718 #define OBD_FAIL_LFSCK_CRASH 0x160a
719 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
722 echo "stop $SINGLEMDS"
723 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
725 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
726 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
728 echo "start $SINGLEMDS"
729 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
730 error "(14) Fail to start MDS!"
732 local timeout=$(max_recovery_time)
735 while [ $timer -lt $timeout ]; do
736 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
737 mdt.${MDT_DEV}.recovery_status |
738 awk '/^status/ { print \\\$2 }'")
739 [ "$STATUS" != "RECOVERING" ] && break;
744 [ $timer != $timeout ] ||
745 error "(14.1) recovery timeout"
747 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
748 [ "$STATUS" == "crashed" ] ||
749 error "(15) Expect 'crashed', but got '$STATUS'"
751 #define OBD_FAIL_LFSCK_DELAY2 0x1601
752 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
753 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
755 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
756 [ "$STATUS" == "scanning-phase1" ] ||
757 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
759 echo "stop $SINGLEMDS"
760 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
762 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
763 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
765 echo "start $SINGLEMDS"
766 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
767 error "(19) Fail to start MDS!"
770 while [ $timer -lt $timeout ]; do
771 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
772 mdt.${MDT_DEV}.recovery_status |
773 awk '/^status/ { print \\\$2 }'")
774 [ "$STATUS" != "RECOVERING" ] && break;
779 [ $timer != $timeout ] ||
780 error "(19.1) recovery timeout"
782 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
783 [ "$STATUS" == "paused" ] ||
784 error "(20) Expect 'paused', but got '$STATUS'"
786 #define OBD_FAIL_LFSCK_DELAY3 0x1602
787 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
789 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
790 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
791 mdd.${MDT_DEV}.lfsck_namespace |
792 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
794 error "(22) unexpected status"
797 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
798 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
799 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
801 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
802 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
803 mdd.${MDT_DEV}.lfsck_namespace |
804 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
806 error "(24) unexpected status"
809 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
810 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
812 run_test 8 "LFSCK state machine"
815 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
816 skip "Testing on UP system, the speed may be inaccurate."
822 local BASE_SPEED1=100
824 $START_NAMESPACE -r -s $BASE_SPEED1 || error "(3) Fail to start LFSCK!"
827 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
828 [ "$STATUS" == "scanning-phase1" ] ||
829 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
831 local SPEED=$($SHOW_NAMESPACE |
832 awk '/^average_speed_phase1/ { print $2 }')
834 # The measured run time may be off, normally by less than 2 seconds.
835 # On top of that, allow another 20% for scheduling error.
837 # MAX_MARGIN = 1.2 = 12 / 10
838 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
839 RUN_TIME1 * 12 / 10))
840 [ $SPEED -lt $MAX_SPEED ] ||
841 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
844 local BASE_SPEED2=300
846 do_facet $SINGLEMDS \
847 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
850 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase1/ { print $2 }')
851 # MIN_MARGIN = 0.8 = 8 / 10
852 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
853 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
854 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
855 # Account for slow ZFS performance - LU-4934
# The lower speed bound is not enforced when the MDS backend is ZFS.
# The fstype is a string, so it must be compared with '=='; '-eq' is an
# integer-only operator and makes '[' fail for every fstype, which turned
# the exemption into dead code.
856 [ $SPEED -gt $MIN_SPEED ] || [ "$(facet_fstype $SINGLEMDS)" == "zfs" ] ||
857 error "(5) Got speed $SPEED, expected more than $MIN_SPEED"
859 # MAX_MARGIN = 1.2 = 12 / 10
860 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
861 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
862 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
863 [ $SPEED -lt $MAX_SPEED ] ||
864 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
866 do_facet $SINGLEMDS \
867 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
869 wait_update_facet $SINGLEMDS \
870 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace|\
871 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
872 error "(7) Failed to get expected 'completed'"
874 run_test 9a "LFSCK speed control (1)"
877 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
878 skip "Testing on UP system, the speed may be inaccurate."
884 echo "Preparing another 50 * 50 files (with error) at $(date)."
885 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
886 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
887 createmany -d $DIR/$tdir/d 50
888 createmany -m $DIR/$tdir/f 50
889 for ((i = 0; i < 50; i++)); do
890 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
893 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
894 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
895 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
896 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
897 mdd.${MDT_DEV}.lfsck_namespace |
898 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
900 error "(5) unexpected status"
903 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
904 echo "Prepared at $(date)."
908 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
911 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
912 [ "$STATUS" == "scanning-phase2" ] ||
913 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
915 local SPEED=$($SHOW_NAMESPACE |
916 awk '/^average_speed_phase2/ { print $2 }')
917 # The measured run time may be off, normally by less than 2 seconds.
918 # On top of that, allow another 20% for scheduling error.
920 # MAX_MARGIN = 1.2 = 12 / 10
921 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
922 RUN_TIME1 * 12 / 10))
923 [ $SPEED -lt $MAX_SPEED ] ||
924 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
927 local BASE_SPEED2=150
929 do_facet $SINGLEMDS \
930 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
933 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
934 # MIN_MARGIN = 0.8 = 8 / 10
935 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
936 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
937 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# The lower speed bound is not enforced when the MDS backend is ZFS.
# The fstype is a string, so it must be compared with '=='; '-eq' is an
# integer-only operator and makes '[' fail for every fstype, which turned
# the exemption into dead code.
938 [ $SPEED -gt $MIN_SPEED ] || [ "$(facet_fstype $SINGLEMDS)" == "zfs" ] ||
939 error "(9) Got speed $SPEED, expected more than $MIN_SPEED"
941 # MAX_MARGIN = 1.2 = 12 / 10
942 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
943 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
944 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
945 [ $SPEED -lt $MAX_SPEED ] ||
946 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
948 do_facet $SINGLEMDS \
949 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
950 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
951 mdd.${MDT_DEV}.lfsck_namespace |
952 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
954 error "(11) unexpected status"
957 run_test 9b "LFSCK speed control (2)"
961 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
962 skip "lookup(..)/linkea on ZFS issue" && return
966 echo "Preparing more files with error at $(date)."
967 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
968 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
970 for ((i = 0; i < 1000; i = $((i+2)))); do
971 mkdir -p $DIR/$tdir/d${i}
972 touch $DIR/$tdir/f${i}
973 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
976 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
977 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
979 for ((i = 1; i < 1000; i = $((i+2)))); do
980 mkdir -p $DIR/$tdir/d${i}
981 touch $DIR/$tdir/f${i}
982 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
985 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
986 echo "Prepared at $(date)."
988 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
991 mount_client $MOUNT || error "(3) Fail to start client!"
993 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
996 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
997 [ "$STATUS" == "scanning-phase1" ] ||
998 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
1000 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1002 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1004 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1006 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1008 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1010 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1012 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1014 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1015 error "(14) Fail to softlink!"
1017 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1018 [ "$STATUS" == "scanning-phase1" ] ||
1019 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
1021 do_facet $SINGLEMDS \
1022 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1023 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1024 mdd.${MDT_DEV}.lfsck_namespace |
1025 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1027 error "(16) unexpected status"
1030 run_test 10 "System is available during LFSCK scanning"
# ost_remove_lastid: remove the LAST_ID file of one object index from an
# OST's locally mounted backing filesystem.
#
# Uses ${ost} (OST number; the facet is "ost${ost}") and ${idx} (the
# sub-directory under O/ on the OST mount point).  The caller in this file
# invokes it as "ost_remove_lastid 1 0" -- presumably ost=$1 and idx=$2;
# confirm against the function's local declarations.
#
# Returns 1 if mount_fstype fails and 2 if unmount_fstype fails.
1033 ost_remove_lastid() {
1036 local rcmd="do_facet ost${ost}"
1038 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1040 # step 1: local mount
1041 mount_fstype ost${ost} || return 1
1042 # step 2: remove the specified LAST_ID
# The brace expansion also removes O/${idx}/d0/0 together with LAST_ID.
1043 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
# Unmount again; a failure here is reported to the caller as status 2.
1045 unmount_fstype ost${ost} || return 2
# test_11a: remove LAST_ID from ost1, start layout LFSCK on that OST and
# check that it first reports the "crashed_lastid" flag, then reaches the
# "completed" status with no flag left set.
1049 check_mount_and_prep
# Stripe count 1, starting at OST index 0.
1050 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1051 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Drop LAST_ID for index 0 on ost1, then start ost1 with the noscrub
# mount options.
1056 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1058 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1059 error "(2) Fail to start ost1"
# NOTE(review): fail_val=3 with the DELAY4 fail_loc presumably slows the
# scan down so that the flag can be observed -- confirm.
1061 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1062 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1064 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1065 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# Poll the "flags" line of lfsck_layout on ost1 until it reads
# crashed_lastid (last argument 60: presumably a timeout in seconds).
1067 wait_update_facet ost1 "$LCTL get_param -n \
1068 obdfilter.${OST_DEV}.lfsck_layout |
1069 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1071 error "(5) unexpected status"
# Clear the failure injection, then wait for the status to read completed.
1074 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1076 wait_update_facet ost1 "$LCTL get_param -n \
1077 obdfilter.${OST_DEV}.lfsck_layout |
1078 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1080 error "(6) unexpected status"
1083 echo "the LAST_ID(s) should have been rebuilt"
# After completion the flags field must be empty.
1084 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1085 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1087 run_test 11a "LFSCK can rebuild lost last_id"
# test_11b: create files while fail_loc 0x160d is set (per the echo below,
# to skip updating the on-disk LAST_ID), restart ost1, check that the
# last_id it then reports is smaller than the value recorded before the
# restart, and verify that layout LFSCK on the OST brings last_id back to
# the recorded value.
1090 check_mount_and_prep
1091 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1093 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1094 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1095 do_facet ost1 $LCTL set_param fail_loc=0x160d
1096 createmany -o $DIR/$tdir/f 64
# Record the last_id that ost1 reports on its 0x100000000 line (second
# ':'-separated field).
1097 local lastid1=$(do_facet ost1 "lctl get_param -n \
1098 obdfilter.${ost1_svc}.last_id" | grep 0x100000000 |
1099 awk -F: '{ print $2 }')
1101 umount_client $MOUNT
1102 stop ost1 || error "(1) Fail to stop ost1"
# NOTE(review): OBD_FAIL_OST_ENOSPC presumably keeps the OST from creating
# new objects after the restart -- confirm.
1104 #define OBD_FAIL_OST_ENOSPC 0x215
1105 do_facet ost1 $LCTL set_param fail_loc=0x215
1107 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1108 error "(2) Fail to start ost1"
# Retry up to 60 times until ost1 reports a last_id for 0x100000000.
# NOTE(review): no "local" declaration for lastid2 is visible here and it is
# expanded unquoted in the test below -- confirm this is intended.
1110 for ((i = 0; i < 60; i++)); do
1111 lastid2=$(do_facet ost1 "lctl get_param -n \
1112 obdfilter.${ost1_svc}.last_id" | grep 0x100000000 |
1113 awk -F: '{ print $2 }')
1114 [ ! -z $lastid2 ] && break;
1118 echo "the on-disk LAST_ID should be smaller than the expected one"
1119 [ $lastid1 -gt $lastid2 ] ||
1120 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1122 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1123 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1125 wait_update_facet ost1 "$LCTL get_param -n \
1126 obdfilter.${OST_DEV}.lfsck_layout |
1127 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1129 error "(6) unexpected status"
# Restart ost1 once more and check the last_id it reports afterwards.
1132 stop ost1 || error "(7) Fail to stop ost1"
1134 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1135 error "(8) Fail to start ost1"
1137 echo "the on-disk LAST_ID should have been rebuilt"
# Wait until ost1 reports the value recorded in lastid1 again.
1138 wait_update_facet ost1 "$LCTL get_param -n \
1139 obdfilter.${ost1_svc}.last_id | grep 0x100000000 |
1140 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1141 $LCTL get_param -n obdfilter.${ost1_svc}.last_id
1142 error "(9) expect lastid1 0x100000000:$lastid1"
# Clear the failure injection and stop all services.
1145 do_facet ost1 $LCTL set_param fail_loc=0
1146 stopall || error "(10) Fail to stopall"
1148 run_test 11b "LFSCK can rebuild crashed last_id"
# test_12: needs at least two MDSes.  Drives namespace LFSCK and then layout
# LFSCK on all targets through single lctl commands issued on mds1 with -A:
# start with "-s 1" and expect scanning-phase1 everywhere, stop and expect
# stopped everywhere, then restart with "-s 0" and wait for completed.
1151 [ $MDSCOUNT -lt 2 ] &&
1152 skip "We need at least 2 MDSes for test_12" && exit 0
1154 check_mount_and_prep
# One directory per MDT (MDT index k - 1), each holding 100 files.
1155 for k in $(seq $MDSCOUNT); do
1156 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1157 createmany -o $DIR/$tdir/${k}/f 100 ||
1158 error "(0) Fail to create 100 files."
# --- namespace LFSCK ---
1161 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1162 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1163 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1165 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1166 for k in $(seq $MDSCOUNT); do
1167 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1168 mdd.$(facet_svc mds${k}).lfsck_namespace |
1169 awk '/^status/ { print $2 }')
1170 [ "$STATUS" == "scanning-phase1" ] ||
1171 error "(3) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1174 echo "Stop namespace LFSCK on all targets by single lctl command."
1175 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1176 error "(4) Fail to stop LFSCK on all devices!"
1178 echo "All the LFSCK targets should be in 'stopped' status."
1179 for k in $(seq $MDSCOUNT); do
1180 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1181 mdd.$(facet_svc mds${k}).lfsck_namespace |
1182 awk '/^status/ { print $2 }')
1183 [ "$STATUS" == "stopped" ] ||
1184 error "(5) MDS${k} Expect 'stopped', but got '$STATUS'"
1187 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1188 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1189 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1191 echo "All the LFSCK targets should be in 'completed' status."
1192 for k in $(seq $MDSCOUNT); do
1193 wait_update_facet mds${k} "$LCTL get_param -n \
1194 mdd.$(facet_svc mds${k}).lfsck_namespace |
1195 awk '/^status/ { print \\\$2 }'" "completed" 8 ||
1196 error "(7) MDS${k} is not the expected 'completed'"
# --- layout LFSCK: same sequence; the stopped check also covers the OSTs ---
1199 echo "Start layout LFSCK on all targets by single command (-s 1)."
1200 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1201 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1203 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1204 for k in $(seq $MDSCOUNT); do
1205 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1206 mdd.$(facet_svc mds${k}).lfsck_layout |
1207 awk '/^status/ { print $2 }')
1208 [ "$STATUS" == "scanning-phase1" ] ||
1209 error "(9) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1212 echo "Stop layout LFSCK on all targets by single lctl command."
1213 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1214 error "(10) Fail to stop LFSCK on all devices!"
1216 echo "All the LFSCK targets should be in 'stopped' status."
1217 for k in $(seq $MDSCOUNT); do
1218 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1219 mdd.$(facet_svc mds${k}).lfsck_layout |
1220 awk '/^status/ { print $2 }')
1221 [ "$STATUS" == "stopped" ] ||
1222 error "(11) MDS${k} Expect 'stopped', but got '$STATUS'"
1225 for k in $(seq $OSTCOUNT); do
1226 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1227 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1228 awk '/^status/ { print $2 }')
1229 [ "$STATUS" == "stopped" ] ||
1230 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1233 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1234 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1235 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1237 echo "All the LFSCK targets should be in 'completed' status."
1238 for k in $(seq $MDSCOUNT); do
1239 # The LFSCK status query interval is 30 seconds. For the case
1240 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1241 # time to guarantee the status sync up.
1242 wait_update_facet mds${k} "$LCTL get_param -n \
1243 mdd.$(facet_svc mds${k}).lfsck_layout |
1244 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1245 error "(14) MDS${k} is not the expected 'completed'"
1248 run_test 12 "single command to trigger LFSCK on all devices"
# test_13: create 32 files while fail_loc 0x160f (OBD_FAIL_LFSCK_BAD_LMMOI)
# is set on the MDS, run layout LFSCK and expect its "repaired_others"
# counter to be exactly 32.
1252 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1253 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1254 echo "MDT-object FID."
1257 check_mount_and_prep
1259 echo "Inject failure stub to simulate bad lmm_oi"
1260 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1261 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1262 createmany -o $DIR/$tdir/f 32
1263 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1265 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1266 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
# Wait for the layout LFSCK status on the MDS to read completed.
1268 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1269 mdd.${MDT_DEV}.lfsck_layout |
1270 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1272 error "(2) unexpected status"
# One repair is expected for each of the 32 files created above.
1275 local repaired=$($SHOW_LAYOUT |
1276 awk '/^repaired_others/ { print $2 }')
1277 [ $repaired -eq 32 ] ||
1278 error "(3) Fail to repair crashed lmm_oi: $repaired"
1280 run_test 13 "LFSCK can repair crashed lmm_oi"
# test_14: create files while fail_loc 0x1610 (OBD_FAIL_LFSCK_DANGLING) is
# set on ost1 so that they reference missing OST-objects, check that ls and
# stat fail, run layout LFSCK once without and once with -c, and expect
# stat on the guard file to report size 0 after the second run.
1284 echo "The OST-object referenced by the MDT-object should be there;"
1285 echo "otherwise, the LFSCK should re-create the missed OST-object."
1288 check_mount_and_prep
1289 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# NOTE(review): presumably the number of objects already precreated on the
# OST, going by the helper's name -- confirm.
1291 local count=$(precreated_ost_obj_count 0 0)
1293 echo "Inject failure stub to simulate dangling referenced MDT-object"
1294 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1295 do_facet ost1 $LCTL set_param fail_loc=0x1610
# count + 31 files plus "guard" are created while the failure is set; the
# checks below expect at least 32 repaired dangling references.
1296 createmany -o $DIR/$tdir/f $((count + 31))
1297 touch $DIR/$tdir/guard
1298 do_facet ost1 $LCTL set_param fail_loc=0
1300 start_full_debug_logging
1302 # exhaust other pre-created dangling cases
1303 count=$(precreated_ost_obj_count 0 0)
1304 createmany -o $DIR/$tdir/a $count ||
1305 error "(0) Fail to create $count files."
1307 echo "'ls' should fail because of dangling referenced MDT-object"
1308 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: plain layout LFSCK (no -c).
1310 echo "Trigger layout LFSCK to find out dangling reference"
1311 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1313 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1314 mdd.${MDT_DEV}.lfsck_layout |
1315 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1317 error "(3) unexpected status"
1320 local repaired=$($SHOW_LAYOUT |
1321 awk '/^repaired_dangling/ { print $2 }')
1322 [ $repaired -ge 32 ] ||
1323 error "(4) Fail to repair dangling reference: $repaired"
1325 echo "'stat' should fail because of not repair dangling by default"
1326 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second pass: same command with -c added.
1328 echo "Trigger layout LFSCK to repair dangling reference"
1329 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1331 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1332 mdd.${MDT_DEV}.lfsck_layout |
1333 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1335 error "(7) unexpected status"
1338 # There may be some async LFSCK updates in processing, wait for
1339 # a while until the target repair has been done. LU-4970.
1341 echo "'stat' should success after layout LFSCK repairing"
1342 wait_update_facet client "stat $DIR/$tdir/guard |
1343 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1344 stat $DIR/$tdir/guard
1346 error "(8) unexpected size"
1349 repaired=$($SHOW_LAYOUT |
1350 awk '/^repaired_dangling/ { print $2 }')
1351 [ $repaired -ge 32 ] ||
1352 error "(9) Fail to repair dangling reference: $repaired"
1354 stop_full_debug_logging
1356 run_test 14 "LFSCK can repair MDT-object with dangling reference"
# test_15a: write one file while fail_loc 0x1611
# (OBD_FAIL_LFSCK_UNMATCHED_PAIR1) is set on ost1, run layout LFSCK and
# expect its "repaired_unmatched_pair" counter to be exactly 1.
1360 echo "If the OST-object referenced by the MDT-object back points"
1361 echo "to some non-exist MDT-object, then the LFSCK should repair"
1362 echo "the OST-object to back point to the right MDT-object."
1365 check_mount_and_prep
1366 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1368 echo "Inject failure stub to make the OST-object to back point to"
1369 echo "non-exist MDT-object."
1370 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1372 do_facet ost1 $LCTL set_param fail_loc=0x1611
# Write 1 MiB and cancel the osc locks while the failure is still set.
1373 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1374 cancel_lru_locks osc
1375 do_facet ost1 $LCTL set_param fail_loc=0
1377 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1378 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1380 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1381 mdd.${MDT_DEV}.lfsck_layout |
1382 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1384 error "(2) unexpected status"
# Exactly one pair (f0) is expected to have been repaired.
1387 local repaired=$($SHOW_LAYOUT |
1388 awk '/^repaired_unmatched_pair/ { print $2 }')
1389 [ $repaired -eq 1 ] ||
1390 error "(3) Fail to repair unmatched pair: $repaired"
1392 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# test_15b: like test_15a, but a "guard" file is written first without any
# failure set and fail_loc 0x1612 (OBD_FAIL_LFSCK_UNMATCHED_PAIR2) is used
# while f0 is written; layout LFSCK is expected to report exactly 1
# repaired unmatched pair.
1396 echo "If the OST-object referenced by the MDT-object back points"
1397 echo "to other MDT-object that doesn't recognize the OST-object,"
1398 echo "then the LFSCK should repair it to back point to the right"
1399 echo "MDT-object (the first one)."
1402 check_mount_and_prep
1403 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# The guard file is created before the failure is set.
1404 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1405 cancel_lru_locks osc
1407 echo "Inject failure stub to make the OST-object to back point to"
1408 echo "other MDT-object"
1410 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1411 do_facet ost1 $LCTL set_param fail_loc=0x1612
1412 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1413 cancel_lru_locks osc
1414 do_facet ost1 $LCTL set_param fail_loc=0
1416 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1417 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1419 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1420 mdd.${MDT_DEV}.lfsck_layout |
1421 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1423 error "(2) unexpected status"
# Only f0 was written under the failure, so one repair is expected.
1426 local repaired=$($SHOW_LAYOUT |
1427 awk '/^repaired_unmatched_pair/ { print $2 }')
1428 [ $repaired -eq 1 ] ||
1429 error "(3) Fail to repair unmatched pair: $repaired"
1431 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# test_16: chown a file while fail_loc 0x1613 (OBD_FAIL_LFSCK_BAD_OWNER) is
# set on the MDS, run layout LFSCK and expect its
# "repaired_inconsistent_owner" counter to be exactly 1.
1435 echo "If the OST-object's owner information does not match the owner"
1436 echo "information stored in the MDT-object, then the LFSCK trust the"
1437 echo "MDT-object and update the OST-object's owner information."
1440 check_mount_and_prep
1441 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1442 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1443 cancel_lru_locks osc
1445 echo "Inject failure stub to skip OST-object owner changing"
1446 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1447 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
# Change owner and group to 1 while the failure is set.
1448 chown 1.1 $DIR/$tdir/f0
1449 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1451 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1454 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1456 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1457 mdd.${MDT_DEV}.lfsck_layout |
1458 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1460 error "(2) unexpected status"
# Only f0 was chowned under the failure, so one repair is expected.
1463 local repaired=$($SHOW_LAYOUT |
1464 awk '/^repaired_inconsistent_owner/ { print $2 }')
1465 [ $repaired -eq 1 ] ||
1466 error "(3) Fail to repair inconsistent owner: $repaired"
1468 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# test_17: with fail_loc 0x1614 (OBD_FAIL_LFSCK_MULTIPLE_REF) set on the
# MDS, write "guard" and create f0 so that, per the echo below, both use
# the same OST-objects; run layout LFSCK, expect exactly 1
# "repaired_multiple_referenced", then overwrite f0 and check that guard
# still has its original size.
1472 echo "If more than one MDT-objects reference the same OST-object,"
1473 echo "and the OST-object only recognizes one MDT-object, then the"
1474 echo "LFSCK should create new OST-objects for such non-recognized"
1478 check_mount_and_prep
1479 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1481 echo "Inject failure stub to make two MDT-objects to refernce"
1482 echo "the OST-object"
1484 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1485 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1487 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1488 cancel_lru_locks osc
1490 createmany -o $DIR/$tdir/f 1
1492 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1494 cancel_lru_locks mdc
1495 cancel_lru_locks osc
1497 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
# f0 was only created, never written, yet it is expected to show guard's
# 1 MiB size (field 5 of "ls -l").
1498 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1499 [ $size -eq 1048576 ] ||
1500 error "(1) f0 (wrong) size should be 1048576, but got $size"
1502 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1505 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1507 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1508 mdd.${MDT_DEV}.lfsck_layout |
1509 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1511 error "(3) unexpected status"
1514 local repaired=$($SHOW_LAYOUT |
1515 awk '/^repaired_multiple_referenced/ { print $2 }')
1516 [ $repaired -eq 1 ] ||
1517 error "(4) Fail to repair multiple references: $repaired"
1519 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
# Writing 2 MiB to f0 must leave guard's size at 1 MiB.
1520 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1521 error "(5) Fail to write f0."
1522 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1523 [ $size -eq 1048576 ] ||
1524 error "(6) guard size should be 1048576, but got $size"
1526 run_test 17 "LFSCK can repair multiple references"
# test_18a: chown files while fail_loc 0x1615 (OBD_FAIL_LFSCK_LOST_STRIPE)
# is set on the MDS(es) so that, per the echo below, the MDT-object loses
# its layout EA; run layout LFSCK with -o and expect "repaired_orphan" to be
# 1 on mds1 (and 2 on mds2 when MDSCOUNT >= 2) and the file sizes to match
# the size saved before the injection.
1530 echo "The target MDT-object is there, but related stripe information"
1531 echo "is lost or partly lost. The LFSCK should regenerate the missed"
1532 echo "layout EA entries."
1535 check_mount_and_prep
1536 $LFS mkdir -i 0 $DIR/$tdir/a1
1537 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
1538 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# Field 6 of "ls -il" is the file size (field 1 is the inode number).
1540 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1542 $LFS path2fid $DIR/$tdir/a1/f1
1543 $LFS getstripe $DIR/$tdir/a1/f1
# With a second MDS, also prepare a 2-stripe file in a directory on MDT
# index 1.
1545 if [ $MDSCOUNT -ge 2 ]; then
1546 $LFS mkdir -i 1 $DIR/$tdir/a2
1547 $LFS setstripe -c 2 -i 1 -s 1M $DIR/$tdir/a2
1548 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1549 $LFS path2fid $DIR/$tdir/a2/f2
1550 $LFS getstripe $DIR/$tdir/a2/f2
1553 cancel_lru_locks osc
1555 echo "Inject failure, to make the MDT-object lost its layout EA"
1556 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1557 do_facet mds1 $LCTL set_param fail_loc=0x1615
1558 chown 1.1 $DIR/$tdir/a1/f1
1560 if [ $MDSCOUNT -ge 2 ]; then
1561 do_facet mds2 $LCTL set_param fail_loc=0x1615
1562 chown 1.1 $DIR/$tdir/a2/f2
1568 do_facet mds1 $LCTL set_param fail_loc=0
1569 if [ $MDSCOUNT -ge 2 ]; then
1570 do_facet mds2 $LCTL set_param fail_loc=0
1573 cancel_lru_locks mdc
1574 cancel_lru_locks osc
1576 echo "The file size should be incorrect since layout EA is lost"
1577 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1578 [ "$cur_size" != "$saved_size" ] ||
1579 error "(1) Expect incorrect file1 size"
1581 if [ $MDSCOUNT -ge 2 ]; then
1582 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1583 [ "$cur_size" != "$saved_size" ] ||
1584 error "(2) Expect incorrect file2 size"
1587 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1588 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
1590 for k in $(seq $MDSCOUNT); do
1591 # The LFSCK status query interval is 30 seconds. For the case
1592 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1593 # time to guarantee the status sync up.
1594 wait_update_facet mds${k} "$LCTL get_param -n \
1595 mdd.$(facet_svc mds${k}).lfsck_layout |
1596 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1597 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without waiting.
1600 for k in $(seq $OSTCOUNT); do
1601 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1602 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1603 awk '/^status/ { print $2 }')
1604 [ "$cur_status" == "completed" ] ||
1605 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# f1 has one stripe and f2 has two, matching the expected counts 1 and 2.
1608 local repaired=$(do_facet mds1 $LCTL get_param -n \
1609 mdd.$(facet_svc mds1).lfsck_layout |
1610 awk '/^repaired_orphan/ { print $2 }')
1611 [ $repaired -eq 1 ] ||
1612 error "(6.1) Expect 1 fixed on mds1, but got: $repaired"
1614 if [ $MDSCOUNT -ge 2 ]; then
1615 repaired=$(do_facet mds2 $LCTL get_param -n \
1616 mdd.$(facet_svc mds2).lfsck_layout |
1617 awk '/^repaired_orphan/ { print $2 }')
1618 [ $repaired -eq 2 ] ||
1619 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
1622 $LFS path2fid $DIR/$tdir/a1/f1
1623 $LFS getstripe $DIR/$tdir/a1/f1
1625 if [ $MDSCOUNT -ge 2 ]; then
1626 $LFS path2fid $DIR/$tdir/a2/f2
1627 $LFS getstripe $DIR/$tdir/a2/f2
1630 echo "The file size should be correct after layout LFSCK scanning"
1631 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1632 [ "$cur_size" == "$saved_size" ] ||
1633 error "(7) Expect file1 size $saved_size, but got $cur_size"
1635 if [ $MDSCOUNT -ge 2 ]; then
1636 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1637 [ "$cur_size" == "$saved_size" ] ||
1638 error "(8) Expect file2 size $saved_size, but got $cur_size"
1641 run_test 18a "Find out orphan OST-object and repair it (1)"
# test_18b: remove files while fail_loc 0x1616 (OBD_FAIL_LFSCK_LOST_MDTOBJ)
# is set on the MDS(es), run layout LFSCK with -o, expect "repaired_orphan"
# to be 1 on mds1 (and 2 on mds2 when MDSCOUNT >= 2), then move the
# "<fid>-R-0" entries from .lustre/lost+found/MDTxxxx back to their old
# paths and check that the file sizes match the saved size.
1645 echo "The target MDT-object is lost. The LFSCK should re-create the"
1646 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
1647 echo "can move it back to normal namespace manually."
1650 check_mount_and_prep
1651 $LFS mkdir -i 0 $DIR/$tdir/a1
1652 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
1653 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1654 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
# The FID is kept because the lost+found entry name is built from it below.
1655 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
1657 $LFS getstripe $DIR/$tdir/a1/f1
# NOTE(review): no "local" declaration for fid2 is visible here -- confirm.
1659 if [ $MDSCOUNT -ge 2 ]; then
1660 $LFS mkdir -i 1 $DIR/$tdir/a2
1661 $LFS setstripe -c 2 -i 1 -s 1M $DIR/$tdir/a2
1662 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1663 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
1665 $LFS getstripe $DIR/$tdir/a2/f2
1668 cancel_lru_locks osc
1670 echo "Inject failure, to simulate the case of missing the MDT-object"
1671 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1672 do_facet mds1 $LCTL set_param fail_loc=0x1616
1673 rm -f $DIR/$tdir/a1/f1
1675 if [ $MDSCOUNT -ge 2 ]; then
1676 do_facet mds2 $LCTL set_param fail_loc=0x1616
1677 rm -f $DIR/$tdir/a2/f2
1683 do_facet mds1 $LCTL set_param fail_loc=0
1684 if [ $MDSCOUNT -ge 2 ]; then
1685 do_facet mds2 $LCTL set_param fail_loc=0
1688 cancel_lru_locks mdc
1689 cancel_lru_locks osc
1691 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1692 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1694 for k in $(seq $MDSCOUNT); do
1695 # The LFSCK status query interval is 30 seconds. For the case
1696 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1697 # time to guarantee the status sync up.
1698 wait_update_facet mds${k} "$LCTL get_param -n \
1699 mdd.$(facet_svc mds${k}).lfsck_layout |
1700 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1701 error "(2) MDS${k} is not the expected 'completed'"
1704 for k in $(seq $OSTCOUNT); do
1705 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1706 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1707 awk '/^status/ { print $2 }')
1708 [ "$cur_status" == "completed" ] ||
1709 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# f1 has one stripe and f2 has two, matching the expected counts 1 and 2.
1712 local repaired=$(do_facet mds1 $LCTL get_param -n \
1713 mdd.$(facet_svc mds1).lfsck_layout |
1714 awk '/^repaired_orphan/ { print $2 }')
1715 [ $repaired -eq 1 ] ||
1716 error "(4.1) Expect 1 fixed on mds1, but got: $repaired"
1718 if [ $MDSCOUNT -ge 2 ]; then
1719 repaired=$(do_facet mds2 $LCTL get_param -n \
1720 mdd.$(facet_svc mds2).lfsck_layout |
1721 awk '/^repaired_orphan/ { print $2 }')
1722 [ $repaired -eq 2 ] ||
1723 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
1726 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
1727 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
1728 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
1730 if [ $MDSCOUNT -ge 2 ]; then
1731 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
1732 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
1735 $LFS path2fid $DIR/$tdir/a1/f1
1736 $LFS getstripe $DIR/$tdir/a1/f1
1738 if [ $MDSCOUNT -ge 2 ]; then
1739 $LFS path2fid $DIR/$tdir/a2/f2
1740 $LFS getstripe $DIR/$tdir/a2/f2
1743 echo "The file size should be correct after layout LFSCK scanning"
1744 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1745 [ "$cur_size" == "$saved_size" ] ||
1746 error "(7) Expect file1 size $saved_size, but got $cur_size"
1748 if [ $MDSCOUNT -ge 2 ]; then
1749 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1750 [ "$cur_size" == "$saved_size" ] ||
1751 error "(8) Expect file2 size $saved_size, but got $cur_size"
1754 run_test 18b "Find out orphan OST-object and repair it (2)"
# test_18c: write files while fail_loc 0x1617 (OBD_FAIL_LFSCK_NOPFID) is
# set on ost1, remove them while fail_loc 0x1616
# (OBD_FAIL_LFSCK_LOST_MDTOBJ) is set on the MDS(es), run layout LFSCK with
# -o, and expect the "*-N-0" entries to appear under
# .lustre/lost+found/MDT0000 only.
1758 echo "The target MDT-object is lost, and the OST-object FID is missing."
1759 echo "The LFSCK should re-create the MDT-object with new FID under the "
1760 echo "directory .lustre/lost+found/MDTxxxx."
1763 check_mount_and_prep
1764 $LFS mkdir -i 0 $DIR/$tdir/a1
1765 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
1767 echo "Inject failure, to simulate the case of missing parent FID"
1768 #define OBD_FAIL_LFSCK_NOPFID 0x1617
1769 do_facet ost1 $LCTL set_param fail_loc=0x1617
1771 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1772 $LFS getstripe $DIR/$tdir/a1/f1
# The second directory lives on MDT index 1 but is also striped to OST
# index 0 (-i 0), the OST on which the failure is set.
1774 if [ $MDSCOUNT -ge 2 ]; then
1775 $LFS mkdir -i 1 $DIR/$tdir/a2
1776 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a2
1777 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1778 $LFS getstripe $DIR/$tdir/a2/f2
1781 cancel_lru_locks osc
1783 echo "Inject failure, to simulate the case of missing the MDT-object"
1784 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1785 do_facet mds1 $LCTL set_param fail_loc=0x1616
1786 rm -f $DIR/$tdir/a1/f1
1788 if [ $MDSCOUNT -ge 2 ]; then
1789 do_facet mds2 $LCTL set_param fail_loc=0x1616
1790 rm -f $DIR/$tdir/a2/f2
1796 do_facet mds1 $LCTL set_param fail_loc=0
1797 if [ $MDSCOUNT -ge 2 ]; then
1798 do_facet mds2 $LCTL set_param fail_loc=0
1801 cancel_lru_locks mdc
1802 cancel_lru_locks osc
1804 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1805 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1807 for k in $(seq $MDSCOUNT); do
1808 # The LFSCK status query interval is 30 seconds. For the case
1809 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1810 # time to guarantee the status sync up.
1811 wait_update_facet mds${k} "$LCTL get_param -n \
1812 mdd.$(facet_svc mds${k}).lfsck_layout |
1813 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1814 error "(2) MDS${k} is not the expected 'completed'"
1817 for k in $(seq $OSTCOUNT); do
1818 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1819 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1820 awk '/^status/ { print $2 }')
1821 [ "$cur_status" == "completed" ] ||
1822 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
1825 if [ $MDSCOUNT -ge 2 ]; then
# The repaired_orphan count on mds1 is compared against $expected.
1831 local repaired=$(do_facet mds1 $LCTL get_param -n \
1832 mdd.$(facet_svc mds1).lfsck_layout |
1833 awk '/^repaired_orphan/ { print $2 }')
1834 [ $repaired -eq $expected ] ||
1835 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# mds2 is expected to have repaired nothing.
1837 if [ $MDSCOUNT -ge 2 ]; then
1838 repaired=$(do_facet mds2 $LCTL get_param -n \
1839 mdd.$(facet_svc mds2).lfsck_layout |
1840 awk '/^repaired_orphan/ { print $2 }')
1841 [ $repaired -eq 0 ] ||
1842 error "(5) Expect 0 fixed on mds2, but got: $repaired"
1845 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
1846 ls -ail $MOUNT/.lustre/lost+found/MDT0001/*-N-0 &&
1847 error "(6) .lustre/lost+found/MDT0001/ should be empty"
1849 echo "There should be some stub under .lustre/lost+found/MDT0000/"
1850 ls -ail $MOUNT/.lustre/lost+found/MDT0000/*-N-0 ||
1851 error "(7) .lustre/lost+found/MDT0000/ should not be empty"
1853 run_test 18c "Find out orphan OST-object and repair it (3)"
# test_18d: with fail_loc 0x1618 (OBD_FAIL_LFSCK_CHANGE_STRIPE) set on the
# MDS, chown f2 and remove f1 so that, per the echoes below, f2 becomes a
# dangling reference while its old OST-object still exists; run layout
# LFSCK with -o -c, expect exactly 1 "repaired_orphan" and f2 to show its
# saved size again.
1857 echo "The target MDT-object layout EA slot is occpuied by some new"
1858 echo "created OST-object when repair dangling reference case. Such"
1859 echo "conflict OST-object has never been modified. Then when found"
1860 echo "the orphan OST-object, LFSCK will replace it with the orphan"
1864 check_mount_and_prep
1866 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
1867 echo "guard" > $DIR/$tdir/a1/f1
1868 echo "foo" > $DIR/$tdir/a1/f2
# Size of f2 before the injection (field 6 of "ls -il").
1869 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
1870 $LFS path2fid $DIR/$tdir/a1/f1
1871 $LFS getstripe $DIR/$tdir/a1/f1
1872 $LFS path2fid $DIR/$tdir/a1/f2
1873 $LFS getstripe $DIR/$tdir/a1/f2
1874 cancel_lru_locks osc
1876 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
1877 echo "to reference the same OST-object (which is f1's OST-obejct)."
1878 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
1879 echo "dangling reference case, but f2's old OST-object is there."
1882 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
1883 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
1884 chown 1.1 $DIR/$tdir/a1/f2
1885 rm -f $DIR/$tdir/a1/f1
1888 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1890 echo "stopall to cleanup object cache"
1893 setupall > /dev/null
1895 echo "The file size should be incorrect since dangling referenced"
1896 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
1897 [ "$cur_size" != "$saved_size" ] ||
1898 error "(1) Expect incorrect file2 size"
# NOTE(review): fail_val=5 with the DELAY3 fail_loc presumably holds the
# scan back so that scanning-phase2 can be observed below -- confirm.
1900 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1901 do_facet $SINGLEMDS $LCTL set_param fail_val=5 fail_loc=0x1602
1903 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1904 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
1906 wait_update_facet mds1 "$LCTL get_param -n \
1907 mdd.$(facet_svc mds1).lfsck_layout |
1908 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 ||
1909 error "(3.0) MDS1 is not the expected 'scanning-phase2'"
# Clear the failure injection, then wait for every MDS to read completed.
1911 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
1913 for k in $(seq $MDSCOUNT); do
1914 # The LFSCK status query interval is 30 seconds. For the case
1915 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1916 # time to guarantee the status sync up.
1917 wait_update_facet mds${k} "$LCTL get_param -n \
1918 mdd.$(facet_svc mds${k}).lfsck_layout |
1919 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1920 error "(3) MDS${k} is not the expected 'completed'"
1923 for k in $(seq $OSTCOUNT); do
1924 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1925 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1926 awk '/^status/ { print $2 }')
1927 [ "$cur_status" == "completed" ] ||
1928 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
1931 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
1932 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
1933 awk '/^repaired_orphan/ { print $2 }')
1934 [ $repaired -eq 1 ] ||
1935 error "(5) Expect 1 orphan has been fixed, but got: $repaired"
1937 echo "The file size should be correct after layout LFSCK scanning"
1938 cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
1939 [ "$cur_size" == "$saved_size" ] ||
1940 error "(6) Expect file2 size $saved_size, but got $cur_size"
1942 echo "The LFSCK should find back the original data."
# Print f2's content, FID and layout to the test log; nothing is asserted
# on them.
1943 cat $DIR/$tdir/a1/f2
1944 $LFS path2fid $DIR/$tdir/a1/f2
1945 $LFS getstripe $DIR/$tdir/a1/f2
1947 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e: a dangling reference is repaired while the file is being
# modified, so LFSCK has to keep the new data in f2 and expose the old
# orphan OST-object through a separate stub file under lost+found.
#
# Banner describing the scenario for the test log.
1951 echo "The target MDT-object layout EA slot is occpuied by some new"
1952 echo "created OST-object when repair dangling reference case. Such"
1953 echo "conflict OST-object has been modified by others. To keep the"
1954 echo "new data, the LFSCK will create a new file to refernece this"
1955 echo "old orphan OST-object."
1958 check_mount_and_prep
# Create two small files under a1 and remember f2's size (6th column of
# "ls -il") as the reference value for the later size checks.
1960 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
1961 echo "guard" > $DIR/$tdir/a1/f1
1962 echo "foo" > $DIR/$tdir/a1/f2
1963 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
# Log FID and layout of both files before anything is corrupted.
1964 $LFS path2fid $DIR/$tdir/a1/f1
1965 $LFS getstripe $DIR/$tdir/a1/f1
1966 $LFS path2fid $DIR/$tdir/a1/f2
1967 $LFS getstripe $DIR/$tdir/a1/f2
1968 cancel_lru_locks osc
1970 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
1971 echo "to reference the same OST-object (which is f1's OST-obejct)."
1972 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
1973 echo "dangling reference case, but f2's old OST-object is there."
# The chown and the unlink below run while the fail_loc is armed on the
# MDS; it is cleared again right afterwards.
1976 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
1977 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
1978 chown 1.1 $DIR/$tdir/a1/f2
1979 rm -f $DIR/$tdir/a1/f1
1982 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1984 echo "stopall to cleanup object cache"
1987 setupall > /dev/null
# Sanity check of the injected damage: f2 must no longer report the
# size that was saved before the corruption.
1989 echo "The file size should be incorrect since dangling referenced"
1990 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
1991 [ "$cur_size" != "$saved_size" ] ||
1992 error "(1) Expect incorrect file2 size"
# This delay fail_loc (fail_val=10) stays set until after the write to
# f2 further down, i.e. the write happens while LFSCK is still running.
1994 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1995 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
1997 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1998 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Poll mds1 until its layout LFSCK status reads 'scanning-phase2'
# (the trailing 32 is presumably the wait limit in seconds -- confirm
# against wait_update_facet).
2000 wait_update_facet mds1 "$LCTL get_param -n \
2001 mdd.$(facet_svc mds1).lfsck_layout |
2002 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 ||
2003 error "(3) MDS1 is not the expected 'scanning-phase2'"
2005 # to guarantee all updates are synced.
2009 echo "Write new data to f2 to modify the new created OST-object."
2010 echo "dummy" >> $DIR/$tdir/a1/f2
# Release the delay so that LFSCK can run to completion.
2012 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Every MDS has to reach 'completed'.
2014 for k in $(seq $MDSCOUNT); do
2015 # The LFSCK status query interval is 30 seconds. For the case
2016 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2017 # time to guarantee the status sync up.
2018 wait_update_facet mds${k} "$LCTL get_param -n \
2019 mdd.$(facet_svc mds${k}).lfsck_layout |
2020 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2021 error "(4) MDS${k} is not the expected 'completed'"
# Every OST is read once (no polling) and must already be 'completed'.
2024 for k in $(seq $OSTCOUNT); do
2025 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2026 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2027 awk '/^status/ { print $2 }')
2028 [ "$cur_status" == "completed" ] ||
2029 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Exactly one orphan must be counted as repaired on the MDS.
2032 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2033 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2034 awk '/^repaired_orphan/ { print $2 }')
2035 [ $repaired -eq 1 ] ||
2036 error "(6) Expect 1 orphan has been fixed, but got: $repaired"
# The stub file is looked up by the name pattern "*-C-0" under
# .lustre/lost+found/MDT0000/.
2038 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2039 local cname=$(ls $MOUNT/.lustre/lost+found/MDT0000/*-C-0)
2041 error "(7) .lustre/lost+found/MDT0000/ should not be empty"
# The stub file must have the size f2 had before the corruption.
2043 echo "The stub file should keep the original f2 data"
2044 cur_size=$(ls -il $cname | awk '{ print $6 }')
2045 [ "$cur_size" == "$saved_size" ] ||
2046 error "(8) Expect file2 size $saved_size, but got $cur_size"
2049 $LFS path2fid $cname
2050 $LFS getstripe $cname
# Dump f2's content, FID and layout to the log for inspection; nothing
# is asserted on them here.
2052 echo "The f2 should contains new data."
2053 cat $DIR/$tdir/a1/f2
2054 $LFS path2fid $DIR/$tdir/a1/f2
2055 $LFS getstripe $DIR/$tdir/a1/f2
2057 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f: one OST fails to verify objects during first-stage scanning;
# LFSCK must skip orphan handling for that OST only, and a second,
# undisturbed run must then finish the repair.
#
# The scenario needs two OSTs (ost1 and ost2 are both checked below).
2060 [ $OSTCOUNT -lt 2 ] &&
2061 skip "The test needs at least 2 OSTs" && return
2064 echo "The target MDT-object is lost. The LFSCK should re-create the"
2065 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2066 echo "to verify some OST-object(s) during the first stage-scanning,"
2067 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2068 echo "should not be affected."
2071 check_mount_and_prep
# Directories a1/a2 are created with "-i 0": a1 gets single-stripe
# files (guard, f1), a2 a two-stripe file (f2).  Each file is 2 MiB.
2072 $LFS mkdir -i 0 $DIR/$tdir/a1
2073 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
2074 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2075 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2076 $LFS mkdir -i 0 $DIR/$tdir/a2
2077 $LFS setstripe -c 2 -i 0 -s 1M $DIR/$tdir/a2
2078 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2079 $LFS getstripe $DIR/$tdir/a1/f1
2080 $LFS getstripe $DIR/$tdir/a2/f2
# With at least two MDSes the same file set is mirrored under a3/a4,
# created with "-i 1".
2082 if [ $MDSCOUNT -ge 2 ]; then
2083 $LFS mkdir -i 1 $DIR/$tdir/a3
2084 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a3
2085 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2086 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2087 $LFS mkdir -i 1 $DIR/$tdir/a4
2088 $LFS setstripe -c 2 -i 0 -s 1M $DIR/$tdir/a4
2089 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2090 $LFS getstripe $DIR/$tdir/a3/f3
2091 $LFS getstripe $DIR/$tdir/a4/f4
2094 cancel_lru_locks osc
# Unlink f1/f2 (and f3/f4 on mds2) while the fail_loc is armed on the
# respective MDS; the guard files are left alone.
2096 echo "Inject failure, to simulate the case of missing the MDT-object"
2097 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2098 do_facet mds1 $LCTL set_param fail_loc=0x1616
2099 rm -f $DIR/$tdir/a1/f1
2100 rm -f $DIR/$tdir/a2/f2
2102 if [ $MDSCOUNT -ge 2 ]; then
2103 do_facet mds2 $LCTL set_param fail_loc=0x1616
2104 rm -f $DIR/$tdir/a3/f3
2105 rm -f $DIR/$tdir/a4/f4
# Disarm the fail_loc on every MDS that had it set.
2111 do_facet mds1 $LCTL set_param fail_loc=0
2112 if [ $MDSCOUNT -ge 2 ]; then
2113 do_facet mds2 $LCTL set_param fail_loc=0
2116 cancel_lru_locks mdc
2117 cancel_lru_locks osc
# Second injection, on mds1 only; it stays armed for the whole first
# LFSCK run and is cleared before the repair counters are read.
2119 echo "Inject failure, to simulate the OST0 fail to handle"
2120 echo "MDT0 LFSCK request during the first-stage scanning."
2121 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2122 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2124 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2125 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# First run: every MDS is expected to end in 'partial'.
2127 for k in $(seq $MDSCOUNT); do
2128 # The LFSCK status query interval is 30 seconds. For the case
2129 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2130 # time to guarantee the status sync up.
2131 wait_update_facet mds${k} "$LCTL get_param -n \
2132 mdd.$(facet_svc mds${k}).lfsck_layout |
2133 awk '/^status/ { print \\\$2 }'" "partial" 32 ||
2134 error "(2) MDS${k} is not the expected 'partial'"
# ost1 must also end in 'partial' ...
2137 wait_update_facet ost1 "$LCTL get_param -n \
2138 obdfilter.$(facet_svc ost1).lfsck_layout |
2139 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
2140 error "(3) OST1 is not the expected 'partial'"
# ... while ost2 must still reach 'completed'.
2143 wait_update_facet ost2 "$LCTL get_param -n \
2144 obdfilter.$(facet_svc ost2).lfsck_layout |
2145 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2146 error "(4) OST2 is not the expected 'completed'"
2149 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the first run each MDS must report exactly one repaired orphan.
# NOTE(review): "mds{1}" / "mds{2}" in the messages below contain no
# "$", so they are printed literally.
2151 local repaired=$(do_facet mds1 $LCTL get_param -n \
2152 mdd.$(facet_svc mds1).lfsck_layout |
2153 awk '/^repaired_orphan/ { print $2 }')
2154 [ $repaired -eq 1 ] ||
2155 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2157 if [ $MDSCOUNT -ge 2 ]; then
2158 repaired=$(do_facet mds2 $LCTL get_param -n \
2159 mdd.$(facet_svc mds2).lfsck_layout |
2160 awk '/^repaired_orphan/ { print $2 }')
2161 [ $repaired -eq 1 ] ||
2162 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second run, without any injected failure.
2165 echo "Trigger layout LFSCK on all devices again to cleanup"
2166 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2168 for k in $(seq $MDSCOUNT); do
2169 # The LFSCK status query interval is 30 seconds. For the case
2170 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2171 # time to guarantee the status sync up.
2172 wait_update_facet mds${k} "$LCTL get_param -n \
2173 mdd.$(facet_svc mds${k}).lfsck_layout |
2174 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2175 error "(8) MDS${k} is not the expected 'completed'"
# Every OST is read once (no polling) and must be 'completed'.
2178 for k in $(seq $OSTCOUNT); do
2179 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2180 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2181 awk '/^status/ { print $2 }')
2182 [ "$cur_status" == "completed" ] ||
2183 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# The second run must report two repaired orphans on each MDS.
2187 local repaired=$(do_facet mds1 $LCTL get_param -n \
2188 mdd.$(facet_svc mds1).lfsck_layout |
2189 awk '/^repaired_orphan/ { print $2 }')
2190 [ $repaired -eq 2 ] ||
2191 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2193 if [ $MDSCOUNT -ge 2 ]; then
2194 repaired=$(do_facet mds2 $LCTL get_param -n \
2195 mdd.$(facet_svc mds2).lfsck_layout |
2196 awk '/^repaired_orphan/ { print $2 }')
2197 [ $repaired -eq 2 ] ||
2198 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2201 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Test 19a: with parent-FID verification switched on at the OST, a read
# that carries a wrong parent FID has to be rejected.
2204 check_mount_and_prep
2205 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2207 echo "foo" > $DIR/$tdir/a0
2208 echo "guard" > $DIR/$tdir/a1
2209 cancel_lru_locks osc
# Enable lfsck_verify_pfid on OST0000 (via ost1), then arm the fail_loc
# with a plain $LCTL call, i.e. on the node running this script.
2211 echo "Inject failure, then client will offer wrong parent FID when read"
2212 do_facet ost1 $LCTL set_param -n \
2213 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2214 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2215 $LCTL set_param fail_loc=0x1619
# The test fails if the read succeeds; the fail_loc is cleared afterwards.
2217 echo "Read RPC with wrong parent FID should be denied"
2218 cat $DIR/$tdir/a0 && error "(3) Read should be denied!"
2219 $LCTL set_param fail_loc=0
2221 run_test 19a "OST-object inconsistency self detect"
# Test 19b: an OST-object whose back pointer names a non-existent
# MDT-object is repaired by the OST itself once parent-FID verification
# is enabled and the file is read with the right parent FID.
2224 check_mount_and_prep
2225 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2227 echo "Inject failure stub to make the OST-object to back point to"
2228 echo "non-exist MDT-object"
# f0 is written while the fail_loc is armed on ost1; the fail_loc is
# cleared again once the client locks have been dropped.
2230 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2231 do_facet ost1 $LCTL set_param fail_loc=0x1611
2232 echo "foo" > $DIR/$tdir/f0
2233 cancel_lru_locks osc
2234 do_facet ost1 $LCTL set_param fail_loc=0
# Baseline: the 'repaired' counter in lfsck_verify_pfid must still be 0.
2236 echo "Nothing should be fixed since self detect and repair is disabled"
2237 local repaired=$(do_facet ost1 $LCTL get_param -n \
2238 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2239 awk '/^repaired/ { print $2 }')
2240 [ $repaired -eq 0 ] ||
2241 error "(1) Expected 0 repaired, but got $repaired"
2243 echo "Read RPC with right parent FID should be accepted,"
2244 echo "and cause parent FID on OST to be fixed"
# Switch verification on and read the file; the read itself must succeed.
2246 do_facet ost1 $LCTL set_param -n \
2247 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2248 cat $DIR/$tdir/f0 || error "(2) Read should not be denied!"
# The same counter must now read exactly 1.
2250 repaired=$(do_facet ost1 $LCTL get_param -n \
2251 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2252 awk '/^repaired/ { print $2 }')
2253 [ $repaired -eq 1 ] ||
2254 error "(3) Expected 1 repaired, but got $repaired"
2256 run_test 19b "OST-object inconsistency self repair"
# Test 20: files lose their MDT-object and, for some of them, one of
# their OST-objects.  LFSCK must re-create them under
# .lustre/lost+found/MDT0000/ as "<fid>-R-0", marking a missing stripe
# as a LOV EA hole, and the resulting files must be usable from a client.
2259 [ $OSTCOUNT -lt 2 ] &&
2260 skip "The test needs at least 2 OSTs" && return
2263 echo "The target MDT-object and some of its OST-object are lost."
2264 echo "The LFSCK should find out the left OST-objects and re-create"
2265 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2266 echo "with the partial OST-objects (LOV EA hole)."
2268 echo "New client can access the file with LOV EA hole via normal"
2269 echo "system tools or commands without crash the system."
2271 echo "For old client, even though it cannot access the file with"
2272 echo "LOV EA hole, it should not cause the system crash."
2275 check_mount_and_prep
# a1 is striped over three OSTs when more than two exist, else over two.
2276 $LFS mkdir -i 0 $DIR/$tdir/a1
2277 if [ $OSTCOUNT -gt 2 ]; then
2278 $LFS setstripe -c 3 -i 0 -s 1M $DIR/$tdir/a1
2281 $LFS setstripe -c 2 -i 0 -s 1M $DIR/$tdir/a1
2285 # 256 blocks on the stripe0.
2286 # 1 block on the stripe1 for 2 OSTs case.
2287 # 256 blocks on the stripe1 for other cases.
2288 # 1 block on the stripe2 if OSTs > 2
2289 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2290 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2291 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# The FIDs are kept because the recovered files are named after them.
2293 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2294 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2295 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2298 $LFS getstripe $DIR/$tdir/a1/f0
2300 $LFS getstripe $DIR/$tdir/a1/f1
2302 $LFS getstripe $DIR/$tdir/a1/f2
# A fourth file, f3, only exists in the more-than-two-OSTs case.
2304 if [ $OSTCOUNT -gt 2 ]; then
2305 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
2306 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2308 $LFS getstripe $DIR/$tdir/a1/f3
2311 cancel_lru_locks osc
# Each unlink below runs under a fail_loc armed on mds1.  For the
# LOST_SPEOBJ case the echo lines pair fail_val (initially unset here,
# then 1, then 2) with the stripe index that is lost.
2313 echo "Inject failure..."
2314 echo "To simulate f0 lost MDT-object"
2315 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2316 do_facet mds1 $LCTL set_param fail_loc=0x1616
2317 rm -f $DIR/$tdir/a1/f0
2319 echo "To simulate f1 lost MDT-object and OST-object0"
2320 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
2321 do_facet mds1 $LCTL set_param fail_loc=0x161a
2322 rm -f $DIR/$tdir/a1/f1
2324 echo "To simulate f2 lost MDT-object and OST-object1"
2325 do_facet mds1 $LCTL set_param fail_val=1
2326 rm -f $DIR/$tdir/a1/f2
2328 if [ $OSTCOUNT -gt 2 ]; then
2329 echo "To simulate f3 lost MDT-object and OST-object2"
2330 do_facet mds1 $LCTL set_param fail_val=2
2331 rm -f $DIR/$tdir/a1/f3
# The client stays unmounted while LFSCK runs; it is mounted again
# before the recovered files are inspected.
2334 umount_client $MOUNT
2337 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2339 echo "Inject failure to slow down the LFSCK on OST0"
2340 #define OBD_FAIL_LFSCK_DELAY5 0x161b
2341 do_facet ost1 $LCTL set_param fail_loc=0x161b
2343 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2344 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2347 do_facet ost1 $LCTL set_param fail_loc=0
# Every MDS has to reach 'completed'.
2349 for k in $(seq $MDSCOUNT); do
2350 # The LFSCK status query interval is 30 seconds. For the case
2351 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2352 # time to guarantee the status sync up.
2353 wait_update_facet mds${k} "$LCTL get_param -n \
2354 mdd.$(facet_svc mds${k}).lfsck_layout |
2355 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2356 error "(2) MDS${k} is not the expected 'completed'"
# Every OST is read once (no polling) and must be 'completed'.
2359 for k in $(seq $OSTCOUNT); do
2360 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2361 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2362 awk '/^status/ { print $2 }')
2363 [ "$cur_status" == "completed" ] ||
2364 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan count on mds1: 9 with more than two OSTs,
# otherwise 4.
2367 local repaired=$(do_facet mds1 $LCTL get_param -n \
2368 mdd.$(facet_svc mds1).lfsck_layout |
2369 awk '/^repaired_orphan/ { print $2 }')
2370 if [ $OSTCOUNT -gt 2 ]; then
2371 [ $repaired -eq 9 ] ||
2372 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
2374 [ $repaired -eq 4 ] ||
2375 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
2378 mount_client $MOUNT || error "(5.0) Fail to start client!"
# Bit that is tested in the value printed by "lfs getstripe -L" to tell
# whether a recovered file carries a LOV EA hole.
2380 LOV_PATTERN_F_HOLE=0x40000000
# --- Checks (5.x): f0 lost only its MDT-object, so no hole is expected
# and the file must be fully readable and writable.
2383 # ${fid0}-R-0 is the old f0
2385 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
2386 echo "Check $name, which is the old f0"
2388 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
2390 local pattern=0x$($LFS getstripe -L $name)
2391 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2392 error "(5.2) NOT expect pattern flag hole, but got $pattern"
2394 local stripes=$($LFS getstripe -c $name)
2395 if [ $OSTCOUNT -gt 2 ]; then
2396 [ $stripes -eq 3 ] ||
2397 error "(5.3.1) expect the stripe count is 3, but got $stripes"
2399 [ $stripes -eq 2 ] ||
2400 error "(5.3.2) expect the stripe count is 2, but got $stripes"
2403 local size=$(stat $name | awk '/Size:/ { print $2 }')
2404 [ $size -eq $((4096 * $bcount)) ] ||
2405 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
2407 cat $name > /dev/null || error "(5.5) cannot read $name"
2409 echo "dummy" >> $name || error "(5.6) cannot write $name"
2411 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
2413 touch $name || error "(5.8) cannot touch $name"
2415 rm -f $name || error "(5.9) cannot unlink $name"
# --- Checks (6.x): f1 lost stripe0, so the hole flag must be set; plain
# reads, appends and writes into the hole must fail while metadata
# operations and writes to an intact stripe must work.
2418 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
2420 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2421 if [ $OSTCOUNT -gt 2 ]; then
2422 echo "Check $name, it contains the old f1's stripe1 and stripe2"
2424 echo "Check $name, it contains the old f1's stripe1"
2427 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
2429 pattern=0x$($LFS getstripe -L $name)
2430 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2431 error "(6.2) expect pattern flag hole, but got $pattern"
2433 stripes=$($LFS getstripe -c $name)
2434 if [ $OSTCOUNT -gt 2 ]; then
2435 [ $stripes -eq 3 ] ||
2436 error "(6.3.1) expect the stripe count is 3, but got $stripes"
2438 [ $stripes -eq 2 ] ||
2439 error "(6.3.2) expect the stripe count is 2, but got $stripes"
2442 size=$(stat $name | awk '/Size:/ { print $2 }')
2443 [ $size -eq $((4096 * $bcount)) ] ||
2444 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
2446 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# Copy with conv=sync,noerror so dd continues past read errors, and
# count the "Input/output error" lines it prints; 256 are expected.
2448 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2449 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2452 [ $failures -eq 256 ] ||
2453 error "(6.6) expect 256 IO failures, but get $failures"
2455 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2456 [ $size -eq $((4096 * $bcount)) ] ||
2457 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Block 0 is expected to be rejected as part of the hole, block 300 to
# be accepted as part of a normal stripe.
2459 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
2460 error "(6.8) write to the LOV EA hole should fail"
2462 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
2463 error "(6.9) write to normal stripe should NOT fail"
2465 echo "foo" >> $name && error "(6.10) append write $name should fail"
2467 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
2469 touch $name || error "(6.12) cannot touch $name"
2471 rm -f $name || error "(6.13) cannot unlink $name"
# --- Checks (7.x): f2 lost stripe1.  With more than two OSTs this is a
# hole in the middle (x.1 checks); with two OSTs the file is expected
# to come back as a plain one-stripe file (x.2 checks).
2474 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
2476 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
2477 if [ $OSTCOUNT -gt 2 ]; then
2478 echo "Check $name, it contains the old f2's stripe0 and stripe2"
2480 echo "Check $name, it contains the old f2's stripe0"
2483 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
2485 pattern=0x$($LFS getstripe -L $name)
2486 stripes=$($LFS getstripe -c $name)
2487 size=$(stat $name | awk '/Size:/ { print $2 }')
2488 if [ $OSTCOUNT -gt 2 ]; then
2489 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2490 error "(7.2.1) expect pattern flag hole, but got $pattern"
2492 [ $stripes -eq 3 ] ||
2493 error "(7.3.1) expect the stripe count is 3, but got $stripes"
2495 [ $size -eq $((4096 * $bcount)) ] ||
2496 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
2498 cat $name > /dev/null &&
2499 error "(7.5.1) normal read $name should fail"
2501 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2502 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2504 [ $failures -eq 256 ] ||
2505 error "(7.6) expect 256 IO failures, but get $failures"
2507 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2508 [ $size -eq $((4096 * $bcount)) ] ||
2509 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
# Here the roles are swapped compared with f1: block 300 is expected to
# hit the hole and block 0 a normal stripe.
2511 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
2512 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
2514 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
2515 error "(7.8.1) write to normal stripe should NOT fail"
2517 echo "foo" >> $name &&
2518 error "(7.8.3) append write $name should fail"
2520 chown $RUNAS_ID:$RUNAS_GID $name ||
2521 error "(7.9.1) cannot chown on $name"
2523 touch $name || error "(7.10.1) cannot touch $name"
2525 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2526 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
2528 [ $stripes -eq 1 ] ||
2529 error "(7.3.2) expect the stripe count is 1, but got $stripes"
2532 [ $size -eq $((4096 * (256 + 0))) ] ||
2533 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
2535 cat $name > /dev/null || error "(7.5.2) cannot read $name"
2537 echo "dummy" >> $name || error "(7.8.2) cannot write $name"
2539 chown $RUNAS_ID:$RUNAS_GID $name ||
2540 error "(7.9.2) cannot chown on $name"
2542 touch $name || error "(7.10.2) cannot touch $name"
2545 rm -f $name || error "(7.11) cannot unlink $name"
# f3 was only created with more than two OSTs; nothing left to check
# otherwise.
2547 [ $OSTCOUNT -le 2 ] && return
# --- Checks (8.x): f3 lost its last stripe, so the file is expected to
# come back as an ordinary two-stripe file without a hole.
2550 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
2552 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2553 echo "Check $name, which contains the old f3's stripe0 and stripe1"
2555 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
2557 pattern=0x$($LFS getstripe -L $name)
2558 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2559 error "(8.2) NOT expect pattern flag hole, but got $pattern"
2561 stripes=$($LFS getstripe -c $name)
2562 # LFSCK does not know the old f3 had 3 stripes.
2563 # It only tries to find as much as possible.
2564 # The stripe count depends on the last stripe's offset.
2565 [ $stripes -eq 2 ] ||
2566 error "(8.3) expect the stripe count is 2, but got $stripes"
2568 size=$(stat $name | awk '/Size:/ { print $2 }')
2570 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
2571 error "(8.4) expect the size $((4096 * 512)), but got $size"
2573 cat $name > /dev/null || error "(8.5) cannot read $name"
2575 echo "dummy" >> $name || error "(8.6) cannot write $name"
2577 chown $RUNAS_ID:$RUNAS_GID $name ||
2578 error "(8.7) cannot chown on $name"
2580 touch $name || error "(8.8) cannot touch $name"
2582 rm -f $name || error "(8.9) cannot unlink $name"
2584 run_test 20 "Handle the orphan with dummy LOV EA slot properly"
# Test 21: start LFSCK without a "-t <type>" argument and confirm that
# both the namespace and the layout component report 'scanning-phase1',
# then stop LFSCK again.
#
# Skip on an MDS older than 2.5.59.  The guard leaves the test with
# "return", like the other in-test skip guards in this file; "exit"
# would terminate the shell executing the test instead of just ending
# this test.
2587 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
2588 skip "ignore the test if MDS is older than 2.5.59" && return
2590 check_mount_and_prep
# Populate the test directory so that the scan has objects to walk.
2591 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
# No "-t" is passed, so the component selection is left to the default.
# "-s 1" is presumably a speed limit that keeps the scan in phase 1
# long enough for the status checks below -- confirm against lctl.
2593 echo "Start all LFSCK components by default (-s 1)"
2594 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
2595 error "Fail to start LFSCK"
# Both status values are read once, without polling.
2597 echo "namespace LFSCK should be in 'scanning-phase1' status"
2598 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
2599 [ "$STATUS" == "scanning-phase1" ] ||
2600 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
2602 echo "layout LFSCK should be in 'scanning-phase1' status"
2603 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
2604 [ "$STATUS" == "scanning-phase1" ] ||
2605 error "Expect layout 'scanning-phase1', but got '$STATUS'"
# Likewise stopped without naming a component.
2607 echo "Stop all LFSCK components by default"
2608 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
2609 error "Fail to stop LFSCK"
2611 run_test 21 "run all LFSCK components by default"
# Remove the lfsck debug flag again; a failure here is deliberately
# ignored.
2613 $LCTL set_param debug=-lfsck > /dev/null || true
2615 # restore the MDS/OST sizes and the OST count saved at the top of the script
2616 MDSSIZE=${SAVED_MDSSIZE}
2617 OSTSIZE=${SAVED_OSTSIZE}
2618 OSTCOUNT=${SAVED_OSTCOUNT}
2620 # finally, clean up the system