3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
10 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
11 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
12 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
14 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
15 . $LUSTRE/tests/test-framework.sh
17 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
20 require_dsh_mds || exit 0
22 SAVED_MDSSIZE=${MDSSIZE}
23 SAVED_OSTSIZE=${OSTSIZE}
24 SAVED_OSTCOUNT=${OSTCOUNT}
25 # use small MDS + OST size to speed formatting time
26 # do not make MDSSIZE/OSTSIZE too small, because they affect the default journal size
29 # no need for many OSTs: cap at 4 to reduce format/start/stop overhead (unset OSTCOUNT counts as 0)
30 [ "${OSTCOUNT:-0}" -gt 4 ] && OSTCOUNT=4
32 # build up a clean test environment.
36 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
37 skip "Need MDS version at least 2.3.60" && check_and_cleanup_lustre &&
40 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
41 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
43 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
44 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
46 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
47 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 3"
51 $LCTL set_param debug=+lfsck > /dev/null || true
# Target names of the first MDT and the first OST of the filesystem under test.
53 MDT_DEV="${FSNAME}-MDT0000"
54 OST_DEV="${FSNAME}-OST0000"
# Result of mdsdevname for $SINGLEMDS with every "mds" substring removed from
# the facet name; later passed to "start" when the MDS is restarted.
55 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Command strings that start LFSCK via "lctl lfsck_start". They are expanded
# unquoted by the tests, so extra options (e.g. -r, -s N) can be appended.
# START_NAMESPACE: type "namespace" on ${MDT_DEV}, run on $SINGLEMDS.
56 START_NAMESPACE="do_facet $SINGLEMDS \
57 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
# START_LAYOUT: type "layout" on ${MDT_DEV}, run on $SINGLEMDS.
58 START_LAYOUT="do_facet $SINGLEMDS \
59 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
# START_LAYOUT_ON_OST: type "layout" on ${OST_DEV}, run on facet ost1.
60 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# STOP_LFSCK: "lctl lfsck_stop" for ${MDT_DEV}, run on $SINGLEMDS.
61 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Command strings that print LFSCK state via "lctl get_param -n"; the tests
# pipe the output through awk to pick out fields such as "status" or "flags".
62 SHOW_NAMESPACE="do_facet $SINGLEMDS \
63 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
64 SHOW_LAYOUT="do_facet $SINGLEMDS \
65 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
66 SHOW_LAYOUT_ON_OST="do_facet ost1 \
67 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount options used when (re)starting the MDS; the NOSCRUB variant adds
# "noscrub" and is used where the tests start the MDS with OI scrub disabled.
68 MOUNT_OPTS_SCRUB="-o user_xattr"
69 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
78 echo "preparing... $nfiles * $ndirs files will be created $(date)."
79 if [ ! -z $igif ]; then
80 #define OBD_FAIL_FID_IGIF 0x1504
81 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
84 cp $LUSTRE/tests/*.sh $DIR/$tdir/
85 if [ $ndirs -gt 0 ]; then
86 createmany -d $DIR/$tdir/d $ndirs
87 createmany -m $DIR/$tdir/f $ndirs
88 if [ $nfiles -gt 0 ]; then
89 for ((i = 0; i < $ndirs; i++)); do
90 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
91 /dev/null || error "createmany $nfiles"
94 createmany -d $DIR/$tdir/e $ndirs
97 if [ ! -z $igif ]; then
98 touch $DIR/$tdir/dummy
99 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
102 echo "prepared $(date)."
108 #define OBD_FAIL_LFSCK_DELAY1 0x1600
109 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
110 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
112 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
114 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
115 [ "$STATUS" == "scanning-phase1" ] ||
116 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
118 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
120 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
121 [ "$STATUS" == "stopped" ] ||
122 error "(6) Expect 'stopped', but got '$STATUS'"
124 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
126 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
127 [ "$STATUS" == "scanning-phase1" ] ||
128 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
130 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
131 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
132 mdd.${MDT_DEV}.lfsck_namespace |
133 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
135 error "(9) unexpected status"
138 local repaired=$($SHOW_NAMESPACE |
139 awk '/^updated_phase1/ { print $2 }')
140 [ $repaired -eq 0 ] ||
141 error "(10) Expect nothing to be repaired, but got: $repaired"
143 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
144 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
145 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
146 mdd.${MDT_DEV}.lfsck_namespace |
147 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
149 error "(12) unexpected status"
152 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
153 [ $((scanned1 + 1)) -eq $scanned2 ] ||
154 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
156 echo "stopall, should NOT crash LU-3649"
157 stopall || error "(14) Fail to stopall"
159 run_test 0 "Control LFSCK manually"
162 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
163 skip "OI Scrub not implemented for ZFS" && return
167 #define OBD_FAIL_FID_INDIR 0x1501
168 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
169 touch $DIR/$tdir/dummy
171 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
173 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
174 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
175 mdd.${MDT_DEV}.lfsck_namespace |
176 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
178 error "(4) unexpected status"
181 local repaired=$($SHOW_NAMESPACE |
182 awk '/^dirent_repaired/ { print $2 }')
183 # for interop with old server
184 [ -z "$repaired" ] &&
185 repaired=$($SHOW_NAMESPACE |
186 awk '/^updated_phase1/ { print $2 }')
188 [ $repaired -eq 1 ] ||
189 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
191 mount_client $MOUNT || error "(6) Fail to start client!"
193 #define OBD_FAIL_FID_LOOKUP 0x1505
194 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
195 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
197 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
199 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
203 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
204 skip "OI Scrub not implemented for ZFS" && return
208 #define OBD_FAIL_FID_INLMA 0x1502
209 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
210 touch $DIR/$tdir/dummy
212 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
214 #define OBD_FAIL_FID_NOLMA 0x1506
215 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
216 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
217 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
218 mdd.${MDT_DEV}.lfsck_namespace |
219 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
221 error "(4) unexpected status"
224 local repaired=$($SHOW_NAMESPACE |
225 awk '/^dirent_repaired/ { print $2 }')
226 # for interop with old server
227 [ -z "$repaired" ] &&
228 repaired=$($SHOW_NAMESPACE |
229 awk '/^updated_phase1/ { print $2 }')
231 [ $repaired -eq 1 ] ||
232 error "(5) Fail to repair missed FID-in-LMA: $repaired"
234 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
235 mount_client $MOUNT || error "(6) Fail to start client!"
237 #define OBD_FAIL_FID_LOOKUP 0x1505
238 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
239 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
241 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
243 run_test 1b "LFSCK can find out and repair missed FID-in-LMA"
248 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
249 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
250 touch $DIR/$tdir/dummy
252 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
254 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
255 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
256 mdd.${MDT_DEV}.lfsck_namespace |
257 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
259 error "(4) unexpected status"
262 local repaired=$($SHOW_NAMESPACE |
263 awk '/^linkea_repaired/ { print $2 }')
264 # for interop with old server
265 [ -z "$repaired" ] &&
266 repaired=$($SHOW_NAMESPACE |
267 awk '/^updated_phase2/ { print $2 }')
269 [ $repaired -eq 1 ] ||
270 error "(5) Fail to repair crashed linkEA: $repaired"
272 mount_client $MOUNT || error "(6) Fail to start client!"
274 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
275 error "(7) Fail to stat $DIR/$tdir/dummy"
277 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
278 local dummyname=$($LFS fid2path $DIR $dummyfid)
279 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
280 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
282 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
288 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
289 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
290 touch $DIR/$tdir/dummy
292 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
294 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
295 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
296 mdd.${MDT_DEV}.lfsck_namespace |
297 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
299 error "(4) unexpected status"
302 local repaired=$($SHOW_NAMESPACE |
303 awk '/^updated_phase2/ { print $2 }')
304 [ $repaired -eq 1 ] ||
305 error "(5) Fail to repair crashed linkEA: $repaired"
307 mount_client $MOUNT || error "(6) Fail to start client!"
309 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
310 error "(7) Fail to stat $DIR/$tdir/dummy"
312 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
313 local dummyname=$($LFS fid2path $DIR $dummyfid)
314 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
315 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
317 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
323 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
324 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
325 touch $DIR/$tdir/dummy
327 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
329 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
330 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
331 mdd.${MDT_DEV}.lfsck_namespace |
332 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
334 error "(4) unexpected status"
337 local repaired=$($SHOW_NAMESPACE |
338 awk '/^updated_phase2/ { print $2 }')
339 [ $repaired -eq 1 ] ||
340 error "(5) Fail to repair crashed linkEA: $repaired"
342 mount_client $MOUNT || error "(6) Fail to start client!"
344 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
345 error "(7) Fail to stat $DIR/$tdir/dummy"
347 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
348 local dummyname=$($LFS fid2path $DIR $dummyfid)
349 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
350 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
352 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
358 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
359 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
360 touch $DIR/$tdir/dummy
362 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
364 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
365 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
366 mdd.${MDT_DEV}.lfsck_namespace |
367 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
369 error "(4) unexpected status"
372 local repaired=$($SHOW_NAMESPACE |
373 awk '/^linkea_repaired/ { print $2 }')
374 [ $repaired -eq 1 ] ||
375 error "(5) Fail to repair crashed linkEA: $repaired"
377 mount_client $MOUNT || error "(6) Fail to start client!"
379 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
380 error "(7) Fail to stat $DIR/$tdir/dummy"
382 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
383 local dummyname=$($LFS fid2path $DIR $dummyfid)
384 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
385 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
387 run_test 2d "LFSCK can recover the missed linkEA entry"
393 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
394 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
395 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
397 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
398 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
399 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
401 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
402 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
403 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
405 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
406 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
407 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
409 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
411 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
412 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
413 mdd.${MDT_DEV}.lfsck_namespace |
414 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
416 error "(10) unexpected status"
419 local checked=$($SHOW_NAMESPACE |
420 awk '/^checked_phase2/ { print $2 }')
421 [ $checked -ge 4 ] ||
422 error "(11) Fail to check multiple-linked object: $checked"
424 local repaired=$($SHOW_NAMESPACE |
425 awk '/^multiple_linked_repaired/ { print $2 }')
426 [ $repaired -ge 2 ] ||
427 error "(12) Fail to repair multiple-linked object: $repaired"
429 run_test 3 "LFSCK can verify multiple-linked objects"
433 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
434 skip "OI Scrub not implemented for ZFS" && return
437 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
438 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
440 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
441 echo "start $SINGLEMDS with disabling OI scrub"
442 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
443 error "(2) Fail to start MDS!"
445 #define OBD_FAIL_LFSCK_DELAY2 0x1601
446 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
447 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
448 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
449 mdd.${MDT_DEV}.lfsck_namespace |
450 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
452 error "(5) unexpected status"
455 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
456 [ "$STATUS" == "scanning-phase1" ] ||
457 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
459 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
460 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
461 mdd.${MDT_DEV}.lfsck_namespace |
462 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
464 error "(7) unexpected status"
# Once LFSCK has completed, the "flags" field must be empty. FLAGS is declared
# local so the value does not leak out of this test function.
467 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
468 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
470 local repaired=$($SHOW_NAMESPACE |
471 awk '/^dirent_repaired/ { print $2 }')
472 # for interop with old server
473 [ -z "$repaired" ] &&
474 repaired=$($SHOW_NAMESPACE |
475 awk '/^updated_phase1/ { print $2 }')
477 [ $repaired -ge 9 ] ||
478 error "(9) Fail to re-generate FID-in-dirent: $repaired"
480 mount_client $MOUNT || error "(10) Fail to start client!"
482 #define OBD_FAIL_FID_LOOKUP 0x1505
483 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
484 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
485 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
487 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
491 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
492 skip "OI Scrub not implemented for ZFS" && return
495 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
496 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
498 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
499 echo "start $SINGLEMDS with disabling OI scrub"
500 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
501 error "(2) Fail to start MDS!"
503 #define OBD_FAIL_LFSCK_DELAY2 0x1601
504 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
505 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
506 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
507 mdd.${MDT_DEV}.lfsck_namespace |
508 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
510 error "(5) unexpected status"
513 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
514 [ "$STATUS" == "scanning-phase1" ] ||
515 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
517 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
518 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
519 mdd.${MDT_DEV}.lfsck_namespace |
520 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
522 error "(7) unexpected status"
# Once LFSCK has completed, the "flags" field must be empty. FLAGS is declared
# local so the value does not leak out of this test function.
525 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
526 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
528 local repaired=$($SHOW_NAMESPACE |
529 awk '/^dirent_repaired/ { print $2 }')
530 # for interop with old server
531 [ -z "$repaired" ] &&
532 repaired=$($SHOW_NAMESPACE |
533 awk '/^updated_phase1/ { print $2 }')
535 [ $repaired -ge 2 ] ||
536 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
538 mount_client $MOUNT || error "(10) Fail to start client!"
540 #define OBD_FAIL_FID_LOOKUP 0x1505
541 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
542 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
544 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
546 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
547 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
548 local dummyname=$($LFS fid2path $DIR $dummyfid)
549 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
550 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
552 run_test 5 "LFSCK can handle IGIF object upgrading"
557 #define OBD_FAIL_LFSCK_DELAY1 0x1600
558 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
559 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
561 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
562 [ "$STATUS" == "scanning-phase1" ] ||
563 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
565 # Sleep 3 sec to guarantee at least one object processed by LFSCK
567 # Fail the LFSCK to guarantee there is at least one checkpoint
568 #define OBD_FAIL_LFSCK_FATAL1 0x1608
569 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
570 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
571 mdd.${MDT_DEV}.lfsck_namespace |
572 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
574 error "(4) unexpected status"
577 local POS0=$($SHOW_NAMESPACE |
578 awk '/^last_checkpoint_position/ { print $2 }' |
581 #define OBD_FAIL_LFSCK_DELAY1 0x1600
582 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
583 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
585 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
586 [ "$STATUS" == "scanning-phase1" ] ||
587 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
589 local POS1=$($SHOW_NAMESPACE |
590 awk '/^latest_start_position/ { print $2 }' |
592 [[ $POS0 -lt $POS1 ]] ||
593 error "(7) Expect larger than: $POS0, but got $POS1"
595 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
596 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
597 mdd.${MDT_DEV}.lfsck_namespace |
598 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
600 error "(8) unexpected status"
603 run_test 6a "LFSCK resumes from last checkpoint (1)"
608 #define OBD_FAIL_LFSCK_DELAY2 0x1601
609 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
610 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
612 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
613 [ "$STATUS" == "scanning-phase1" ] ||
614 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
616 # Sleep 5 sec to guarantee that we are in the directory scanning
618 # Fail the LFSCK to guarantee there is at least one checkpoint
619 #define OBD_FAIL_LFSCK_FATAL2 0x1609
620 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
621 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
622 mdd.${MDT_DEV}.lfsck_namespace |
623 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
625 error "(4) unexpected status"
628 local O_POS0=$($SHOW_NAMESPACE |
629 awk '/^last_checkpoint_position/ { print $2 }' |
632 local D_POS0=$($SHOW_NAMESPACE |
633 awk '/^last_checkpoint_position/ { print $4 }')
635 #define OBD_FAIL_LFSCK_DELAY2 0x1601
636 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
637 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
639 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
640 [ "$STATUS" == "scanning-phase1" ] ||
641 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
643 local O_POS1=$($SHOW_NAMESPACE |
644 awk '/^latest_start_position/ { print $2 }' |
646 local D_POS1=$($SHOW_NAMESPACE |
647 awk '/^latest_start_position/ { print $4 }')
649 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
650 [[ $O_POS0 -lt $O_POS1 ]] ||
651 error "(7.1) $O_POS1 is not larger than $O_POS0"
653 [[ $D_POS0 -lt $D_POS1 ]] ||
654 error "(7.2) $D_POS1 is not larger than $D_POS0"
657 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
658 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
659 mdd.${MDT_DEV}.lfsck_namespace |
660 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
662 error "(8) unexpected status"
665 run_test 6b "LFSCK resumes from last checkpoint (2)"
672 #define OBD_FAIL_LFSCK_DELAY2 0x1601
673 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
674 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
676 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
677 [ "$STATUS" == "scanning-phase1" ] ||
678 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
680 # Sleep 3 sec to guarantee at least one object processed by LFSCK
682 echo "stop $SINGLEMDS"
683 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
685 echo "start $SINGLEMDS"
686 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
687 error "(5) Fail to start MDS!"
689 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
690 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
691 mdd.${MDT_DEV}.lfsck_namespace |
692 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
694 error "(6) unexpected status"
697 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
703 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
704 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
705 for ((i = 0; i < 20; i++)); do
706 touch $DIR/$tdir/dummy${i}
709 #define OBD_FAIL_LFSCK_DELAY3 0x1602
710 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
711 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
712 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
713 mdd.${MDT_DEV}.lfsck_namespace |
714 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
716 error "(4) unexpected status"
719 echo "stop $SINGLEMDS"
720 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
722 echo "start $SINGLEMDS"
723 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
724 error "(6) Fail to start MDS!"
726 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
727 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
728 mdd.${MDT_DEV}.lfsck_namespace |
729 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
731 error "(7) unexpected status"
734 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
739 formatall > /dev/null
745 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
746 [ "$STATUS" == "init" ] ||
747 error "(2) Expect 'init', but got '$STATUS'"
749 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
750 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
751 mkdir $DIR/$tdir/crashed
753 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
754 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
755 for ((i = 0; i < 5; i++)); do
756 touch $DIR/$tdir/dummy${i}
759 umount_client $MOUNT || error "(3) Fail to stop client!"
761 #define OBD_FAIL_LFSCK_DELAY2 0x1601
762 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
763 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
765 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
766 [ "$STATUS" == "scanning-phase1" ] ||
767 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
769 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
771 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
772 [ "$STATUS" == "stopped" ] ||
773 error "(7) Expect 'stopped', but got '$STATUS'"
775 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
777 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
778 [ "$STATUS" == "scanning-phase1" ] ||
779 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
781 #define OBD_FAIL_LFSCK_FATAL2 0x1609
782 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
783 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
784 mdd.${MDT_DEV}.lfsck_namespace |
785 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
787 error "(10) unexpected status"
790 #define OBD_FAIL_LFSCK_DELAY1 0x1600
791 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
792 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
794 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
795 [ "$STATUS" == "scanning-phase1" ] ||
796 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
798 #define OBD_FAIL_LFSCK_CRASH 0x160a
799 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
802 echo "stop $SINGLEMDS"
803 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
805 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
806 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
808 echo "start $SINGLEMDS"
809 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
810 error "(14) Fail to start MDS!"
812 local timeout=$(max_recovery_time)
815 while [ $timer -lt $timeout ]; do
816 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
817 mdt.${MDT_DEV}.recovery_status |
818 awk '/^status/ { print \\\$2 }'")
819 [ "$STATUS" != "RECOVERING" ] && break;
824 [ $timer != $timeout ] ||
825 error "(14.1) recovery timeout"
827 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
828 [ "$STATUS" == "crashed" ] ||
829 error "(15) Expect 'crashed', but got '$STATUS'"
831 #define OBD_FAIL_LFSCK_DELAY2 0x1601
832 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
833 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
835 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
836 [ "$STATUS" == "scanning-phase1" ] ||
837 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
839 echo "stop $SINGLEMDS"
840 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
842 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
843 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
845 echo "start $SINGLEMDS"
846 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
847 error "(19) Fail to start MDS!"
850 while [ $timer -lt $timeout ]; do
851 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
852 mdt.${MDT_DEV}.recovery_status |
853 awk '/^status/ { print \\\$2 }'")
854 [ "$STATUS" != "RECOVERING" ] && break;
859 [ $timer != $timeout ] ||
860 error "(19.1) recovery timeout"
862 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
863 [ "$STATUS" == "paused" ] ||
864 error "(20) Expect 'paused', but got '$STATUS'"
866 #define OBD_FAIL_LFSCK_DELAY3 0x1602
867 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
869 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
870 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
871 mdd.${MDT_DEV}.lfsck_namespace |
872 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
874 error "(22) unexpected status"
877 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
878 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
879 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
881 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
882 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
883 mdd.${MDT_DEV}.lfsck_namespace |
884 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
886 error "(24) unexpected status"
889 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
890 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
892 run_test 8 "LFSCK state machine"
895 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
896 skip "Testing on UP system, the speed may be inaccurate."
902 local BASE_SPEED1=100
904 $START_NAMESPACE -r -s $BASE_SPEED1 || error "(3) Fail to start LFSCK!"
907 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
908 [ "$STATUS" == "scanning-phase1" ] ||
909 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
911 local SPEED=$($SHOW_NAMESPACE |
912 awk '/^average_speed_phase1/ { print $2 }')
914 # Timing may be inexact; the error is normally less than 2 seconds.
915 # Allow a further 20% margin for scheduling error.
917 # MAX_MARGIN = 1.2 = 12 / 10
918 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
919 RUN_TIME1 * 12 / 10))
920 [ $SPEED -lt $MAX_SPEED ] ||
921 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
924 local BASE_SPEED2=300
926 do_facet $SINGLEMDS \
927 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
930 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase1/ { print $2 }')
931 # MIN_MARGIN = 0.8 = 8 / 10
932 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
933 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
934 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
935 # Account for slow ZFS performance - LU-4934: skip the lower bound when the MDT backend is ZFS
936 [ $SPEED -gt $MIN_SPEED ] || [ "$(facet_fstype $SINGLEMDS)" == "zfs" ] ||
937 error "(5) Got speed $SPEED, expected more than $MIN_SPEED"
939 # MAX_MARGIN = 1.2 = 12 / 10
940 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
941 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
942 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
943 [ $SPEED -lt $MAX_SPEED ] ||
944 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
946 do_facet $SINGLEMDS \
947 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
949 wait_update_facet $SINGLEMDS \
950 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace|\
951 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
952 error "(7) Failed to get expected 'completed'"
954 run_test 9a "LFSCK speed control (1)"
957 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
958 skip "Testing on UP system, the speed may be inaccurate."
964 echo "Preparing another 50 * 50 files (with error) at $(date)."
965 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
966 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
967 createmany -d $DIR/$tdir/d 50
968 createmany -m $DIR/$tdir/f 50
969 for ((i = 0; i < 50; i++)); do
970 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
973 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
974 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
975 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
976 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
977 mdd.${MDT_DEV}.lfsck_namespace |
978 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
980 error "(5) unexpected status"
983 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
984 echo "Prepared at $(date)."
988 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
991 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
992 [ "$STATUS" == "scanning-phase2" ] ||
993 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
995 local SPEED=$($SHOW_NAMESPACE |
996 awk '/^average_speed_phase2/ { print $2 }')
997 # Timing may be inexact; the error is normally less than 2 seconds.
998 # Allow a further 20% margin for scheduling error.
1000 # MAX_MARGIN = 1.2 = 12 / 10
1001 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1002 RUN_TIME1 * 12 / 10))
1003 [ $SPEED -lt $MAX_SPEED ] ||
1004 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1006 # adjust speed limit
1007 local BASE_SPEED2=150
1009 do_facet $SINGLEMDS \
1010 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1013 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1014 # MIN_MARGIN = 0.8 = 8 / 10
1015 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1016 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1017 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# The lower speed bound is not enforced when the MDT backend is ZFS; the
# fstype is compared as a string (-eq would be an integer comparison).
1018 [ $SPEED -gt $MIN_SPEED ] || [ "$(facet_fstype $SINGLEMDS)" == "zfs" ] ||
1019 error "(9) Got speed $SPEED, expected more than $MIN_SPEED"
1021 # MAX_MARGIN = 1.2 = 12 / 10
1022 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1023 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1024 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1025 [ $SPEED -lt $MAX_SPEED ] ||
1026 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
1028 do_facet $SINGLEMDS \
1029 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1030 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1031 mdd.${MDT_DEV}.lfsck_namespace |
1032 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1034 error "(11) unexpected status"
1037 run_test 9b "LFSCK speed control (2)"
1041 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1042 skip "lookup(..)/linkea on ZFS issue" && return
1046 echo "Preparing more files with error at $(date)."
1047 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1048 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1050 for ((i = 0; i < 1000; i = $((i+2)))); do
1051 mkdir -p $DIR/$tdir/d${i}
1052 touch $DIR/$tdir/f${i}
1053 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1056 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1057 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1059 for ((i = 1; i < 1000; i = $((i+2)))); do
1060 mkdir -p $DIR/$tdir/d${i}
1061 touch $DIR/$tdir/f${i}
1062 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1065 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1066 echo "Prepared at $(date)."
1068 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
1070 umount_client $MOUNT
1071 mount_client $MOUNT || error "(3) Fail to start client!"
1073 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1076 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1077 [ "$STATUS" == "scanning-phase1" ] ||
1078 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
1080 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1082 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1084 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1086 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1088 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1090 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1092 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1094 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1095 error "(14) Fail to softlink!"
1097 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1098 [ "$STATUS" == "scanning-phase1" ] ||
1099 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
1101 do_facet $SINGLEMDS \
1102 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1103 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1104 mdd.${MDT_DEV}.lfsck_namespace |
1105 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1107 error "(16) unexpected status"
1110 run_test 10 "System is available during LFSCK scanning"
1113 ost_remove_lastid() {
1116 local rcmd="do_facet ost${ost}"
1118 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1120 # step 1: local mount
1121 mount_fstype ost${ost} || return 1
1122 # step 2: remove the specified LAST_ID
1123 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1125 unmount_fstype ost${ost} || return 2
1129 check_mount_and_prep
1130 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1131 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1136 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1138 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1139 error "(2) Fail to start ost1"
1141 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1142 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1144 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1145 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1147 wait_update_facet ost1 "$LCTL get_param -n \
1148 obdfilter.${OST_DEV}.lfsck_layout |
1149 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1151 error "(5) unexpected status"
1154 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1156 wait_update_facet ost1 "$LCTL get_param -n \
1157 obdfilter.${OST_DEV}.lfsck_layout |
1158 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1160 error "(6) unexpected status"
1163 echo "the LAST_ID(s) should have been rebuilt"
1164 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1165 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1167 run_test 11a "LFSCK can rebuild lost last_id"
1170 check_mount_and_prep
1171 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1173 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1174 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1175 do_facet ost1 $LCTL set_param fail_loc=0x160d
1176 createmany -o $DIR/$tdir/f 64
1177 local lastid1=$(do_facet ost1 "lctl get_param -n \
1178 obdfilter.${ost1_svc}.last_id" | grep 0x100000000 |
1179 awk -F: '{ print $2 }')
1181 umount_client $MOUNT
1182 stop ost1 || error "(1) Fail to stop ost1"
1184 #define OBD_FAIL_OST_ENOSPC 0x215
1185 do_facet ost1 $LCTL set_param fail_loc=0x215
1187 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1188 error "(2) Fail to start ost1"
1190 for ((i = 0; i < 60; i++)); do
1191 lastid2=$(do_facet ost1 "lctl get_param -n \
1192 obdfilter.${ost1_svc}.last_id" | grep 0x100000000 |
1193 awk -F: '{ print $2 }')
# Stop polling as soon as a last_id value has been read back from ost1.
# The expansion is quoted so the test is well-formed for empty and
# multi-word values alike.
1194 [ -n "$lastid2" ] && break
1198 echo "the on-disk LAST_ID should be smaller than the expected one"
1199 [ $lastid1 -gt $lastid2 ] ||
1200 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1202 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1203 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1205 wait_update_facet ost1 "$LCTL get_param -n \
1206 obdfilter.${OST_DEV}.lfsck_layout |
1207 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1209 error "(6) unexpected status"
1212 stop ost1 || error "(7) Fail to stop ost1"
1214 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1215 error "(8) Fail to start ost1"
1217 echo "the on-disk LAST_ID should have been rebuilt"
1218 wait_update_facet ost1 "$LCTL get_param -n \
1219 obdfilter.${ost1_svc}.last_id | grep 0x100000000 |
1220 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1221 $LCTL get_param -n obdfilter.${ost1_svc}.last_id
1222 error "(9) expect lastid1 0x100000000:$lastid1"
1225 do_facet ost1 $LCTL set_param fail_loc=0
1226 stopall || error "(10) Fail to stopall"
1228 run_test 11b "LFSCK can rebuild crashed last_id"
# Skip only this test when fewer than 2 MDSes are configured. 'return'
# leaves the test function, whereas 'exit 0' terminates the shell that is
# running it.
1231 [ $MDSCOUNT -lt 2 ] &&
1232 skip "We need at least 2 MDSes for test_12" && return
1234 check_mount_and_prep
1235 for k in $(seq $MDSCOUNT); do
1236 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1237 createmany -o $DIR/$tdir/${k}/f 100 ||
1238 error "(0) Fail to create 100 files."
1241 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1242 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1243 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1245 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1246 for k in $(seq $MDSCOUNT); do
1247 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1248 mdd.$(facet_svc mds${k}).lfsck_namespace |
1249 awk '/^status/ { print $2 }')
1250 [ "$STATUS" == "scanning-phase1" ] ||
1251 error "(3) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1254 echo "Stop namespace LFSCK on all targets by single lctl command."
1255 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1256 error "(4) Fail to stop LFSCK on all devices!"
1258 echo "All the LFSCK targets should be in 'stopped' status."
1259 for k in $(seq $MDSCOUNT); do
1260 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1261 mdd.$(facet_svc mds${k}).lfsck_namespace |
1262 awk '/^status/ { print $2 }')
1263 [ "$STATUS" == "stopped" ] ||
1264 error "(5) MDS${k} Expect 'stopped', but got '$STATUS'"
1267 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1268 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1269 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1271 echo "All the LFSCK targets should be in 'completed' status."
1272 for k in $(seq $MDSCOUNT); do
1273 wait_update_facet mds${k} "$LCTL get_param -n \
1274 mdd.$(facet_svc mds${k}).lfsck_namespace |
1275 awk '/^status/ { print \\\$2 }'" "completed" 8 ||
1276 error "(7) MDS${k} is not the expected 'completed'"
1279 echo "Start layout LFSCK on all targets by single command (-s 1)."
1280 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1281 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1283 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1284 for k in $(seq $MDSCOUNT); do
1285 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1286 mdd.$(facet_svc mds${k}).lfsck_layout |
1287 awk '/^status/ { print $2 }')
1288 [ "$STATUS" == "scanning-phase1" ] ||
1289 error "(9) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1292 echo "Stop layout LFSCK on all targets by single lctl command."
1293 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1294 error "(10) Fail to stop LFSCK on all devices!"
1296 echo "All the LFSCK targets should be in 'stopped' status."
1297 for k in $(seq $MDSCOUNT); do
1298 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1299 mdd.$(facet_svc mds${k}).lfsck_layout |
1300 awk '/^status/ { print $2 }')
1301 [ "$STATUS" == "stopped" ] ||
1302 error "(11) MDS${k} Expect 'stopped', but got '$STATUS'"
1305 for k in $(seq $OSTCOUNT); do
1306 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1307 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1308 awk '/^status/ { print $2 }')
1309 [ "$STATUS" == "stopped" ] ||
1310 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1313 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1314 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1315 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1317 echo "All the LFSCK targets should be in 'completed' status."
1318 for k in $(seq $MDSCOUNT); do
1319 # The LFSCK status query interval is 30 seconds. In case some
1320 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
1321 # guarantee that the status has been synced up.
1322 wait_update_facet mds${k} "$LCTL get_param -n \
1323 mdd.$(facet_svc mds${k}).lfsck_layout |
1324 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1325 error "(14) MDS${k} is not the expected 'completed'"
1328 run_test 12 "single command to trigger LFSCK on all devices"
1332 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1333 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1334 echo "MDT-object FID."
1337 check_mount_and_prep
1339 echo "Inject failure stub to simulate bad lmm_oi"
1340 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1341 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1342 createmany -o $DIR/$tdir/f 32
1343 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1345 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1346 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1348 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1349 mdd.${MDT_DEV}.lfsck_layout |
1350 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1352 error "(2) unexpected status"
1355 local repaired=$($SHOW_LAYOUT |
1356 awk '/^repaired_others/ { print $2 }')
1357 [ $repaired -eq 32 ] ||
1358 error "(3) Fail to repair crashed lmm_oi: $repaired"
1360 run_test 13 "LFSCK can repair crashed lmm_oi"
1364 echo "The OST-object referenced by the MDT-object should be there;"
1365 echo "otherwise, the LFSCK should re-create the missed OST-object."
1368 check_mount_and_prep
1369 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1371 local count=$(precreated_ost_obj_count 0 0)
1373 echo "Inject failure stub to simulate dangling referenced MDT-object"
1374 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1375 do_facet ost1 $LCTL set_param fail_loc=0x1610
1376 createmany -o $DIR/$tdir/f $((count + 31))
1377 touch $DIR/$tdir/guard
1378 do_facet ost1 $LCTL set_param fail_loc=0
1380 start_full_debug_logging
1382 # exhaust other pre-created dangling cases
1383 count=$(precreated_ost_obj_count 0 0)
1384 createmany -o $DIR/$tdir/a $count ||
1385 error "(0) Fail to create $count files."
1387 echo "'ls' should fail because of dangling referenced MDT-object"
1388 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1390 echo "Trigger layout LFSCK to find out dangling reference"
1391 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1393 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1394 mdd.${MDT_DEV}.lfsck_layout |
1395 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1397 error "(3) unexpected status"
1400 local repaired=$($SHOW_LAYOUT |
1401 awk '/^repaired_dangling/ { print $2 }')
1402 [ $repaired -ge 32 ] ||
1403 error "(4) Fail to repair dangling reference: $repaired"
1405 echo "'stat' should fail because of not repair dangling by default"
1406 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
1408 echo "Trigger layout LFSCK to repair dangling reference"
1409 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1411 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1412 mdd.${MDT_DEV}.lfsck_layout |
1413 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1415 error "(7) unexpected status"
1418 # Some async LFSCK updates may still be in progress; wait for
1419 # a while until the target has been repaired. LU-4970.
1421 echo "'stat' should success after layout LFSCK repairing"
1422 wait_update_facet client "stat $DIR/$tdir/guard |
1423 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1424 stat $DIR/$tdir/guard
1426 error "(8) unexpected size"
1429 repaired=$($SHOW_LAYOUT |
1430 awk '/^repaired_dangling/ { print $2 }')
1431 [ $repaired -ge 32 ] ||
1432 error "(9) Fail to repair dangling reference: $repaired"
1434 stop_full_debug_logging
1436 run_test 14 "LFSCK can repair MDT-object with dangling reference"
1440 echo "If the OST-object referenced by the MDT-object back points"
1441 echo "to some non-exist MDT-object, then the LFSCK should repair"
1442 echo "the OST-object to back point to the right MDT-object."
1445 check_mount_and_prep
1446 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1448 echo "Inject failure stub to make the OST-object to back point to"
1449 echo "non-exist MDT-object."
1450 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1452 do_facet ost1 $LCTL set_param fail_loc=0x1611
1453 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1454 cancel_lru_locks osc
1455 do_facet ost1 $LCTL set_param fail_loc=0
1457 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1458 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1460 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1461 mdd.${MDT_DEV}.lfsck_layout |
1462 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1464 error "(2) unexpected status"
1467 local repaired=$($SHOW_LAYOUT |
1468 awk '/^repaired_unmatched_pair/ { print $2 }')
1469 [ $repaired -eq 1 ] ||
1470 error "(3) Fail to repair unmatched pair: $repaired"
1472 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
1476 echo "If the OST-object referenced by the MDT-object back points"
1477 echo "to other MDT-object that doesn't recognize the OST-object,"
1478 echo "then the LFSCK should repair it to back point to the right"
1479 echo "MDT-object (the first one)."
1482 check_mount_and_prep
1483 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1484 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1485 cancel_lru_locks osc
1487 echo "Inject failure stub to make the OST-object to back point to"
1488 echo "other MDT-object"
1490 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1491 do_facet ost1 $LCTL set_param fail_loc=0x1612
1492 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1493 cancel_lru_locks osc
1494 do_facet ost1 $LCTL set_param fail_loc=0
1496 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1497 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1499 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1500 mdd.${MDT_DEV}.lfsck_layout |
1501 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1503 error "(2) unexpected status"
1506 local repaired=$($SHOW_LAYOUT |
1507 awk '/^repaired_unmatched_pair/ { print $2 }')
1508 [ $repaired -eq 1 ] ||
1509 error "(3) Fail to repair unmatched pair: $repaired"
1511 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
1515 echo "If the OST-object's owner information does not match the owner"
1516 echo "information stored in the MDT-object, then the LFSCK trust the"
1517 echo "MDT-object and update the OST-object's owner information."
1520 check_mount_and_prep
1521 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1522 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1523 cancel_lru_locks osc
1525 echo "Inject failure stub to skip OST-object owner changing"
1526 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1527 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1528 chown 1.1 $DIR/$tdir/f0
1529 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1531 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1534 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1536 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1537 mdd.${MDT_DEV}.lfsck_layout |
1538 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1540 error "(2) unexpected status"
1543 local repaired=$($SHOW_LAYOUT |
1544 awk '/^repaired_inconsistent_owner/ { print $2 }')
1545 [ $repaired -eq 1 ] ||
1546 error "(3) Fail to repair inconsistent owner: $repaired"
1548 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
1552 echo "If more than one MDT-objects reference the same OST-object,"
1553 echo "and the OST-object only recognizes one MDT-object, then the"
1554 echo "LFSCK should create new OST-objects for such non-recognized"
1558 check_mount_and_prep
1559 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1561 echo "Inject failure stub to make two MDT-objects to refernce"
1562 echo "the OST-object"
1564 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1565 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1567 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1568 cancel_lru_locks osc
1570 createmany -o $DIR/$tdir/f 1
1572 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1574 cancel_lru_locks mdc
1575 cancel_lru_locks osc
1577 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1578 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1579 [ $size -eq 1048576 ] ||
1580 error "(1) f0 (wrong) size should be 1048576, but got $size"
1582 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1585 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1587 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1588 mdd.${MDT_DEV}.lfsck_layout |
1589 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1591 error "(3) unexpected status"
1594 local repaired=$($SHOW_LAYOUT |
1595 awk '/^repaired_multiple_referenced/ { print $2 }')
1596 [ $repaired -eq 1 ] ||
1597 error "(4) Fail to repair multiple references: $repaired"
1599 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1600 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1601 error "(5) Fail to write f0."
1602 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1603 [ $size -eq 1048576 ] ||
1604 error "(6) guard size should be 1048576, but got $size"
1606 run_test 17 "LFSCK can repair multiple references"
1610 echo "The target MDT-object is there, but related stripe information"
1611 echo "is lost or partly lost. The LFSCK should regenerate the missed"
1612 echo "layout EA entries."
1615 check_mount_and_prep
1616 $LFS mkdir -i 0 $DIR/$tdir/a1
1617 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
1618 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1620 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1622 $LFS path2fid $DIR/$tdir/a1/f1
1623 $LFS getstripe $DIR/$tdir/a1/f1
1625 if [ $MDSCOUNT -ge 2 ]; then
1626 $LFS mkdir -i 1 $DIR/$tdir/a2
1627 $LFS setstripe -c 2 -i 1 -s 1M $DIR/$tdir/a2
1628 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1629 $LFS path2fid $DIR/$tdir/a2/f2
1630 $LFS getstripe $DIR/$tdir/a2/f2
1633 cancel_lru_locks osc
1635 echo "Inject failure, to make the MDT-object lost its layout EA"
1636 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1637 do_facet mds1 $LCTL set_param fail_loc=0x1615
1638 chown 1.1 $DIR/$tdir/a1/f1
1640 if [ $MDSCOUNT -ge 2 ]; then
1641 do_facet mds2 $LCTL set_param fail_loc=0x1615
1642 chown 1.1 $DIR/$tdir/a2/f2
1648 do_facet mds1 $LCTL set_param fail_loc=0
1649 if [ $MDSCOUNT -ge 2 ]; then
1650 do_facet mds2 $LCTL set_param fail_loc=0
1653 cancel_lru_locks mdc
1654 cancel_lru_locks osc
1656 echo "The file size should be incorrect since layout EA is lost"
1657 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1658 [ "$cur_size" != "$saved_size" ] ||
1659 error "(1) Expect incorrect file1 size"
1661 if [ $MDSCOUNT -ge 2 ]; then
1662 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1663 [ "$cur_size" != "$saved_size" ] ||
1664 error "(2) Expect incorrect file2 size"
1667 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1668 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
1670 for k in $(seq $MDSCOUNT); do
1671 # The LFSCK status query interval is 30 seconds. In case some
1672 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
1673 # guarantee that the status has been synced up.
1674 wait_update_facet mds${k} "$LCTL get_param -n \
1675 mdd.$(facet_svc mds${k}).lfsck_layout |
1676 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1677 error "(4) MDS${k} is not the expected 'completed'"
1680 for k in $(seq $OSTCOUNT); do
1681 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1682 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1683 awk '/^status/ { print $2 }')
1684 [ "$cur_status" == "completed" ] ||
1685 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
1688 local repaired=$(do_facet mds1 $LCTL get_param -n \
1689 mdd.$(facet_svc mds1).lfsck_layout |
1690 awk '/^repaired_orphan/ { print $2 }')
1691 [ $repaired -eq 1 ] ||
1692 error "(6.1) Expect 1 fixed on mds1, but got: $repaired"
1694 if [ $MDSCOUNT -ge 2 ]; then
1695 repaired=$(do_facet mds2 $LCTL get_param -n \
1696 mdd.$(facet_svc mds2).lfsck_layout |
1697 awk '/^repaired_orphan/ { print $2 }')
1698 [ $repaired -eq 2 ] ||
1699 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
1702 $LFS path2fid $DIR/$tdir/a1/f1
1703 $LFS getstripe $DIR/$tdir/a1/f1
1705 if [ $MDSCOUNT -ge 2 ]; then
1706 $LFS path2fid $DIR/$tdir/a2/f2
1707 $LFS getstripe $DIR/$tdir/a2/f2
1710 echo "The file size should be correct after layout LFSCK scanning"
1711 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1712 [ "$cur_size" == "$saved_size" ] ||
1713 error "(7) Expect file1 size $saved_size, but got $cur_size"
1715 if [ $MDSCOUNT -ge 2 ]; then
1716 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1717 [ "$cur_size" == "$saved_size" ] ||
1718 error "(8) Expect file2 size $saved_size, but got $cur_size"
1721 run_test 18a "Find out orphan OST-object and repair it (1)"
1725 echo "The target MDT-object is lost. The LFSCK should re-create the"
1726 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
1727 echo "can move it back to normal namespace manually."
1730 check_mount_and_prep
1731 $LFS mkdir -i 0 $DIR/$tdir/a1
1732 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
1733 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1734 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1735 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
1737 $LFS getstripe $DIR/$tdir/a1/f1
1739 if [ $MDSCOUNT -ge 2 ]; then
1740 $LFS mkdir -i 1 $DIR/$tdir/a2
1741 $LFS setstripe -c 2 -i 1 -s 1M $DIR/$tdir/a2
1742 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1743 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
1745 $LFS getstripe $DIR/$tdir/a2/f2
1748 cancel_lru_locks osc
1750 echo "Inject failure, to simulate the case of missing the MDT-object"
1751 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1752 do_facet mds1 $LCTL set_param fail_loc=0x1616
1753 rm -f $DIR/$tdir/a1/f1
1755 if [ $MDSCOUNT -ge 2 ]; then
1756 do_facet mds2 $LCTL set_param fail_loc=0x1616
1757 rm -f $DIR/$tdir/a2/f2
1763 do_facet mds1 $LCTL set_param fail_loc=0
1764 if [ $MDSCOUNT -ge 2 ]; then
1765 do_facet mds2 $LCTL set_param fail_loc=0
1768 cancel_lru_locks mdc
1769 cancel_lru_locks osc
1771 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1772 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1774 for k in $(seq $MDSCOUNT); do
1775 # The LFSCK status query interval is 30 seconds. In case some
1776 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
1777 # guarantee that the status has been synced up.
1778 wait_update_facet mds${k} "$LCTL get_param -n \
1779 mdd.$(facet_svc mds${k}).lfsck_layout |
1780 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1781 error "(2) MDS${k} is not the expected 'completed'"
1784 for k in $(seq $OSTCOUNT); do
1785 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1786 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1787 awk '/^status/ { print $2 }')
1788 [ "$cur_status" == "completed" ] ||
1789 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
1792 local repaired=$(do_facet mds1 $LCTL get_param -n \
1793 mdd.$(facet_svc mds1).lfsck_layout |
1794 awk '/^repaired_orphan/ { print $2 }')
1795 [ $repaired -eq 1 ] ||
1796 error "(4.1) Expect 1 fixed on mds1, but got: $repaired"
1798 if [ $MDSCOUNT -ge 2 ]; then
1799 repaired=$(do_facet mds2 $LCTL get_param -n \
1800 mdd.$(facet_svc mds2).lfsck_layout |
1801 awk '/^repaired_orphan/ { print $2 }')
1802 [ $repaired -eq 2 ] ||
1803 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
1806 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
1807 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
1808 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
1810 if [ $MDSCOUNT -ge 2 ]; then
1811 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
1812 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
1815 $LFS path2fid $DIR/$tdir/a1/f1
1816 $LFS getstripe $DIR/$tdir/a1/f1
1818 if [ $MDSCOUNT -ge 2 ]; then
1819 $LFS path2fid $DIR/$tdir/a2/f2
1820 $LFS getstripe $DIR/$tdir/a2/f2
1823 echo "The file size should be correct after layout LFSCK scanning"
1824 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1825 [ "$cur_size" == "$saved_size" ] ||
1826 error "(7) Expect file1 size $saved_size, but got $cur_size"
1828 if [ $MDSCOUNT -ge 2 ]; then
1829 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1830 [ "$cur_size" == "$saved_size" ] ||
1831 error "(8) Expect file2 size $saved_size, but got $cur_size"
1834 run_test 18b "Find out orphan OST-object and repair it (2)"
1838 echo "The target MDT-object is lost, and the OST-object FID is missing."
1839 echo "The LFSCK should re-create the MDT-object with new FID under the "
1840 echo "directory .lustre/lost+found/MDTxxxx."
1843 check_mount_and_prep
1844 $LFS mkdir -i 0 $DIR/$tdir/a1
1845 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
1847 echo "Inject failure, to simulate the case of missing parent FID"
1848 #define OBD_FAIL_LFSCK_NOPFID 0x1617
1849 do_facet ost1 $LCTL set_param fail_loc=0x1617
1851 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1852 $LFS getstripe $DIR/$tdir/a1/f1
1854 if [ $MDSCOUNT -ge 2 ]; then
1855 $LFS mkdir -i 1 $DIR/$tdir/a2
1856 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a2
1857 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1858 $LFS getstripe $DIR/$tdir/a2/f2
1861 cancel_lru_locks osc
1863 echo "Inject failure, to simulate the case of missing the MDT-object"
1864 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1865 do_facet mds1 $LCTL set_param fail_loc=0x1616
1866 rm -f $DIR/$tdir/a1/f1
1868 if [ $MDSCOUNT -ge 2 ]; then
1869 do_facet mds2 $LCTL set_param fail_loc=0x1616
1870 rm -f $DIR/$tdir/a2/f2
1876 do_facet mds1 $LCTL set_param fail_loc=0
1877 if [ $MDSCOUNT -ge 2 ]; then
1878 do_facet mds2 $LCTL set_param fail_loc=0
1881 cancel_lru_locks mdc
1882 cancel_lru_locks osc
1884 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1885 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1887 for k in $(seq $MDSCOUNT); do
1888 # The LFSCK status query interval is 30 seconds. In case some
1889 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
1890 # guarantee that the status has been synced up.
1891 wait_update_facet mds${k} "$LCTL get_param -n \
1892 mdd.$(facet_svc mds${k}).lfsck_layout |
1893 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1894 error "(2) MDS${k} is not the expected 'completed'"
1897 for k in $(seq $OSTCOUNT); do
1898 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1899 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1900 awk '/^status/ { print $2 }')
1901 [ "$cur_status" == "completed" ] ||
1902 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
1905 if [ $MDSCOUNT -ge 2 ]; then
1911 local repaired=$(do_facet mds1 $LCTL get_param -n \
1912 mdd.$(facet_svc mds1).lfsck_layout |
1913 awk '/^repaired_orphan/ { print $2 }')
1914 [ $repaired -eq $expected ] ||
1915 error "(4) Expect $expected fixed on mds1, but got: $repaired"
1917 if [ $MDSCOUNT -ge 2 ]; then
1918 repaired=$(do_facet mds2 $LCTL get_param -n \
1919 mdd.$(facet_svc mds2).lfsck_layout |
1920 awk '/^repaired_orphan/ { print $2 }')
1921 [ $repaired -eq 0 ] ||
1922 error "(5) Expect 0 fixed on mds2, but got: $repaired"
1925 ls -ail $MOUNT/.lustre/lost+found/
1927 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
1928 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
# Collect any "*-N-*" entries under lost+found/MDT0001. The pattern is
# quoted so find receives it verbatim instead of the shell expanding it
# against the current directory first.
1929 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-N-*")
1931 error "(6) .lustre/lost+found/MDT0001/ should be empty"
1934 echo "There should be some stub under .lustre/lost+found/MDT0000/"
1935 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
1936 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# At least one "*-N-*" entry must exist under lost+found/MDT0000. The
# pattern is quoted so find receives it verbatim instead of the shell
# expanding it against the current directory first.
1938 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-N-*")
1939 [ -n "$cname" ] ||
1940 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
1942 run_test 18c "Find out orphan OST-object and repair it (3)"
# Body of the case registered at the end of this block as 18d.
# Scenario (paraphrasing the echoes below): f2's layout slot is taken by a
# newly created OST-object that nobody has modified; once layout LFSCK finds
# the orphan (original) OST-object it is expected to put it back into f2.
# NOTE(review): this listing does not show every line of the original
# (e.g. loop/branch terminators); comments cover only the visible statements.
1946 echo "The target MDT-object layout EA slot is occpuied by some new"
1947 echo "created OST-object when repair dangling reference case. Such"
1948 echo "conflict OST-object has never been modified. Then when found"
1949 echo "the orphan OST-object, LFSCK will replace it with the orphan"
1953 check_mount_and_prep
# Fixture: a1 uses one 1M stripe starting at OST index 0; f1 is the "guard"
# file and f2 is the file whose size is tracked through the test.
1955 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
1956 echo "guard" > $DIR/$tdir/a1/f1
1957 echo "foo" > $DIR/$tdir/a1/f2
# With -i the inode number is column 1, so column 6 of `ls -il` is the size.
1958 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
# FIDs and stripe info are printed for the test log only.
1959 $LFS path2fid $DIR/$tdir/a1/f1
1960 $LFS getstripe $DIR/$tdir/a1/f1
1961 $LFS path2fid $DIR/$tdir/a1/f2
1962 $LFS getstripe $DIR/$tdir/a1/f2
1963 cancel_lru_locks osc
1965 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
1966 echo "to reference the same OST-object (which is f1's OST-obejct)."
1967 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
1968 echo "dangling reference case, but f2's old OST-object is there."
# The chown and rm below run while fail_loc 0x1618 is set on the MDS; that
# pair is what produces the inconsistency announced by the echoes above.
1971 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
1972 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
1973 chown 1.1 $DIR/$tdir/a1/f2
1974 rm -f $DIR/$tdir/a1/f1
1977 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1979 echo "stopall to cleanup object cache"
# NOTE(review): the shutdown step the echo refers to is not visible in this
# listing; only the setupall that brings the filesystem back is.
1982 setupall > /dev/null
# Pre-condition: before LFSCK runs, f2's reported size must differ from the
# size recorded right after it was written.
1984 echo "The file size should be incorrect since dangling referenced"
1985 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
1986 [ "$cur_size" != "$saved_size" ] ||
1987 error "(1) Expect incorrect file2 size"
# Delay injection (per the #define name) with fail_val=5, so that the
# 'scanning-phase2' status can be observed below.
1989 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1990 do_facet $SINGLEMDS $LCTL set_param fail_val=5 fail_loc=0x1602
1992 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1993 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Poll mds1 until the status line reads 'scanning-phase2'. The trailing 32
# is presumably a timeout in seconds - confirm against wait_update_facet.
# The \\\$2 escaping presumably keeps $2 literal until awk runs remotely.
1995 wait_update_facet mds1 "$LCTL get_param -n \
1996 mdd.$(facet_svc mds1).lfsck_layout |
1997 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 ||
1998 error "(3.0) MDS1 is not the expected 'scanning-phase2'"
# Remove the delay so the scan can finish.
2000 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Every MDS must reach 'completed'.
2002 for k in $(seq $MDSCOUNT); do
2003 # The LFSCK status query internal is 30 seconds. For the case
2004 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2005 # time to guarantee the status sync up.
2006 wait_update_facet mds${k} "$LCTL get_param -n \
2007 mdd.$(facet_svc mds${k}).lfsck_layout |
2008 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2009 error "(3) MDS${k} is not the expected 'completed'"
# OSTs are sampled once (no polling) and must already report 'completed'.
2012 for k in $(seq $OSTCOUNT); do
2013 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2014 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2015 awk '/^status/ { print $2 }')
2016 [ "$cur_status" == "completed" ] ||
2017 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Exactly one orphan must be counted as repaired on the MDS.
2020 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2021 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2022 awk '/^repaired_orphan/ { print $2 }')
2023 [ $repaired -eq 1 ] ||
2024 error "(5) Expect 1 orphan has been fixed, but got: $repaired"
# Post-condition: f2 reports its original size again.
2026 echo "The file size should be correct after layout LFSCK scanning"
2027 cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2028 [ "$cur_size" == "$saved_size" ] ||
2029 error "(6) Expect file2 size $saved_size, but got $cur_size"
# Content, FID and layout of f2 are dumped for the log; not asserted.
2031 echo "The LFSCK should find back the original data."
2032 cat $DIR/$tdir/a1/f2
2033 $LFS path2fid $DIR/$tdir/a1/f2
2034 $LFS getstripe $DIR/$tdir/a1/f2
2036 run_test 18d "Find out orphan OST-object and repair it (4)"
# Body of the case registered at the end of this block as 18e.
# Scenario (paraphrasing the echoes below): f2's layout slot is taken by a
# newly created OST-object that HAS been modified; to keep that new data,
# layout LFSCK is expected to leave f2 alone and create a separate file under
# .lustre/lost+found/MDT0000/ that references the old orphan OST-object.
# NOTE(review): this listing does not show every line of the original
# (e.g. loop/branch terminators); comments cover only the visible statements.
2040 echo "The target MDT-object layout EA slot is occpuied by some new"
2041 echo "created OST-object when repair dangling reference case. Such"
2042 echo "conflict OST-object has been modified by others. To keep the"
2043 echo "new data, the LFSCK will create a new file to refernece this"
2044 echo "old orphan OST-object."
2047 check_mount_and_prep
# Fixture: a1 uses one 1M stripe starting at OST index 0; f1 is the "guard"
# file and f2 is the file whose size is tracked through the test.
2049 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
2050 echo "guard" > $DIR/$tdir/a1/f1
2051 echo "foo" > $DIR/$tdir/a1/f2
# With -i the inode number is column 1, so column 6 of `ls -il` is the size.
2052 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
# FIDs and stripe info are printed for the test log only.
2053 $LFS path2fid $DIR/$tdir/a1/f1
2054 $LFS getstripe $DIR/$tdir/a1/f1
2055 $LFS path2fid $DIR/$tdir/a1/f2
2056 $LFS getstripe $DIR/$tdir/a1/f2
2057 cancel_lru_locks osc
2059 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2060 echo "to reference the same OST-object (which is f1's OST-obejct)."
2061 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2062 echo "dangling reference case, but f2's old OST-object is there."
# The chown and rm below run while fail_loc 0x1618 is set on the MDS; that
# pair is what produces the inconsistency announced by the echoes above.
2065 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2066 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2067 chown 1.1 $DIR/$tdir/a1/f2
2068 rm -f $DIR/$tdir/a1/f1
2071 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2073 echo "stopall to cleanup object cache"
# NOTE(review): the shutdown step the echo refers to is not visible in this
# listing; only the setupall that brings the filesystem back is.
2076 setupall > /dev/null
# Pre-condition: before LFSCK runs, f2's reported size must differ from the
# size recorded right after it was written.
2078 echo "The file size should be incorrect since dangling referenced"
2079 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2080 [ "$cur_size" != "$saved_size" ] ||
2081 error "(1) Expect incorrect file2 size"
# Delay injection (per the #define name) with fail_val=10, leaving a window
# in 'scanning-phase2' during which f2 is modified below.
2083 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2084 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2086 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2087 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Poll mds1 until the status line reads 'scanning-phase2'. The trailing 32
# is presumably a timeout in seconds - confirm against wait_update_facet.
2089 wait_update_facet mds1 "$LCTL get_param -n \
2090 mdd.$(facet_svc mds1).lfsck_layout |
2091 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 ||
2092 error "(3) MDS1 is not the expected 'scanning-phase2'"
2094 # to guarantee all updates are synced.
# Appending to f2 while the scan is delayed is what makes the conflicting
# OST-object "modified" for the purpose of this scenario.
2098 echo "Write new data to f2 to modify the new created OST-object."
2099 echo "dummy" >> $DIR/$tdir/a1/f2
# Remove the delay so the scan can finish.
2101 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Every MDS must reach 'completed'.
2103 for k in $(seq $MDSCOUNT); do
2104 # The LFSCK status query interval is 30 seconds. For the case
2105 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2106 # time to guarantee the status sync up.
2107 wait_update_facet mds${k} "$LCTL get_param -n \
2108 mdd.$(facet_svc mds${k}).lfsck_layout |
2109 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2110 error "(4) MDS${k} is not the expected 'completed'"
# OSTs are sampled once (no polling) and must already report 'completed'.
2113 for k in $(seq $OSTCOUNT); do
2114 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2115 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2116 awk '/^status/ { print $2 }')
2117 [ "$cur_status" == "completed" ] ||
2118 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Exactly one orphan must be counted as repaired on the MDS.
2121 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2122 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2123 awk '/^repaired_orphan/ { print $2 }')
2124 [ $repaired -eq 1 ] ||
2125 error "(6) Expect 1 orphan has been fixed, but got: $repaired"
2127 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2128 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2129 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The pattern is quoted so that find receives it verbatim; left unquoted the
# shell would expand it first if the current directory held a matching name.
2131 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*")
2132 [ ! -z "$cname" ] ||
2133 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
# The file found above must have the size f2 had before the injection.
2135 echo "The stub file should keep the original f2 data"
2136 cur_size=$(ls -il $cname | awk '{ print $6 }')
2137 [ "$cur_size" == "$saved_size" ] ||
2138 error "(9) Expect file2 size $saved_size, but got $cur_size"
2141 $LFS path2fid $cname
2142 $LFS getstripe $cname
# Content, FID and layout of f2 are dumped for the log; not asserted.
2144 echo "The f2 should contains new data."
2145 cat $DIR/$tdir/a1/f2
2146 $LFS path2fid $DIR/$tdir/a1/f2
2147 $LFS getstripe $DIR/$tdir/a1/f2
2149 run_test 18e "Find out orphan OST-object and repair it (5)"
# Body of the case registered at the end of this block as 18f.
# Scenario (paraphrasing the echoes below): MDT-objects are lost; while the
# first LFSCK run is made to fail towards OST0, orphans on that OST must be
# skipped without affecting the others. A second, undisturbed run then
# handles the remaining orphans.
# NOTE(review): this listing does not show every line of the original
# (e.g. loop/branch terminators); comments cover only the visible statements.
2152 [ $OSTCOUNT -lt 2 ] &&
2153 skip "The test needs at least 2 OSTs" && return
2156 echo "The target MDT-object is lost. The LFSCK should re-create the"
2157 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2158 echo "to verify some OST-object(s) during the first stage-scanning,"
2159 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2160 echo "should not be affected."
# Fixture on MDT index 0: a1 (1 stripe from OST index 0) holds guard + f1,
# a2 (2 stripes from OST index 0) holds f2. Each file is 2 x 1M of zeroes.
2163 check_mount_and_prep
2164 $LFS mkdir -i 0 $DIR/$tdir/a1
2165 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
2166 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2167 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2168 $LFS mkdir -i 0 $DIR/$tdir/a2
2169 $LFS setstripe -c 2 -i 0 -s 1M $DIR/$tdir/a2
2170 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2171 $LFS getstripe $DIR/$tdir/a1/f1
2172 $LFS getstripe $DIR/$tdir/a2/f2
# With a second MDS, the same fixture is mirrored on MDT index 1 (a3/a4).
2174 if [ $MDSCOUNT -ge 2 ]; then
2175 $LFS mkdir -i 1 $DIR/$tdir/a3
2176 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a3
2177 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2178 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2179 $LFS mkdir -i 1 $DIR/$tdir/a4
2180 $LFS setstripe -c 2 -i 0 -s 1M $DIR/$tdir/a4
2181 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2182 $LFS getstripe $DIR/$tdir/a3/f3
2183 $LFS getstripe $DIR/$tdir/a4/f4
2186 cancel_lru_locks osc
# The rm calls below run while fail_loc 0x1616 is set on the owning MDS.
2188 echo "Inject failure, to simulate the case of missing the MDT-object"
2189 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2190 do_facet mds1 $LCTL set_param fail_loc=0x1616
2191 rm -f $DIR/$tdir/a1/f1
2192 rm -f $DIR/$tdir/a2/f2
2194 if [ $MDSCOUNT -ge 2 ]; then
2195 do_facet mds2 $LCTL set_param fail_loc=0x1616
2196 rm -f $DIR/$tdir/a3/f3
2197 rm -f $DIR/$tdir/a4/f4
# Clear the injection on every MDS that had it set.
2203 do_facet mds1 $LCTL set_param fail_loc=0
2204 if [ $MDSCOUNT -ge 2 ]; then
2205 do_facet mds2 $LCTL set_param fail_loc=0
2208 cancel_lru_locks mdc
2209 cancel_lru_locks osc
# First run: fail_loc 0x161c with fail_val=0 is set on mds1 for the scan.
2211 echo "Inject failure, to simulate the OST0 fail to handle"
2212 echo "MDT0 LFSCK request during the first-stage scanning."
2213 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2214 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2216 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2217 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Expected end states of the first run: every MDS 'partial', ost1 'partial',
# ost2 'completed'. The trailing 32 is presumably a timeout in seconds -
# confirm against wait_update_facet.
2219 for k in $(seq $MDSCOUNT); do
2220 # The LFSCK status query interval is 30 seconds. For the case
2221 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2222 # time to guarantee the status sync up.
2223 wait_update_facet mds${k} "$LCTL get_param -n \
2224 mdd.$(facet_svc mds${k}).lfsck_layout |
2225 awk '/^status/ { print \\\$2 }'" "partial" 32 ||
2226 error "(2) MDS${k} is not the expected 'partial'"
2229 wait_update_facet ost1 "$LCTL get_param -n \
2230 obdfilter.$(facet_svc ost1).lfsck_layout |
2231 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
2232 error "(3) OST1 is not the expected 'partial'"
2235 wait_update_facet ost2 "$LCTL get_param -n \
2236 obdfilter.$(facet_svc ost2).lfsck_layout |
2237 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2238 error "(4) OST2 is not the expected 'completed'"
2241 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the first run each MDS must report exactly one repaired orphan.
2243 local repaired=$(do_facet mds1 $LCTL get_param -n \
2244 mdd.$(facet_svc mds1).lfsck_layout |
2245 awk '/^repaired_orphan/ { print $2 }')
2246 [ $repaired -eq 1 ] ||
2247 error "(5) Expect 1 fixed on mds1, but got: $repaired"
2249 if [ $MDSCOUNT -ge 2 ]; then
2250 repaired=$(do_facet mds2 $LCTL get_param -n \
2251 mdd.$(facet_svc mds2).lfsck_layout |
2252 awk '/^repaired_orphan/ { print $2 }')
2253 [ $repaired -eq 1 ] ||
2254 error "(6) Expect 1 fixed on mds2, but got: $repaired"
# Second run, with no failure injected: everything must reach 'completed'.
2257 echo "Trigger layout LFSCK on all devices again to cleanup"
2258 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2260 for k in $(seq $MDSCOUNT); do
2261 # The LFSCK status query interval is 30 seconds. For the case
2262 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2263 # time to guarantee the status sync up.
2264 wait_update_facet mds${k} "$LCTL get_param -n \
2265 mdd.$(facet_svc mds${k}).lfsck_layout |
2266 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2267 error "(8) MDS${k} is not the expected 'completed'"
# cur_status is declared local, as in the sibling tests, so it does not leak
# into the script's global scope.
2270 for k in $(seq $OSTCOUNT); do
2271 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2272 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2273 awk '/^status/ { print $2 }')
2274 [ "$cur_status" == "completed" ] ||
2275 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# After the second run each MDS must report exactly two repaired orphans.
2279 local repaired=$(do_facet mds1 $LCTL get_param -n \
2280 mdd.$(facet_svc mds1).lfsck_layout |
2281 awk '/^repaired_orphan/ { print $2 }')
2282 [ $repaired -eq 2 ] ||
2283 error "(10) Expect 2 fixed on mds1, but got: $repaired"
2285 if [ $MDSCOUNT -ge 2 ]; then
2286 repaired=$(do_facet mds2 $LCTL get_param -n \
2287 mdd.$(facet_svc mds2).lfsck_layout |
2288 awk '/^repaired_orphan/ { print $2 }')
2289 [ $repaired -eq 2 ] ||
2290 error "(11) Expect 2 fixed on mds2, but got: $repaired"
2293 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Body of the case registered at the end of this block as 19a: with
# lfsck_verify_pfid enabled on OST0000 and fail_loc 0x1619 set on the client,
# reading a0 is expected to fail.
# NOTE(review): this listing does not show every line of the original;
# comments cover only the visible statements.
2296 check_mount_and_prep
# Single-stripe layout starting at OST index 0 for everything under $tdir.
2297 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2299 echo "foo" > $DIR/$tdir/a0
2300 echo "guard" > $DIR/$tdir/a1
# Drop cached OSC locks - presumably so the read below has to go to the OST.
2301 cancel_lru_locks osc
2303 echo "Inject failure, then client will offer wrong parent FID when read"
# Turn on parent-FID verification on the OST side.
2304 do_facet ost1 $LCTL set_param -n \
2305 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
# This fail_loc is set locally (no do_facet), i.e. on the node running the test.
2306 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2307 $LCTL set_param fail_loc=0x1619
2309 echo "Read RPC with wrong parent FID should be denied"
# A successful cat is the failure condition here.
2310 cat $DIR/$tdir/a0 && error "(3) Read should be denied!"
2311 $LCTL set_param fail_loc=0
2313 run_test 19a "OST-object inconsistency self detect"
# Body of the case registered at the end of this block as 19b: an OST-object
# is created with a bad back-pointer; the 'repaired' counter read from
# lfsck_verify_pfid must be 0 before verification is switched on and 1 after
# a read with verification enabled.
# NOTE(review): this listing does not show every line of the original;
# comments cover only the visible statements.
2316 check_mount_and_prep
# Single-stripe layout starting at OST index 0 for everything under $tdir.
2317 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2319 echo "Inject failure stub to make the OST-object to back point to"
2320 echo "non-exist MDT-object"
# f0 is written while fail_loc 0x1611 is set on ost1, then the injection is
# cleared again.
2322 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2323 do_facet ost1 $LCTL set_param fail_loc=0x1611
2324 echo "foo" > $DIR/$tdir/f0
2325 cancel_lru_locks osc
2326 do_facet ost1 $LCTL set_param fail_loc=0
2328 echo "Nothing should be fixed since self detect and repair is disabled"
# Baseline: the 'repaired' line of lfsck_verify_pfid must read 0.
2329 local repaired=$(do_facet ost1 $LCTL get_param -n \
2330 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2331 awk '/^repaired/ { print $2 }')
2332 [ $repaired -eq 0 ] ||
2333 error "(1) Expected 0 repaired, but got $repaired"
2335 echo "Read RPC with right parent FID should be accepted,"
2336 echo "and cause parent FID on OST to be fixed"
# Enable verification, then read f0; unlike 19a the read must succeed.
2338 do_facet ost1 $LCTL set_param -n \
2339 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2340 cat $DIR/$tdir/f0 || error "(2) Read should not be denied!"
# The same counter must now read 1.
2342 repaired=$(do_facet ost1 $LCTL get_param -n \
2343 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2344 awk '/^repaired/ { print $2 }')
2345 [ $repaired -eq 1 ] ||
2346 error "(3) Expected 1 repaired, but got $repaired"
2348 run_test 19b "OST-object inconsistency self repair"
# Body of the case registered at the end of this block as 20.
# Scenario (paraphrasing the echoes below): files lose their MDT-object and,
# for some of them, one OST-object as well. Layout LFSCK must re-create them
# under .lustre/lost+found/MDT0000/ as <fid>-R-0, marking a missing stripe as
# a "LOV EA hole", and ordinary tools must cope with such files.
# NOTE(review): this listing does not show every line of the original - the
# definition of $bcount and the else/fi lines of the if-blocks are among the
# lines not visible. Comments cover only the visible statements.
2351 [ $OSTCOUNT -lt 2 ] &&
2352 skip "The test needs at least 2 OSTs" && return
2355 echo "The target MDT-object and some of its OST-object are lost."
2356 echo "The LFSCK should find out the left OST-objects and re-create"
2357 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2358 echo "with the partial OST-objects (LOV EA hole)."
2360 echo "New client can access the file with LOV EA hole via normal"
2361 echo "system tools or commands without crash the system."
2363 echo "For old client, even though it cannot access the file with"
2364 echo "LOV EA hole, it should not cause the system crash."
# Fixture: a1 on MDT index 0 with 1M stripes starting at OST index 0.
# Two setstripe variants are visible (3 stripes / 2 stripes); the first is
# guarded by OSTCOUNT > 2, the second presumably sits in the else branch.
2367 check_mount_and_prep
2368 $LFS mkdir -i 0 $DIR/$tdir/a1
2369 if [ $OSTCOUNT -gt 2 ]; then
2370 $LFS setstripe -c 3 -i 0 -s 1M $DIR/$tdir/a1
2373 $LFS setstripe -c 2 -i 0 -s 1M $DIR/$tdir/a1
2377 # 256 blocks on the stripe0.
2378 # 1 block on the stripe1 for 2 OSTs case.
2379 # 256 blocks on the stripe1 for other cases.
2380 # 1 block on the stripe2 if OSTs > 2
# Each file is $bcount blocks of 4096 bytes.
2381 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2382 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2383 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# FIDs are recorded now because the lost+found names checked later are
# built from them (${fidN}-R-0).
2385 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2386 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2387 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2390 $LFS getstripe $DIR/$tdir/a1/f0
2392 $LFS getstripe $DIR/$tdir/a1/f1
2394 $LFS getstripe $DIR/$tdir/a1/f2
# f3 exists only when there are more than 2 OSTs.
# NOTE(review): fid3 is assigned without `local`, unlike fid0..fid2.
2396 if [ $OSTCOUNT -gt 2 ]; then
2397 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
2398 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2400 $LFS getstripe $DIR/$tdir/a1/f3
2403 cancel_lru_locks osc
# Injection phase: each rm runs under a different fail_loc/fail_val pair on
# mds1. Per the echoes, fail_val selects which OST-object index is dropped
# together with the MDT-object when fail_loc is 0x161a.
2405 echo "Inject failure..."
2406 echo "To simulate f0 lost MDT-object"
2407 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2408 do_facet mds1 $LCTL set_param fail_loc=0x1616
2409 rm -f $DIR/$tdir/a1/f0
2411 echo "To simulate f1 lost MDT-object and OST-object0"
2412 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
2413 do_facet mds1 $LCTL set_param fail_loc=0x161a
2414 rm -f $DIR/$tdir/a1/f1
2416 echo "To simulate f2 lost MDT-object and OST-object1"
2417 do_facet mds1 $LCTL set_param fail_val=1
2418 rm -f $DIR/$tdir/a1/f2
2420 if [ $OSTCOUNT -gt 2 ]; then
2421 echo "To simulate f3 lost MDT-object and OST-object2"
2422 do_facet mds1 $LCTL set_param fail_val=2
2423 rm -f $DIR/$tdir/a1/f3
# The client stays unmounted while LFSCK runs; it is mounted again below.
2426 umount_client $MOUNT
2429 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2431 echo "Inject failure to slow down the LFSCK on OST0"
2432 #define OBD_FAIL_LFSCK_DELAY5 0x161b
2433 do_facet ost1 $LCTL set_param fail_loc=0x161b
2435 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2436 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2439 do_facet ost1 $LCTL set_param fail_loc=0
# Every MDS must reach 'completed'. The trailing 32 is presumably a timeout
# in seconds - confirm against wait_update_facet.
2441 for k in $(seq $MDSCOUNT); do
2442 # The LFSCK status query internal is 30 seconds. For the case
2443 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2444 # time to guarantee the status sync up.
2445 wait_update_facet mds${k} "$LCTL get_param -n \
2446 mdd.$(facet_svc mds${k}).lfsck_layout |
2447 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2448 error "(2) MDS${k} is not the expected 'completed'"
# OSTs are sampled once (no polling) and must already report 'completed'.
2451 for k in $(seq $OSTCOUNT); do
2452 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2453 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2454 awk '/^status/ { print $2 }')
2455 [ "$cur_status" == "completed" ] ||
2456 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan on mds1: 9 with more than 2 OSTs, otherwise 4.
2459 local repaired=$(do_facet mds1 $LCTL get_param -n \
2460 mdd.$(facet_svc mds1).lfsck_layout |
2461 awk '/^repaired_orphan/ { print $2 }')
2462 if [ $OSTCOUNT -gt 2 ]; then
2463 [ $repaired -eq 9 ] ||
2464 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
2466 [ $repaired -eq 4 ] ||
2467 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
2470 mount_client $MOUNT || error "(5.0) Fail to start client!"
# Bit tested against the layout pattern to tell whether a file has a hole.
2472 LOV_PATTERN_F_HOLE=0x40000000
#
# ---- f0: lost only its MDT-object, so no hole is expected --------------
2475 # ${fid0}-R-0 is the old f0
2477 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
2478 echo "Check $name, which is the old f0"
2480 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
# "0x" is prepended so the arithmetic expansion below reads the value as hex.
2482 local pattern=0x$($LFS getstripe -L $name)
2483 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2484 error "(5.2) NOT expect pattern flag hole, but got $pattern"
2486 local stripes=$($LFS getstripe -c $name)
2487 if [ $OSTCOUNT -gt 2 ]; then
2488 [ $stripes -eq 3 ] ||
2489 error "(5.3.1) expect the stripe count is 3, but got $stripes"
2491 [ $stripes -eq 2 ] ||
2492 error "(5.3.2) expect the stripe count is 2, but got $stripes"
2495 local size=$(stat $name | awk '/Size:/ { print $2 }')
2496 [ $size -eq $((4096 * $bcount)) ] ||
2497 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
# A hole-free file must support read, append, chown, touch and unlink.
2499 cat $name > /dev/null || error "(5.5) cannot read $name"
2501 echo "dummy" >> $name || error "(5.6) cannot write $name"
2503 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
2505 touch $name || error "(5.8) cannot touch $name"
2507 rm -f $name || error "(5.9) cannot unlink $name"
#
# ---- f1: lost stripe0, so a hole is expected at the start of the file --
2510 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
2512 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2513 if [ $OSTCOUNT -gt 2 ]; then
2514 echo "Check $name, it contains the old f1's stripe1 and stripe2"
2516 echo "Check $name, it contains the old f1's stripe1"
2519 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
2521 pattern=0x$($LFS getstripe -L $name)
2522 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2523 error "(6.2) expect pattern flag hole, but got $pattern"
2525 stripes=$($LFS getstripe -c $name)
2526 if [ $OSTCOUNT -gt 2 ]; then
2527 [ $stripes -eq 3 ] ||
2528 error "(6.3.1) expect the stripe count is 3, but got $stripes"
2530 [ $stripes -eq 2 ] ||
2531 error "(6.3.2) expect the stripe count is 2, but got $stripes"
2534 size=$(stat $name | awk '/Size:/ { print $2 }')
2535 [ $size -eq $((4096 * $bcount)) ] ||
2536 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
# A plain sequential read must fail on the hole.
2538 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# With conv=sync,noerror dd continues past read errors and pads the failed
# blocks, so counting "Input/output error" lines counts unreadable 4096-byte
# blocks; 256 matches the "256 blocks on the stripe0" note above.
2540 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2541 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2544 [ $failures -eq 256 ] ||
2545 error "(6.6) expect 256 IO failures, but get $failures"
# The padded dump must still have the full original length.
2547 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2548 [ $size -eq $((4096 * $bcount)) ] ||
2549 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Offset 0 falls in the missing stripe0; seek=300 (300 * 4096 bytes, past
# the first 1M stripe unit) falls in a stripe that still exists.
2551 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
2552 error "(6.8) write to the LOV EA hole should fail"
2554 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
2555 error "(6.9) write to normal stripe should NOT fail"
2557 echo "foo" >> $name && error "(6.10) append write $name should fail"
# Metadata-only operations and unlink must still work on a file with a hole.
2559 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
2561 touch $name || error "(6.12) cannot touch $name"
2563 rm -f $name || error "(6.13) cannot unlink $name"
#
# ---- f2: lost stripe1. With more than 2 OSTs that is a hole in the middle
# (checks 7.x.1 / 7.6-7.8.x); the remaining checks (7.x.2) expect a hole-free
# single-stripe file and presumably belong to the else branch, which is not
# visible here.
2566 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
2568 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
2569 if [ $OSTCOUNT -gt 2 ]; then
2570 echo "Check $name, it contains the old f2's stripe0 and stripe2"
2572 echo "Check $name, it contains the old f2's stripe0"
2575 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
2577 pattern=0x$($LFS getstripe -L $name)
2578 stripes=$($LFS getstripe -c $name)
2579 size=$(stat $name | awk '/Size:/ { print $2 }')
2580 if [ $OSTCOUNT -gt 2 ]; then
2581 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2582 error "(7.2.1) expect pattern flag hole, but got $pattern"
2584 [ $stripes -eq 3 ] ||
2585 error "(7.3.1) expect the stripe count is 3, but got $stripes"
2587 [ $size -eq $((4096 * $bcount)) ] ||
2588 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
2590 cat $name > /dev/null &&
2591 error "(7.5.1) normal read $name should fail"
2593 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2594 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2596 [ $failures -eq 256 ] ||
2597 error "(7.6) expect 256 IO failures, but get $failures"
2599 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2600 [ $size -eq $((4096 * $bcount)) ] ||
2601 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
# Mirror image of the f1 write checks: here seek=300 lands in the missing
# stripe1 and offset 0 lands in the surviving stripe0.
2603 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
2604 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
2606 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
2607 error "(7.8.1) write to normal stripe should NOT fail"
2609 echo "foo" >> $name &&
2610 error "(7.8.3) append write $name should fail"
2612 chown $RUNAS_ID:$RUNAS_GID $name ||
2613 error "(7.9.1) cannot chown on $name"
2615 touch $name || error "(7.10.1) cannot touch $name"
2617 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2618 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
2620 [ $stripes -eq 1 ] ||
2621 error "(7.3.2) expect the stripe count is 1, but got $stripes"
2624 [ $size -eq $((4096 * (256 + 0))) ] ||
2625 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
2627 cat $name > /dev/null || error "(7.5.2) cannot read $name"
2629 echo "dummy" >> $name || error "(7.8.2) cannot write $name"
2631 chown $RUNAS_ID:$RUNAS_GID $name ||
2632 error "(7.9.2) cannot chown on $name"
2634 touch $name || error "(7.10.2) cannot touch $name"
2637 rm -f $name || error "(7.11) cannot unlink $name"
# f3 was only created with more than 2 OSTs; otherwise the test ends here.
2639 [ $OSTCOUNT -le 2 ] && return
#
# ---- f3: lost its last stripe, so the result is a shorter hole-free file
2642 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
2644 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2645 echo "Check $name, which contains the old f3's stripe0 and stripe1"
2647 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
2649 pattern=0x$($LFS getstripe -L $name)
2650 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2651 error "(8.2) NOT expect pattern flag hole, but got $pattern"
2653 stripes=$($LFS getstripe -c $name)
2654 # LFSCK does not know the old f3 had 3 stripes.
2655 # It only tries to find as much as possible.
2656 # The stripe count depends on the last stripe's offset.
2657 [ $stripes -eq 2 ] ||
2658 error "(8.3) expect the stripe count is 2, but got $stripes"
2660 size=$(stat $name | awk '/Size:/ { print $2 }')
# Expected size is 512 blocks: the two surviving 256-block stripes.
2662 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
2663 error "(8.4) expect the size $((4096 * 512)), but got $size"
2665 cat $name > /dev/null || error "(8.5) cannot read $name"
2667 echo "dummy" >> $name || error "(8.6) cannot write $name"
2669 chown $RUNAS_ID:$RUNAS_GID $name ||
2670 error "(8.7) cannot chown on $name"
2672 touch $name || error "(8.8) cannot touch $name"
2674 rm -f $name || error "(8.9) cannot unlink $name"
2676 run_test 20 "Handle the orphan with dummy LOV EA slot properly"
# Body of the case registered at the end of this block as 21: starting LFSCK
# without a -t type must start both the namespace and the layout component,
# and lfsck_stop without a type must be accepted.
# NOTE(review): this listing does not show every line of the original;
# comments cover only the visible statements.
#
# The skip path uses `return`, like the other skip guards in this file's
# test bodies: `exit 0` here would terminate the whole script and bypass the
# teardown statements that follow the last run_test.
2679 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
2680 skip "ignore the test if MDS is older than 2.5.59" && return
2682 check_mount_and_prep
# 100 files give the scan something to walk.
2683 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
# No -t option is passed. -s 1 presumably throttles the scan so that it is
# still in phase 1 when sampled below - confirm against lctl lfsck_start.
2685 echo "Start all LFSCK components by default (-s 1)"
2686 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
2687 error "Fail to start LFSCK"
# Both components are sampled once and must report 'scanning-phase1'.
2689 echo "namespace LFSCK should be in 'scanning-phase1' status"
2690 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
2691 [ "$STATUS" == "scanning-phase1" ] ||
2692 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
2694 echo "layout LFSCK should be in 'scanning-phase1' status"
2695 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
2696 [ "$STATUS" == "scanning-phase1" ] ||
2697 error "Expect layout 'scanning-phase1', but got '$STATUS'"
2699 echo "Stop all LFSCK components by default"
2700 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
2701 error "Fail to stop LFSCK"
2703 run_test 21 "run all LFSCK components by default"
# Script teardown. Drop the lfsck debug flag again (the script adds it with
# debug=+lfsck near the top); a failure here is deliberately ignored.
2705 $LCTL set_param debug=-lfsck > /dev/null || true
# Put back the values stashed in SAVED_* at the top of the script, before
# MDSSIZE/OSTSIZE/OSTCOUNT were adjusted for this test run.
2707 # restore MDS/OST size
2708 MDSSIZE=${SAVED_MDSSIZE}
2709 OSTSIZE=${SAVED_OSTSIZE}
2710 OSTCOUNT=${SAVED_OSTCOUNT}
2712 # cleanup the system at last