# Test selection, framework loading and version-based exclusions for the
# LFSCK sanity tests.
3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
10 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
11 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
12 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# Default LUSTRE to the parent of this script's directory, then source the
# shared test framework and the configuration file chosen via CONFIG/NAME.
14 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
15 . $LUSTRE/tests/test-framework.sh
17 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
20 require_dsh_mds || exit 0
# Keep copies of the incoming size/count settings; OSTCOUNT is capped below.
22 SAVED_MDSSIZE=${MDSSIZE}
23 SAVED_OSTSIZE=${OSTSIZE}
24 SAVED_OSTCOUNT=${OSTCOUNT}
25 # Use a small MDS + OST size to speed up formatting.
26 # Do not make MDSSIZE/OSTSIZE too small, since that affects the default journal size.
29 # There is no need for many OSTs; capping the count reduces the format/start/stop overhead.
30 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
32 # Build up a clean test environment.
# Version gates: each check appends the tests that the running server is too
# old to support to ALWAYS_EXCEPT.
# NOTE(review): the first chain below ends in '&&'; its continuation is not
# visible in this listing.
36 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
37 skip "Need MDS version at least 2.3.60" && check_and_cleanup_lustre &&
40 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
41 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
43 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
44 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
46 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
47 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23"
# Enable lfsck debug messages; '|| true' makes a failure here non-fatal.
51 $LCTL set_param debug=+lfsck > /dev/null || true
# Device names of the first MDT and the first OST of the test filesystem.
53 MDT_DEV="${FSNAME}-MDT0000"
54 OST_DEV="${FSNAME}-OST0000"
55 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Command templates. They are expanded unquoted by the tests (for example
# "$START_NAMESPACE -r"), so extra lctl options can simply be appended.
56 START_NAMESPACE="do_facet $SINGLEMDS \
57 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
58 START_LAYOUT="do_facet $SINGLEMDS \
59 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
60 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
61 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Templates that dump the LFSCK state files the tests parse with awk.
62 SHOW_NAMESPACE="do_facet $SINGLEMDS \
63 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
64 SHOW_LAYOUT="do_facet $SINGLEMDS \
65 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
66 SHOW_LAYOUT_ON_OST="do_facet ost1 \
67 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# MDT mount options used when tests restart the MDS: with and without
# the "noscrub" option.
68 MOUNT_OPTS_SCRUB="-o user_xattr"
69 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
# File-population helper body (its header is not shown in this listing).
# Reads $nfiles, $ndirs and $igif: copies the test scripts into $DIR/$tdir,
# creates $ndirs directories "d<N>" and files "f<N>", fills each directory
# with $nfiles files, and creates $ndirs directories "e<N>".
# When $igif is non-empty, fail_loc 0x1504 (OBD_FAIL_FID_IGIF) is armed on the
# MDS for the duration and an extra "dummy" file is created before it is
# cleared again.
78 echo "preparing... $nfiles * $ndirs files will be created $(date)."
# Quote $igif and test with -n: the unquoted "! -z" form only worked for an
# empty value through the one-argument test quirk and broke on whitespace.
79 if [ -n "$igif" ]; then
80 #define OBD_FAIL_FID_IGIF 0x1504
81 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
84 cp $LUSTRE/tests/*.sh $DIR/$tdir/
85 if [ $ndirs -gt 0 ]; then
86 createmany -d $DIR/$tdir/d $ndirs
87 createmany -m $DIR/$tdir/f $ndirs
88 if [ $nfiles -gt 0 ]; then
89 for ((i = 0; i < $ndirs; i++)); do
90 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
91 /dev/null || error "createmany $nfiles"
94 createmany -d $DIR/$tdir/e $ndirs
97 if [ -n "$igif" ]; then
98 touch $DIR/$tdir/dummy
99 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
102 echo "prepared $(date)."
# Body of test 0 (registered by the run_test line at the end): drive the
# namespace LFSCK by hand -- start with reset, stop, resume, reset again --
# and check the status reported by $SHOW_NAMESPACE after each step.
108 #define OBD_FAIL_LFSCK_DELAY1 0x1600
109 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
110 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
112 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
# With fail_loc 0x1600 armed the scan is expected to still be in phase 1.
114 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
115 [ "$STATUS" == "scanning-phase1" ] ||
116 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
118 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
120 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
121 [ "$STATUS" == "stopped" ] ||
122 error "(6) Expect 'stopped', but got '$STATUS'"
# Start again without -r (no reset) and expect phase 1 once more.
124 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
126 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
127 [ "$STATUS" == "scanning-phase1" ] ||
128 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fault injection and wait for the status line to read "completed".
# wait_update_facet is defined outside this listing; presumably 32 is its
# timeout in seconds -- TODO confirm.
130 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
131 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
132 mdd.${MDT_DEV}.lfsck_namespace |
133 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
135 error "(9) unexpected status"
# Nothing was corrupted in this test, so updated_phase1 must be 0.
138 local repaired=$($SHOW_NAMESPACE |
139 awk '/^updated_phase1/ { print $2 }')
140 [ $repaired -eq 0 ] ||
141 error "(10) Expect nothing to be repaired, but got: $repaired"
# A second complete run must raise success_count by exactly one.
143 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
144 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
145 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
146 mdd.${MDT_DEV}.lfsck_namespace |
147 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
149 error "(12) unexpected status"
152 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
153 [ $((scanned1 + 1)) -eq $scanned2 ] ||
154 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
156 echo "stopall, should NOT crash LU-3649"
157 stopall || error "(14) Fail to stopall"
159 run_test 0 "Control LFSCK manually"
# Body of test 1a: create "dummy" while fail_loc 0x1501 (OBD_FAIL_FID_INDIR)
# is armed, run the namespace LFSCK and require exactly one repair.
# Skipped unless the MDS backend is ldiskfs.
162 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
163 skip "OI Scrub not implemented for ZFS" && return
167 #define OBD_FAIL_FID_INDIR 0x1501
168 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
169 touch $DIR/$tdir/dummy
171 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
173 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
174 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
175 mdd.${MDT_DEV}.lfsck_namespace |
176 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
178 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server output has no dirent_repaired line.
181 local repaired=$($SHOW_NAMESPACE |
182 awk '/^dirent_repaired/ { print $2 }')
183 # for interop with old server
184 [ -z "$repaired" ] &&
185 repaired=$($SHOW_NAMESPACE |
186 awk '/^updated_phase1/ { print $2 }')
188 [ $repaired -eq 1 ] ||
189 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
191 mount_client $MOUNT || error "(6) Fail to start client!"
# List the directory with fail_loc 0x1505 (OBD_FAIL_FID_LOOKUP) armed; the
# listing must still succeed after the repair.
193 #define OBD_FAIL_FID_LOOKUP 0x1505
194 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
195 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
197 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
199 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Body of test 1b: create "dummy" while fail_loc 0x1502 (OBD_FAIL_FID_INLMA)
# is armed, run the namespace LFSCK with 0x1506 (OBD_FAIL_FID_NOLMA) armed and
# require exactly one repair. Skipped unless the MDS backend is ldiskfs.
203 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
204 skip "OI Scrub not implemented for ZFS" && return
208 #define OBD_FAIL_FID_INLMA 0x1502
209 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
210 touch $DIR/$tdir/dummy
212 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
214 #define OBD_FAIL_FID_NOLMA 0x1506
215 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
216 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
217 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
218 mdd.${MDT_DEV}.lfsck_namespace |
219 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
221 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server output has no dirent_repaired line.
224 local repaired=$($SHOW_NAMESPACE |
225 awk '/^dirent_repaired/ { print $2 }')
226 # for interop with old server
227 [ -z "$repaired" ] &&
228 repaired=$($SHOW_NAMESPACE |
229 awk '/^updated_phase1/ { print $2 }')
231 [ $repaired -eq 1 ] ||
232 error "(5) Fail to repair missed FID-in-LMA: $repaired"
234 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
235 mount_client $MOUNT || error "(6) Fail to start client!"
# stat must succeed while fail_loc 0x1505 (OBD_FAIL_FID_LOOKUP) is armed.
237 #define OBD_FAIL_FID_LOOKUP 0x1505
238 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
239 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
241 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
243 run_test 1b "LFSCK can find out and repair missed FID-in-LMA"
# Body of test 2a: create "dummy" while fail_loc 0x1603
# (OBD_FAIL_LFSCK_LINKEA_CRASH) is armed, run the namespace LFSCK, require
# exactly one linkEA repair, then verify the link count and that
# path2fid/fid2path round-trips to the same path.
248 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
249 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
250 touch $DIR/$tdir/dummy
252 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
254 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
255 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
256 mdd.${MDT_DEV}.lfsck_namespace |
257 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
259 error "(4) unexpected status"
# Prefer the linkea_repaired counter; fall back to updated_phase2 when the
# server output has no linkea_repaired line.
262 local repaired=$($SHOW_NAMESPACE |
263 awk '/^linkea_repaired/ { print $2 }')
264 # for interop with old server
265 [ -z "$repaired" ] &&
266 repaired=$($SHOW_NAMESPACE |
267 awk '/^updated_phase2/ { print $2 }')
269 [ $repaired -eq 1 ] ||
270 error "(5) Fail to repair crashed linkEA: $repaired"
272 mount_client $MOUNT || error "(6) Fail to start client!"
274 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
275 error "(7) Fail to stat $DIR/$tdir/dummy"
277 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
278 local dummyname=$($LFS fid2path $DIR $dummyfid)
279 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
280 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
282 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Body of test 2b: create "dummy" while fail_loc 0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE) is armed, run the namespace LFSCK, require
# updated_phase2 to be exactly 1, then verify the link count and that
# path2fid/fid2path round-trips to the same path.
288 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
289 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
290 touch $DIR/$tdir/dummy
292 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
294 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
295 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
296 mdd.${MDT_DEV}.lfsck_namespace |
297 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
299 error "(4) unexpected status"
302 local repaired=$($SHOW_NAMESPACE |
303 awk '/^updated_phase2/ { print $2 }')
304 [ $repaired -eq 1 ] ||
305 error "(5) Fail to repair crashed linkEA: $repaired"
307 mount_client $MOUNT || error "(6) Fail to start client!"
309 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
310 error "(7) Fail to stat $DIR/$tdir/dummy"
312 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
313 local dummyname=$($LFS fid2path $DIR $dummyfid)
314 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
315 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
317 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Body of test 2c: create "dummy" while fail_loc 0x1605
# (OBD_FAIL_LFSCK_LINKEA_MORE2) is armed, run the namespace LFSCK, require
# updated_phase2 to be exactly 1, then verify the link count and that
# path2fid/fid2path round-trips to the same path.
323 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
324 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
325 touch $DIR/$tdir/dummy
327 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
329 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
330 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
331 mdd.${MDT_DEV}.lfsck_namespace |
332 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
334 error "(4) unexpected status"
337 local repaired=$($SHOW_NAMESPACE |
338 awk '/^updated_phase2/ { print $2 }')
339 [ $repaired -eq 1 ] ||
340 error "(5) Fail to repair crashed linkEA: $repaired"
342 mount_client $MOUNT || error "(6) Fail to start client!"
344 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
345 error "(7) Fail to stat $DIR/$tdir/dummy"
347 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
348 local dummyname=$($LFS fid2path $DIR $dummyfid)
349 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
350 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
352 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Body of test 2d: create "dummy" while fail_loc 0x161d
# (OBD_FAIL_LFSCK_NO_LINKEA) is armed, run the namespace LFSCK, require
# linkea_repaired to be exactly 1, then verify the link count and that
# path2fid/fid2path round-trips to the same path.
358 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
359 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
360 touch $DIR/$tdir/dummy
362 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
364 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
365 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
366 mdd.${MDT_DEV}.lfsck_namespace |
367 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
369 error "(4) unexpected status"
372 local repaired=$($SHOW_NAMESPACE |
373 awk '/^linkea_repaired/ { print $2 }')
374 [ $repaired -eq 1 ] ||
375 error "(5) Fail to repair crashed linkEA: $repaired"
377 mount_client $MOUNT || error "(6) Fail to start client!"
379 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
380 error "(7) Fail to stat $DIR/$tdir/dummy"
382 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
383 local dummyname=$($LFS fid2path $DIR $dummyfid)
384 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
385 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
387 run_test 2d "LFSCK can recover the missed linkEA entry"
# Body of test 2e: needs at least two MDSes. Creates d0 with "-i 1" and,
# with fail_loc 0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH) armed, d0/d1 with "-i 0";
# then runs the namespace LFSCK with "-r -A" and requires exactly one linkEA
# repair plus a correct path2fid/fid2path round-trip for d0/d1.
391 [ $MDSCOUNT -lt 2 ] &&
392 skip "We need at least 2 MDSes for this test" && return
396 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
398 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
399 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
400 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
401 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
403 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
404 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
405 mdd.${MDT_DEV}.lfsck_namespace |
406 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
408 error "(4) unexpected status"
411 local repaired=$($SHOW_NAMESPACE |
412 awk '/^linkea_repaired/ { print $2 }')
413 [ $repaired -eq 1 ] ||
414 error "(5) Fail to repair crashed linkEA: $repaired"
416 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
417 local name=$($LFS fid2path $DIR $fid)
418 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
419 error "(6) Fail to repair linkEA: $fid $name"
421 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Body of test 3: build hard-linked files, two of the links created while
# linkEA fault injections (0x1603, then 0x1604) are armed, run the namespace
# LFSCK and require checked_phase2 >= 4 and multiple_linked_repaired >= 2.
427 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
428 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
429 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
431 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
432 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
433 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
435 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
436 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
437 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
439 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
440 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
441 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
443 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
445 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
446 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
447 mdd.${MDT_DEV}.lfsck_namespace |
448 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
450 error "(10) unexpected status"
453 local checked=$($SHOW_NAMESPACE |
454 awk '/^checked_phase2/ { print $2 }')
455 [ $checked -ge 4 ] ||
456 error "(11) Fail to check multiple-linked object: $checked"
458 local repaired=$($SHOW_NAMESPACE |
459 awk '/^multiple_linked_repaired/ { print $2 }')
460 [ $repaired -ge 2 ] ||
461 error "(12) Fail to repair multiple-linked object: $repaired"
463 run_test 3 "LFSCK can verify multiple-linked objects"
# Body of test 4: stop the client and MDS, run mds_backup_restore, restart the
# MDS with the "noscrub" mount options, then run the namespace LFSCK and check
# that (a) the flags read "inconsistent" while scanning, (b) the flags are
# empty once completed, and (c) at least 9 dirents were repaired.
# Skipped unless the MDS backend is ldiskfs.
467 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
468 skip "OI Scrub not implemented for ZFS" && return
471 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
472 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
474 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
475 echo "start $SINGLEMDS with disabling OI scrub"
476 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
477 error "(2) Fail to start MDS!"
479 #define OBD_FAIL_LFSCK_DELAY2 0x1601
480 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
481 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
482 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
483 mdd.${MDT_DEV}.lfsck_namespace |
484 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
486 error "(5) unexpected status"
489 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
490 [ "$STATUS" == "scanning-phase1" ] ||
491 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
493 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
494 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
495 mdd.${MDT_DEV}.lfsck_namespace |
496 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
498 error "(7) unexpected status"
# FLAGS is declared local so it does not leak into the global scope.
501 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
502 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server output has no dirent_repaired line.
504 local repaired=$($SHOW_NAMESPACE |
505 awk '/^dirent_repaired/ { print $2 }')
506 # for interop with old server
507 [ -z "$repaired" ] &&
508 repaired=$($SHOW_NAMESPACE |
509 awk '/^updated_phase1/ { print $2 }')
511 [ $repaired -ge 9 ] ||
512 error "(9) Fail to re-generate FID-in-dirent: $repaired"
514 mount_client $MOUNT || error "(10) Fail to start client!"
516 #define OBD_FAIL_FID_LOOKUP 0x1505
517 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
518 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
519 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
521 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Body of test 5: same flow as a backup/restore check, but calls
# mds_backup_restore with an extra argument "1", expects the flags
# "inconsistent,upgrade" while scanning, at least 2 dirent repairs, and a
# correct path2fid/fid2path round-trip for "dummy" afterwards.
# Skipped unless the MDS backend is ldiskfs.
525 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
526 skip "OI Scrub not implemented for ZFS" && return
529 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
530 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
532 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
533 echo "start $SINGLEMDS with disabling OI scrub"
534 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
535 error "(2) Fail to start MDS!"
537 #define OBD_FAIL_LFSCK_DELAY2 0x1601
538 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
539 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
540 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
541 mdd.${MDT_DEV}.lfsck_namespace |
542 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
544 error "(5) unexpected status"
547 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
548 [ "$STATUS" == "scanning-phase1" ] ||
549 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
551 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
552 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
553 mdd.${MDT_DEV}.lfsck_namespace |
554 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
556 error "(7) unexpected status"
# FLAGS is declared local so it does not leak into the global scope.
559 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
560 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server output has no dirent_repaired line.
562 local repaired=$($SHOW_NAMESPACE |
563 awk '/^dirent_repaired/ { print $2 }')
564 # for interop with old server
565 [ -z "$repaired" ] &&
566 repaired=$($SHOW_NAMESPACE |
567 awk '/^updated_phase1/ { print $2 }')
569 [ $repaired -ge 2 ] ||
570 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
572 mount_client $MOUNT || error "(10) Fail to start client!"
574 #define OBD_FAIL_FID_LOOKUP 0x1505
575 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
576 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
578 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
580 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
581 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
582 local dummyname=$($LFS fid2path $DIR $dummyfid)
583 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
584 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
586 run_test 5 "LFSCK can handle IGIF object upgrading"
# Body of test 6a: start the namespace LFSCK, force it to fail with fail_loc
# 0x80001608 (OBD_FAIL_LFSCK_FATAL1), record last_checkpoint_position, restart
# without reset and require latest_start_position to be larger.
# NOTE(review): the POS0/POS1 pipelines end in '|'; their final stage is not
# visible in this listing.
591 #define OBD_FAIL_LFSCK_DELAY1 0x1600
592 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
593 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
595 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
596 [ "$STATUS" == "scanning-phase1" ] ||
597 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
599 # Sleep 3 sec to guarantee at least one object processed by LFSCK
601 # Fail the LFSCK to guarantee there is at least one checkpoint
602 #define OBD_FAIL_LFSCK_FATAL1 0x1608
603 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
604 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
605 mdd.${MDT_DEV}.lfsck_namespace |
606 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
608 error "(4) unexpected status"
611 local POS0=$($SHOW_NAMESPACE |
612 awk '/^last_checkpoint_position/ { print $2 }' |
615 #define OBD_FAIL_LFSCK_DELAY1 0x1600
616 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
617 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
619 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
620 [ "$STATUS" == "scanning-phase1" ] ||
621 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
623 local POS1=$($SHOW_NAMESPACE |
624 awk '/^latest_start_position/ { print $2 }' |
626 [[ $POS0 -lt $POS1 ]] ||
627 error "(7) Expect larger than: $POS0, but got $POS1"
629 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
630 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
631 mdd.${MDT_DEV}.lfsck_namespace |
632 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
634 error "(8) unexpected status"
637 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Body of test 6b: like 6a but using fail_loc 0x1601 / 0x80001609. Records
# fields 2 and 4 of last_checkpoint_position (O_POS0, D_POS0), restarts
# without reset, and compares them with the same fields of
# latest_start_position. If either fourth field is "N/A" the second fields
# are compared; the fourth fields are compared in the other visible branch.
# NOTE(review): the O_POS0/O_POS1 pipelines end in '|'; their final stage and
# the else/fi lines are not visible in this listing.
642 #define OBD_FAIL_LFSCK_DELAY2 0x1601
643 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
644 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
646 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
647 [ "$STATUS" == "scanning-phase1" ] ||
648 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
650 # Sleep 5 sec to guarantee that we are in the directory scanning
652 # Fail the LFSCK to guarantee there is at least one checkpoint
653 #define OBD_FAIL_LFSCK_FATAL2 0x1609
654 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
655 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
656 mdd.${MDT_DEV}.lfsck_namespace |
657 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
659 error "(4) unexpected status"
662 local O_POS0=$($SHOW_NAMESPACE |
663 awk '/^last_checkpoint_position/ { print $2 }' |
666 local D_POS0=$($SHOW_NAMESPACE |
667 awk '/^last_checkpoint_position/ { print $4 }')
669 #define OBD_FAIL_LFSCK_DELAY2 0x1601
670 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
671 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
673 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
674 [ "$STATUS" == "scanning-phase1" ] ||
675 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
677 local O_POS1=$($SHOW_NAMESPACE |
678 awk '/^latest_start_position/ { print $2 }' |
680 local D_POS1=$($SHOW_NAMESPACE |
681 awk '/^latest_start_position/ { print $4 }')
683 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
684 [[ $O_POS0 -lt $O_POS1 ]] ||
685 error "(7.1) $O_POS1 is not larger than $O_POS0"
687 [[ $D_POS0 -lt $D_POS1 ]] ||
688 error "(7.2) $D_POS1 is not larger than $D_POS0"
691 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
692 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
693 mdd.${MDT_DEV}.lfsck_namespace |
694 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
696 error "(8) unexpected status"
699 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Body of test 7a: start the namespace LFSCK with fail_loc 0x1601 armed,
# confirm it is in phase 1, stop and restart the MDS without stopping LFSCK
# first, then expect the status to reach "completed" without any explicit
# lfsck_start after the restart.
706 #define OBD_FAIL_LFSCK_DELAY2 0x1601
707 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
708 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
710 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
711 [ "$STATUS" == "scanning-phase1" ] ||
712 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
714 # Sleep 3 sec to guarantee at least one object processed by LFSCK
716 echo "stop $SINGLEMDS"
717 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
719 echo "start $SINGLEMDS"
720 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
721 error "(5) Fail to start MDS!"
723 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
724 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
725 mdd.${MDT_DEV}.lfsck_namespace |
726 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
728 error "(6) unexpected status"
731 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Body of test 7b: create 20 "dummy<N>" files with fail_loc 0x1604 armed,
# start the namespace LFSCK with 0x1602 (OBD_FAIL_LFSCK_DELAY3) armed and wait
# for "scanning-phase2"; then restart the MDS and expect the status to reach
# "completed" without any explicit lfsck_start after the restart.
737 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
738 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
739 for ((i = 0; i < 20; i++)); do
740 touch $DIR/$tdir/dummy${i}
743 #define OBD_FAIL_LFSCK_DELAY3 0x1602
744 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
745 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
746 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
747 mdd.${MDT_DEV}.lfsck_namespace |
748 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
750 error "(4) unexpected status"
753 echo "stop $SINGLEMDS"
754 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
756 echo "start $SINGLEMDS"
757 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
758 error "(6) Fail to start MDS!"
760 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
761 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
762 mdd.${MDT_DEV}.lfsck_namespace |
763 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
765 error "(7) unexpected status"
768 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Body of test 8: walk the namespace LFSCK through the statuses the checks
# below expect, in order: init -> scanning-phase1 -> stopped ->
# scanning-phase1 -> failed -> scanning-phase1 -> crashed (after an MDS
# restart) -> scanning-phase1 -> paused (after another restart) ->
# scanning-phase2 -> completed.
# NOTE(review): $timer is used in the recovery loops but its initialisation
# and increment are not visible in this listing.
773 formatall > /dev/null
779 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
780 [ "$STATUS" == "init" ] ||
781 error "(2) Expect 'init', but got '$STATUS'"
783 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
784 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
785 mkdir $DIR/$tdir/crashed
787 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
788 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
789 for ((i = 0; i < 5; i++)); do
790 touch $DIR/$tdir/dummy${i}
793 umount_client $MOUNT || error "(3) Fail to stop client!"
795 #define OBD_FAIL_LFSCK_DELAY2 0x1601
796 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
797 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
799 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
800 [ "$STATUS" == "scanning-phase1" ] ||
801 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
803 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
805 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
806 [ "$STATUS" == "stopped" ] ||
807 error "(7) Expect 'stopped', but got '$STATUS'"
809 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
811 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
812 [ "$STATUS" == "scanning-phase1" ] ||
813 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
815 #define OBD_FAIL_LFSCK_FATAL2 0x1609
816 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
817 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
818 mdd.${MDT_DEV}.lfsck_namespace |
819 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
821 error "(10) unexpected status"
824 #define OBD_FAIL_LFSCK_DELAY1 0x1600
825 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
826 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
828 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
829 [ "$STATUS" == "scanning-phase1" ] ||
830 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
832 #define OBD_FAIL_LFSCK_CRASH 0x160a
833 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
836 echo "stop $SINGLEMDS"
837 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
839 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
840 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
842 echo "start $SINGLEMDS"
843 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
844 error "(14) Fail to start MDS!"
# Poll the MDT recovery_status until it no longer reads RECOVERING, bounded
# by max_recovery_time.
846 local timeout=$(max_recovery_time)
849 while [ $timer -lt $timeout ]; do
850 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
851 mdt.${MDT_DEV}.recovery_status |
852 awk '/^status/ { print \\\$2 }'")
853 [ "$STATUS" != "RECOVERING" ] && break;
858 [ $timer != $timeout ] ||
859 error "(14.1) recovery timeout"
861 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
862 [ "$STATUS" == "crashed" ] ||
863 error "(15) Expect 'crashed', but got '$STATUS'"
865 #define OBD_FAIL_LFSCK_DELAY2 0x1601
866 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
867 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
869 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
870 [ "$STATUS" == "scanning-phase1" ] ||
871 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
873 echo "stop $SINGLEMDS"
874 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
876 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
877 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
879 echo "start $SINGLEMDS"
880 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
881 error "(19) Fail to start MDS!"
# Second recovery wait, same polling loop as above.
884 while [ $timer -lt $timeout ]; do
885 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
886 mdt.${MDT_DEV}.recovery_status |
887 awk '/^status/ { print \\\$2 }'")
888 [ "$STATUS" != "RECOVERING" ] && break;
893 [ $timer != $timeout ] ||
894 error "(19.1) recovery timeout"
896 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
897 [ "$STATUS" == "paused" ] ||
898 error "(20) Expect 'paused', but got '$STATUS'"
900 #define OBD_FAIL_LFSCK_DELAY3 0x1602
901 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
903 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
904 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
905 mdd.${MDT_DEV}.lfsck_namespace |
906 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
908 error "(22) unexpected status"
911 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
912 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
913 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
915 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
916 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
917 mdd.${MDT_DEV}.lfsck_namespace |
918 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
920 error "(24) unexpected status"
923 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
924 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
926 run_test 8 "LFSCK state machine"
# Body of test 9a: start the namespace LFSCK with a speed limit of
# BASE_SPEED1, check average_speed_phase1 against an upper bound, raise the
# limit to BASE_SPEED2 through lfsck_speed_limit, check the combined average
# against lower and upper bounds (20% margin either way), then remove the
# limit and wait for "completed".
# Skipped when /proc/cpuinfo lists no "processor ... : 1" entry.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are used below but their
# definitions are not visible in this listing.
929 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
930 skip "Testing on UP system, the speed may be inaccurate."
936 local BASE_SPEED1=100
938 $START_NAMESPACE -r -s $BASE_SPEED1 || error "(3) Fail to start LFSCK!"
# STATUS is declared local so it does not leak into the global scope.
941 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
942 [ "$STATUS" == "scanning-phase1" ] ||
943 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
945 local SPEED=$($SHOW_NAMESPACE |
946 awk '/^average_speed_phase1/ { print $2 }')
948 # There may be time error, normally it should be less than 2 seconds.
949 # We allow another 20% schedule error.
951 # MAX_MARGIN = 1.2 = 12 / 10
952 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
953 RUN_TIME1 * 12 / 10))
954 [ $SPEED -lt $MAX_SPEED ] ||
955 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
958 local BASE_SPEED2=300
960 do_facet $SINGLEMDS \
961 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
964 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase1/ { print $2 }')
965 # MIN_MARGIN = 0.8 = 8 / 10
966 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
967 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
968 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
969 # Account for slow ZFS performance - LU-4934
# The fstype is a string, so compare it with '=='. The previous '-eq' is an
# integer operator: it always failed with "integer expression expected", so
# the zfs exemption never took effect.
970 [ $SPEED -gt $MIN_SPEED ] || [ "$(facet_fstype $SINGLEMDS)" == zfs ] ||
971 error "(5) Got speed $SPEED, expected more than $MIN_SPEED"
973 # MAX_MARGIN = 1.2 = 12 / 10
974 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
975 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
976 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
977 [ $SPEED -lt $MAX_SPEED ] ||
978 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
980 do_facet $SINGLEMDS \
981 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
983 wait_update_facet $SINGLEMDS \
984 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace|\
985 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
986 error "(7) Failed to get expected 'completed'"
988 run_test 9a "LFSCK speed control (1)"
# Body of test 9b: create 50 directories with 50 files each while fail_loc
# 0x1604 is armed, run a first LFSCK pass with 0x160c
# (OBD_FAIL_LFSCK_NO_DOUBLESCAN) armed until it reports "stopped", then
# restart it with a speed limit and check average_speed_phase2 against the
# same lower/upper bounds (20% margin) before and after raising the limit.
# Skipped when /proc/cpuinfo lists no "processor ... : 1" entry.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are used below but their
# definitions are not visible in this listing.
991 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
992 skip "Testing on UP system, the speed may be inaccurate."
998 echo "Preparing another 50 * 50 files (with error) at $(date)."
999 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1000 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1001 createmany -d $DIR/$tdir/d 50
1002 createmany -m $DIR/$tdir/f 50
1003 for ((i = 0; i < 50; i++)); do
1004 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
1007 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1008 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1009 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1010 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1011 mdd.${MDT_DEV}.lfsck_namespace |
1012 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1014 error "(5) unexpected status"
1017 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1018 echo "Prepared at $(date)."
1020 local BASE_SPEED1=50
1022 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
# STATUS is declared local so it does not leak into the global scope.
1025 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1026 [ "$STATUS" == "scanning-phase2" ] ||
1027 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1029 local SPEED=$($SHOW_NAMESPACE |
1030 awk '/^average_speed_phase2/ { print $2 }')
1031 # There may be time error, normally it should be less than 2 seconds.
1032 # We allow another 20% schedule error.
1034 # MAX_MARGIN = 1.2 = 12 / 10
1035 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1036 RUN_TIME1 * 12 / 10))
1037 [ $SPEED -lt $MAX_SPEED ] ||
1038 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1040 # adjust speed limit
1041 local BASE_SPEED2=150
1043 do_facet $SINGLEMDS \
1044 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1047 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1048 # MIN_MARGIN = 0.8 = 8 / 10
1049 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1050 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1051 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# zfs is exempt from the minimum-speed check. The fstype is a string, so
# compare it with '=='. The previous '-eq' is an integer operator: it always
# failed with "integer expression expected", so the exemption never applied.
1052 [ $SPEED -gt $MIN_SPEED ] || [ "$(facet_fstype $SINGLEMDS)" == zfs ] ||
1053 error "(9) Got speed $SPEED, expected more than $MIN_SPEED"
1055 # MAX_MARGIN = 1.2 = 12 / 10
1056 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1057 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1058 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1059 [ $SPEED -lt $MAX_SPEED ] ||
1060 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
1062 do_facet $SINGLEMDS \
1063 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1064 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1065 mdd.${MDT_DEV}.lfsck_namespace |
1066 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1068 error "(11) unexpected status"
1071 run_test 9b "LFSCK speed control (2)"
# Test "10": the filesystem must stay usable while namespace LFSCK scans.
# NOTE: several lines of this function (its header, loop terminators and
# some short statements) are not visible in this listing.
#
# Visible flow:
#   1. Skip unless the MDS backend fstype is ldiskfs.
#   2. For even i in [0, 1000) with fail_loc=0x1603, and for odd i with
#      fail_loc=0x1604, create d${i}, f${i} and 5 files under d${i}.
#   3. Hard-link f200 into d200, remount the client, and start namespace
#      LFSCK with "-r -s 100".
#   4. While the status is "scanning-phase1", run ls -R, touch, mkdir,
#      unlink, rm -rf, mv, ln and ln -s; each must succeed.
#   5. Confirm the status is still "scanning-phase1", then set
#      lfsck_speed_limit to 0 and wait for "completed".
# wait_update_facet is defined elsewhere; presumably it polls the command
# until its output equals the expected string or the timeout expires.
1075 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1076 skip "lookup(..)/linkea on ZFS issue" && return
1080 echo "Preparing more files with error at $(date)."
1081 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1082 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
# Even-numbered entries are created under fail_loc=0x1603.
1084 for ((i = 0; i < 1000; i = $((i+2)))); do
1085 mkdir -p $DIR/$tdir/d${i}
1086 touch $DIR/$tdir/f${i}
1087 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1090 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1091 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
# Odd-numbered entries are created under fail_loc=0x1604.
1093 for ((i = 1; i < 1000; i = $((i+2)))); do
1094 mkdir -p $DIR/$tdir/d${i}
1095 touch $DIR/$tdir/f${i}
1096 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1099 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1100 echo "Prepared at $(date)."
1102 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
1104 umount_client $MOUNT
1105 mount_client $MOUNT || error "(3) Fail to start client!"
1107 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1110 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1111 [ "$STATUS" == "scanning-phase1" ] ||
1112 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Client operations issued while the scan is in progress.
1114 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1116 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1118 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1120 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1122 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1124 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1126 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1128 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1129 error "(14) Fail to softlink!"
1131 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1132 [ "$STATUS" == "scanning-phase1" ] ||
1133 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
1135 do_facet $SINGLEMDS \
1136 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1137 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1138 mdd.${MDT_DEV}.lfsck_namespace |
1139 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1141 error "(16) unexpected status"
1144 run_test 10 "System is available during LFSCK scanning"
# Remove the LAST_ID file (and d0/0) of one object sequence from an OST's
# backing filesystem.
# Uses $ost and $idx, which are assigned on lines not visible in this
# listing; the visible call "ost_remove_lastid 1 0" suggests they come from
# the first and second positional arguments (TODO confirm).
# Returns 1 if mount_fstype fails and 2 if unmount_fstype fails.
1147 ost_remove_lastid() {
1150 local rcmd="do_facet ost${ost}"
1152 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1154 # step 1: local mount
1155 mount_fstype ost${ost} || return 1
1156 # step 2: remove the specified LAST_ID
# The brace list expands to two paths: O/${idx}/LAST_ID and O/${idx}/d0/0.
1157 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1159 unmount_fstype ost${ost} || return 2
# Test "11a": layout LFSCK on the OST rebuilds a removed LAST_ID.
# NOTE: several lines of this function (its header, block terminators and
# some short statements) are not visible in this listing.
#
# Visible flow:
#   1. Create 64 single-stripe files on OST index 0.
#   2. Remove LAST_ID via ost_remove_lastid 1 0 and start ost1 with
#      $MOUNT_OPTS_NOSCRUB.
#   3. Set fail_val=3 fail_loc=0x160e on ost1, start layout LFSCK on the OST
#      with -r, and wait for the flags field to read "crashed_lastid".
#   4. Clear fail_val/fail_loc, wait for status "completed", and require the
#      flags field to be empty.
# wait_update_facet is defined elsewhere; presumably it polls the command
# until its output equals the expected string or the timeout expires.
1163 check_mount_and_prep
1164 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1165 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1170 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1172 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1173 error "(2) Fail to start ost1"
1175 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1176 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1178 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1179 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1181 wait_update_facet ost1 "$LCTL get_param -n \
1182 obdfilter.${OST_DEV}.lfsck_layout |
1183 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1185 error "(5) unexpected status"
1188 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1190 wait_update_facet ost1 "$LCTL get_param -n \
1191 obdfilter.${OST_DEV}.lfsck_layout |
1192 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1194 error "(6) unexpected status"
1197 echo "the LAST_ID(s) should have been rebuilt"
1198 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1199 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1201 run_test 11a "LFSCK can rebuild lost last_id"
# Test "11b": layout LFSCK on the OST rebuilds a stale on-disk LAST_ID.
# NOTE: several lines of this function (its header, loop/block terminators
# and some short statements) are not visible in this listing.
#
# Visible flow:
#   1. With fail_loc=0x160d set on ost1, create 64 files and record the
#      last_id value reported for sequence 0x100000000 as lastid1.
#   2. Unmount the client, stop ost1, set fail_loc=0x215, restart ost1.
#   3. Poll up to 60 iterations until last_id is reported again (lastid2)
#      and require lastid1 > lastid2.
#   4. Start layout LFSCK on the OST with -r and wait for "completed".
#   5. Restart ost1 and wait for last_id to equal lastid1 again.
#   6. Clear fail_loc and stop all services.
# wait_update_facet is defined elsewhere; presumably it polls the command
# until its output equals the expected string or the timeout expires.
1204 check_mount_and_prep
1205 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1207 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1208 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1209 do_facet ost1 $LCTL set_param fail_loc=0x160d
1210 createmany -o $DIR/$tdir/f 64
1211 local lastid1=$(do_facet ost1 "lctl get_param -n \
1212 obdfilter.${ost1_svc}.last_id" | grep 0x100000000 |
1213 awk -F: '{ print $2 }')
1215 umount_client $MOUNT
1216 stop ost1 || error "(1) Fail to stop ost1"
1218 #define OBD_FAIL_OST_ENOSPC 0x215
1219 do_facet ost1 $LCTL set_param fail_loc=0x215
1221 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1222 error "(2) Fail to start ost1"
# Retry until the restarted OST reports a last_id for sequence 0x100000000.
1224 for ((i = 0; i < 60; i++)); do
1225 lastid2=$(do_facet ost1 "lctl get_param -n \
1226 obdfilter.${ost1_svc}.last_id" | grep 0x100000000 |
1227 awk -F: '{ print $2 }')
1228 [ ! -z $lastid2 ] && break;
1232 echo "the on-disk LAST_ID should be smaller than the expected one"
1233 [ $lastid1 -gt $lastid2 ] ||
1234 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1236 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1237 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1239 wait_update_facet ost1 "$LCTL get_param -n \
1240 obdfilter.${OST_DEV}.lfsck_layout |
1241 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1243 error "(6) unexpected status"
1246 stop ost1 || error "(7) Fail to stop ost1"
1248 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1249 error "(8) Fail to start ost1"
1251 echo "the on-disk LAST_ID should have been rebuilt"
1252 wait_update_facet ost1 "$LCTL get_param -n \
1253 obdfilter.${ost1_svc}.last_id | grep 0x100000000 |
1254 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1255 $LCTL get_param -n obdfilter.${ost1_svc}.last_id
1256 error "(9) expect lastid1 0x100000000:$lastid1"
1259 do_facet ost1 $LCTL set_param fail_loc=0
1260 stopall || error "(10) Fail to stopall"
1262 run_test 11b "LFSCK can rebuild crashed last_id"
# Test "12": one lctl command (-A) starts/stops LFSCK on all devices.
# NOTE: several lines of this function (its header and loop terminators)
# are not visible in this listing.
#
# Visible flow:
#   1. Skip when fewer than 2 MDSes are configured.
#   2. For each MDT index, create a directory with "lfs mkdir -i" and 100
#      files inside it.
#   3. Namespace LFSCK: start with "-A -s 1 -r" and expect every MDS to
#      report "scanning-phase1"; stop with "-A" and expect "stopped";
#      restart with "-A -s 0 -r" and wait for "completed" on every MDS.
#   4. Layout LFSCK: same sequence, additionally expecting every OST to
#      report "stopped" after the stop command.
# wait_update_facet is defined elsewhere; presumably it polls the command
# until its output equals the expected string or the timeout expires.
1265 [ $MDSCOUNT -lt 2 ] &&
1266 skip "We need at least 2 MDSes for test_12" && return
1268 check_mount_and_prep
1269 for k in $(seq $MDSCOUNT); do
1270 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1271 createmany -o $DIR/$tdir/${k}/f 100 ||
1272 error "(0) Fail to create 100 files."
1275 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1276 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1277 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1279 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1280 for k in $(seq $MDSCOUNT); do
1281 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1282 mdd.$(facet_svc mds${k}).lfsck_namespace |
1283 awk '/^status/ { print $2 }')
1284 [ "$STATUS" == "scanning-phase1" ] ||
1285 error "(3) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1288 echo "Stop namespace LFSCK on all targets by single lctl command."
1289 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1290 error "(4) Fail to stop LFSCK on all devices!"
1292 echo "All the LFSCK targets should be in 'stopped' status."
1293 for k in $(seq $MDSCOUNT); do
1294 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1295 mdd.$(facet_svc mds${k}).lfsck_namespace |
1296 awk '/^status/ { print $2 }')
1297 [ "$STATUS" == "stopped" ] ||
1298 error "(5) MDS${k} Expect 'stopped', but got '$STATUS'"
1301 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1302 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1303 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1305 echo "All the LFSCK targets should be in 'completed' status."
1306 for k in $(seq $MDSCOUNT); do
1307 wait_update_facet mds${k} "$LCTL get_param -n \
1308 mdd.$(facet_svc mds${k}).lfsck_namespace |
1309 awk '/^status/ { print \\\$2 }'" "completed" 8 ||
1310 error "(7) MDS${k} is not the expected 'completed'"
# Second half: the same start/stop/restart cycle for the layout LFSCK.
1313 echo "Start layout LFSCK on all targets by single command (-s 1)."
1314 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1315 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1317 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1318 for k in $(seq $MDSCOUNT); do
1319 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1320 mdd.$(facet_svc mds${k}).lfsck_layout |
1321 awk '/^status/ { print $2 }')
1322 [ "$STATUS" == "scanning-phase1" ] ||
1323 error "(9) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1326 echo "Stop layout LFSCK on all targets by single lctl command."
1327 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1328 error "(10) Fail to stop LFSCK on all devices!"
1330 echo "All the LFSCK targets should be in 'stopped' status."
1331 for k in $(seq $MDSCOUNT); do
1332 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1333 mdd.$(facet_svc mds${k}).lfsck_layout |
1334 awk '/^status/ { print $2 }')
1335 [ "$STATUS" == "stopped" ] ||
1336 error "(11) MDS${k} Expect 'stopped', but got '$STATUS'"
1339 for k in $(seq $OSTCOUNT); do
1340 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1341 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1342 awk '/^status/ { print $2 }')
1343 [ "$STATUS" == "stopped" ] ||
1344 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1347 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1348 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1349 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1351 echo "All the LFSCK targets should be in 'completed' status."
1352 for k in $(seq $MDSCOUNT); do
1353 # The LFSCK status query internal is 30 seconds. For the case
1354 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1355 # time to guarantee the status sync up.
1356 wait_update_facet mds${k} "$LCTL get_param -n \
1357 mdd.$(facet_svc mds${k}).lfsck_layout |
1358 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1359 error "(14) MDS${k} is not the expected 'completed'"
1362 run_test 12 "single command to trigger LFSCK on all devices"
# Test "13": layout LFSCK repairs a bad lmm_oi in the layout EA.
# NOTE: several lines of this function (its header and block terminators)
# are not visible in this listing.
#
# Visible flow: create 32 files while fail_loc=0x160f is set on the MDS,
# clear fail_loc, run layout LFSCK with -r, wait for status "completed",
# and require the repaired_others counter to equal 32.
# wait_update_facet is defined elsewhere; presumably it polls the command
# until its output equals the expected string or the timeout expires.
1366 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1367 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1368 echo "MDT-object FID."
1371 check_mount_and_prep
1373 echo "Inject failure stub to simulate bad lmm_oi"
1374 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1375 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1376 createmany -o $DIR/$tdir/f 32
1377 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1379 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1380 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1382 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1383 mdd.${MDT_DEV}.lfsck_layout |
1384 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1386 error "(2) unexpected status"
# One repair is expected per file created under the failure injection.
1389 local repaired=$($SHOW_LAYOUT |
1390 awk '/^repaired_others/ { print $2 }')
1391 [ $repaired -eq 32 ] ||
1392 error "(3) Fail to repair crashed lmm_oi: $repaired"
1394 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test "14": layout LFSCK detects and, with -c, repairs MDT-objects whose
# referenced OST-object is missing (dangling reference).
# NOTE: several lines of this function (its header, block terminators and
# some short statements) are not visible in this listing.
#
# Visible flow:
#   1. With fail_loc=0x1610 set on ost1, create (precreated count + 31)
#      files plus "guard"; then clear fail_loc.
#   2. Create another batch sized by precreated_ost_obj_count; "ls" of the
#      directory is expected to fail.
#   3. Run layout LFSCK with -r: repaired_dangling must be >= 32, and stat
#      of "guard" must still fail.
#   4. Run layout LFSCK with "-r -c": wait until stat reports size 0 for
#      "guard" and require repaired_dangling >= 32 again.
# wait_update_facet is defined elsewhere; presumably it polls the command
# until its output equals the expected string or the timeout expires.
1398 echo "The OST-object referenced by the MDT-object should be there;"
1399 echo "otherwise, the LFSCK should re-create the missed OST-object."
1402 check_mount_and_prep
1403 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1405 local count=$(precreated_ost_obj_count 0 0)
1407 echo "Inject failure stub to simulate dangling referenced MDT-object"
1408 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1409 do_facet ost1 $LCTL set_param fail_loc=0x1610
1410 createmany -o $DIR/$tdir/f $((count + 31))
1411 touch $DIR/$tdir/guard
1412 do_facet ost1 $LCTL set_param fail_loc=0
1414 start_full_debug_logging
1416 # exhaust other pre-created dangling cases
1417 count=$(precreated_ost_obj_count 0 0)
1418 createmany -o $DIR/$tdir/a $count ||
1419 error "(0) Fail to create $count files."
1421 echo "'ls' should fail because of dangling referenced MDT-object"
1422 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1424 echo "Trigger layout LFSCK to find out dangling reference"
1425 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1427 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1428 mdd.${MDT_DEV}.lfsck_layout |
1429 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1431 error "(3) unexpected status"
1434 local repaired=$($SHOW_LAYOUT |
1435 awk '/^repaired_dangling/ { print $2 }')
1436 [ $repaired -ge 32 ] ||
1437 error "(4) Fail to repair dangling reference: $repaired"
1439 echo "'stat' should fail because of not repair dangling by default"
1440 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
1442 echo "Trigger layout LFSCK to repair dangling reference"
# Second run adds -c; only after this run is stat expected to succeed.
1443 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1445 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1446 mdd.${MDT_DEV}.lfsck_layout |
1447 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1449 error "(7) unexpected status"
1452 # There may be some async LFSCK updates in processing, wait for
1453 # a while until the target reparation has been done. LU-4970.
1455 echo "'stat' should success after layout LFSCK repairing"
1456 wait_update_facet client "stat $DIR/$tdir/guard |
1457 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1458 stat $DIR/$tdir/guard
1460 error "(8) unexpected size"
1463 repaired=$($SHOW_LAYOUT |
1464 awk '/^repaired_dangling/ { print $2 }')
1465 [ $repaired -ge 32 ] ||
1466 error "(9) Fail to repair dangling reference: $repaired"
1468 stop_full_debug_logging
1470 run_test 14 "LFSCK can repair MDT-object with dangling reference"
# Test "15a": layout LFSCK repairs an OST-object whose back pointer names a
# non-existent MDT-object.
# NOTE: several lines of this function (its header and block terminators)
# are not visible in this listing.
#
# Visible flow: write 1 MiB to f0 while fail_loc=0x1611 is set on ost1,
# cancel the osc LRU locks, clear fail_loc, run layout LFSCK with -r, wait
# for status "completed", and require repaired_unmatched_pair to equal 1.
# wait_update_facet is defined elsewhere; presumably it polls the command
# until its output equals the expected string or the timeout expires.
1474 echo "If the OST-object referenced by the MDT-object back points"
1475 echo "to some non-exist MDT-object, then the LFSCK should repair"
1476 echo "the OST-object to back point to the right MDT-object."
1479 check_mount_and_prep
1480 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1482 echo "Inject failure stub to make the OST-object to back point to"
1483 echo "non-exist MDT-object."
1484 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1486 do_facet ost1 $LCTL set_param fail_loc=0x1611
1487 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1488 cancel_lru_locks osc
1489 do_facet ost1 $LCTL set_param fail_loc=0
1491 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1492 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1494 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1495 mdd.${MDT_DEV}.lfsck_layout |
1496 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1498 error "(2) unexpected status"
# Exactly one file (f0) was written under the failure injection.
1501 local repaired=$($SHOW_LAYOUT |
1502 awk '/^repaired_unmatched_pair/ { print $2 }')
1503 [ $repaired -eq 1 ] ||
1504 error "(3) Fail to repair unmatched pair: $repaired"
1506 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test "15b": layout LFSCK repairs an OST-object whose back pointer names a
# different MDT-object that does not reference it.
# NOTE: several lines of this function (its header and block terminators)
# are not visible in this listing.
#
# Visible flow: write 1 MiB to "guard" normally, then write 1 MiB to f0
# while fail_loc=0x1612 is set on ost1; clear fail_loc, run layout LFSCK
# with -r, wait for status "completed", and require
# repaired_unmatched_pair to equal 1.
# wait_update_facet is defined elsewhere; presumably it polls the command
# until its output equals the expected string or the timeout expires.
1510 echo "If the OST-object referenced by the MDT-object back points"
1511 echo "to other MDT-object that doesn't recognize the OST-object,"
1512 echo "then the LFSCK should repair it to back point to the right"
1513 echo "MDT-object (the first one)."
1516 check_mount_and_prep
1517 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1518 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1519 cancel_lru_locks osc
1521 echo "Inject failure stub to make the OST-object to back point to"
1522 echo "other MDT-object"
1524 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1525 do_facet ost1 $LCTL set_param fail_loc=0x1612
1526 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1527 cancel_lru_locks osc
1528 do_facet ost1 $LCTL set_param fail_loc=0
1530 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1531 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1533 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1534 mdd.${MDT_DEV}.lfsck_layout |
1535 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1537 error "(2) unexpected status"
# Only f0 was written under the failure injection, so one repair is expected.
1540 local repaired=$($SHOW_LAYOUT |
1541 awk '/^repaired_unmatched_pair/ { print $2 }')
1542 [ $repaired -eq 1 ] ||
1543 error "(3) Fail to repair unmatched pair: $repaired"
1545 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test "16": layout LFSCK repairs an OST-object owner that differs from the
# owner recorded on the MDT-object.
# NOTE: several lines of this function (its header, block terminators and
# some short statements) are not visible in this listing.
#
# Visible flow: write 1 MiB to f0, chown it to 1.1 while fail_loc=0x1613 is
# set on the MDS, clear fail_loc, run layout LFSCK with -r, wait for status
# "completed", and require repaired_inconsistent_owner to equal 1.
# wait_update_facet is defined elsewhere; presumably it polls the command
# until its output equals the expected string or the timeout expires.
1549 echo "If the OST-object's owner information does not match the owner"
1550 echo "information stored in the MDT-object, then the LFSCK trust the"
1551 echo "MDT-object and update the OST-object's owner information."
1554 check_mount_and_prep
1555 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1556 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1557 cancel_lru_locks osc
1559 echo "Inject failure stub to skip OST-object owner changing"
1560 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1561 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1562 chown 1.1 $DIR/$tdir/f0
1563 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1565 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1568 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1570 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1571 mdd.${MDT_DEV}.lfsck_layout |
1572 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1574 error "(2) unexpected status"
# A single file was chown'ed under the failure injection.
1577 local repaired=$($SHOW_LAYOUT |
1578 awk '/^repaired_inconsistent_owner/ { print $2 }')
1579 [ $repaired -eq 1 ] ||
1580 error "(3) Fail to repair inconsistent owner: $repaired"
1582 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test "17": layout LFSCK repairs two MDT-objects that reference the same
# OST-object.
# NOTE: several lines of this function (its header, block terminators and
# some short statements) are not visible in this listing.
#
# Visible flow:
#   1. With fail_val=0 fail_loc=0x1614 set on the MDS, write 1 MiB to
#      "guard" and create f0; then clear fail_loc.
#   2. Confirm f0 reports 1048576 bytes (it shares guard's object).
#   3. Run layout LFSCK with -r, wait for status "completed", and require
#      repaired_multiple_referenced to equal 1.
#   4. Write 2 MiB to f0 and confirm "guard" still reports 1048576 bytes.
# wait_update_facet is defined elsewhere; presumably it polls the command
# until its output equals the expected string or the timeout expires.
1586 echo "If more than one MDT-objects reference the same OST-object,"
1587 echo "and the OST-object only recognizes one MDT-object, then the"
1588 echo "LFSCK should create new OST-objects for such non-recognized"
1592 check_mount_and_prep
1593 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1595 echo "Inject failure stub to make two MDT-objects to refernce"
1596 echo "the OST-object"
1598 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1599 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1601 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1602 cancel_lru_locks osc
1604 createmany -o $DIR/$tdir/f 1
1606 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1608 cancel_lru_locks mdc
1609 cancel_lru_locks osc
1611 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
# Field 5 of "ls -l" output is the file size in bytes.
1612 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1613 [ $size -eq 1048576 ] ||
1614 error "(1) f0 (wrong) size should be 1048576, but got $size"
1616 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1619 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1621 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1622 mdd.${MDT_DEV}.lfsck_layout |
1623 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1625 error "(3) unexpected status"
1628 local repaired=$($SHOW_LAYOUT |
1629 awk '/^repaired_multiple_referenced/ { print $2 }')
1630 [ $repaired -eq 1 ] ||
1631 error "(4) Fail to repair multiple references: $repaired"
1633 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
# Growing f0 must not change guard's size once the objects are separate.
1634 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1635 error "(5) Fail to write f0."
1636 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1637 [ $size -eq 1048576 ] ||
1638 error "(6) guard size should be 1048576, but got $size"
1640 run_test 17 "LFSCK can repair multiple references"
# Test "18a": layout LFSCK with -o finds orphan OST-objects and regenerates
# the lost layout EA entries of the owning MDT-objects.
# NOTE: several lines of this function (its header, loop/if terminators and
# some short statements) are not visible in this listing.
#
# Visible flow:
#   1. Write a 2 MiB file a1/f1 on MDT index 0 (1 stripe); with >= 2 MDSes
#      also a2/f2 on MDT index 1 (2 stripes). Record f1's size.
#   2. With fail_loc=0x1615 set on mds1 (and mds2), chown the files; then
#      clear fail_loc. The reported sizes must now differ from the saved one.
#   3. Run layout LFSCK with "-r -o"; wait for "completed" on every MDS and
#      require "completed" on every OST.
#   4. Require repaired_orphan == 1 on mds1 (and == 2 on mds2).
#   5. Require the reported sizes to match the saved size again.
# wait_update_facet is defined elsewhere; presumably it polls the command
# until its output equals the expected string or the timeout expires.
1644 echo "The target MDT-object is there, but related stripe information"
1645 echo "is lost or partly lost. The LFSCK should regenerate the missed"
1646 echo "layout EA entries."
1649 check_mount_and_prep
1650 $LFS mkdir -i 0 $DIR/$tdir/a1
1651 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
1652 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With "ls -il" the inode number is field 1, so the size is field 6.
1654 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1656 $LFS path2fid $DIR/$tdir/a1/f1
1657 $LFS getstripe $DIR/$tdir/a1/f1
1659 if [ $MDSCOUNT -ge 2 ]; then
1660 $LFS mkdir -i 1 $DIR/$tdir/a2
1661 $LFS setstripe -c 2 -i 1 -s 1M $DIR/$tdir/a2
1662 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1663 $LFS path2fid $DIR/$tdir/a2/f2
1664 $LFS getstripe $DIR/$tdir/a2/f2
1667 cancel_lru_locks osc
1669 echo "Inject failure, to make the MDT-object lost its layout EA"
1670 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1671 do_facet mds1 $LCTL set_param fail_loc=0x1615
1672 chown 1.1 $DIR/$tdir/a1/f1
1674 if [ $MDSCOUNT -ge 2 ]; then
1675 do_facet mds2 $LCTL set_param fail_loc=0x1615
1676 chown 1.1 $DIR/$tdir/a2/f2
1682 do_facet mds1 $LCTL set_param fail_loc=0
1683 if [ $MDSCOUNT -ge 2 ]; then
1684 do_facet mds2 $LCTL set_param fail_loc=0
1687 cancel_lru_locks mdc
1688 cancel_lru_locks osc
1690 echo "The file size should be incorrect since layout EA is lost"
1691 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1692 [ "$cur_size" != "$saved_size" ] ||
1693 error "(1) Expect incorrect file1 size"
1695 if [ $MDSCOUNT -ge 2 ]; then
1696 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1697 [ "$cur_size" != "$saved_size" ] ||
1698 error "(2) Expect incorrect file2 size"
1701 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1702 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
1704 for k in $(seq $MDSCOUNT); do
1705 # The LFSCK status query internal is 30 seconds. For the case
1706 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1707 # time to guarantee the status sync up.
1708 wait_update_facet mds${k} "$LCTL get_param -n \
1709 mdd.$(facet_svc mds${k}).lfsck_layout |
1710 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1711 error "(4) MDS${k} is not the expected 'completed'"
1714 for k in $(seq $OSTCOUNT); do
1715 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1716 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1717 awk '/^status/ { print $2 }')
1718 [ "$cur_status" == "completed" ] ||
1719 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
1722 local repaired=$(do_facet mds1 $LCTL get_param -n \
1723 mdd.$(facet_svc mds1).lfsck_layout |
1724 awk '/^repaired_orphan/ { print $2 }')
1725 [ $repaired -eq 1 ] ||
1726 error "(6.1) Expect 1 fixed on mds1, but got: $repaired"
1728 if [ $MDSCOUNT -ge 2 ]; then
1729 repaired=$(do_facet mds2 $LCTL get_param -n \
1730 mdd.$(facet_svc mds2).lfsck_layout |
1731 awk '/^repaired_orphan/ { print $2 }')
1732 [ $repaired -eq 2 ] ||
1733 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
1736 $LFS path2fid $DIR/$tdir/a1/f1
1737 $LFS getstripe $DIR/$tdir/a1/f1
1739 if [ $MDSCOUNT -ge 2 ]; then
1740 $LFS path2fid $DIR/$tdir/a2/f2
1741 $LFS getstripe $DIR/$tdir/a2/f2
1744 echo "The file size should be correct after layout LFSCK scanning"
1745 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1746 [ "$cur_size" == "$saved_size" ] ||
1747 error "(7) Expect file1 size $saved_size, but got $cur_size"
1749 if [ $MDSCOUNT -ge 2 ]; then
1750 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1751 [ "$cur_size" == "$saved_size" ] ||
1752 error "(8) Expect file2 size $saved_size, but got $cur_size"
1755 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test "18b": layout LFSCK with -o re-creates lost MDT-objects under
# .lustre/lost+found/MDTxxxx, from where they can be moved back by hand.
# NOTE: several lines of this function (its header, loop/if terminators and
# some short statements) are not visible in this listing.
#
# Visible flow:
#   1. Write a 2 MiB file a1/f1 on MDT index 0; with >= 2 MDSes also a2/f2
#      on MDT index 1. Record f1's size and each file's FID.
#   2. With fail_loc=0x1616 set on mds1 (and mds2), remove the files; then
#      clear fail_loc.
#   3. Run layout LFSCK with "-r -o"; wait for "completed" on every MDS and
#      require "completed" on every OST.
#   4. Require repaired_orphan == 1 on mds1 (and == 2 on mds2).
#   5. Move lost+found/MDT0000/${fid1}-R-0 (and MDT0001/${fid2}-R-0) back to
#      the original paths and require the sizes to match the saved size.
# wait_update_facet is defined elsewhere; presumably it polls the command
# until its output equals the expected string or the timeout expires.
1759 echo "The target MDT-object is lost. The LFSCK should re-create the"
1760 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
1761 echo "can move it back to normal namespace manually."
1764 check_mount_and_prep
1765 $LFS mkdir -i 0 $DIR/$tdir/a1
1766 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
1767 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1768 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1769 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
1771 $LFS getstripe $DIR/$tdir/a1/f1
1773 if [ $MDSCOUNT -ge 2 ]; then
1774 $LFS mkdir -i 1 $DIR/$tdir/a2
1775 $LFS setstripe -c 2 -i 1 -s 1M $DIR/$tdir/a2
1776 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1777 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
1779 $LFS getstripe $DIR/$tdir/a2/f2
1782 cancel_lru_locks osc
1784 echo "Inject failure, to simulate the case of missing the MDT-object"
1785 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1786 do_facet mds1 $LCTL set_param fail_loc=0x1616
1787 rm -f $DIR/$tdir/a1/f1
1789 if [ $MDSCOUNT -ge 2 ]; then
1790 do_facet mds2 $LCTL set_param fail_loc=0x1616
1791 rm -f $DIR/$tdir/a2/f2
1797 do_facet mds1 $LCTL set_param fail_loc=0
1798 if [ $MDSCOUNT -ge 2 ]; then
1799 do_facet mds2 $LCTL set_param fail_loc=0
1802 cancel_lru_locks mdc
1803 cancel_lru_locks osc
1805 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1806 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1808 for k in $(seq $MDSCOUNT); do
1809 # The LFSCK status query internal is 30 seconds. For the case
1810 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1811 # time to guarantee the status sync up.
1812 wait_update_facet mds${k} "$LCTL get_param -n \
1813 mdd.$(facet_svc mds${k}).lfsck_layout |
1814 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1815 error "(2) MDS${k} is not the expected 'completed'"
1818 for k in $(seq $OSTCOUNT); do
1819 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1820 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1821 awk '/^status/ { print $2 }')
1822 [ "$cur_status" == "completed" ] ||
1823 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
1826 local repaired=$(do_facet mds1 $LCTL get_param -n \
1827 mdd.$(facet_svc mds1).lfsck_layout |
1828 awk '/^repaired_orphan/ { print $2 }')
1829 [ $repaired -eq 1 ] ||
1830 error "(4.1) Expect 1 fixed on mds1, but got: $repaired"
1832 if [ $MDSCOUNT -ge 2 ]; then
1833 repaired=$(do_facet mds2 $LCTL get_param -n \
1834 mdd.$(facet_svc mds2).lfsck_layout |
1835 awk '/^repaired_orphan/ { print $2 }')
1836 [ $repaired -eq 2 ] ||
1837 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
1840 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
# The recovered entries are looked up by the FIDs recorded before removal.
1841 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
1842 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
1844 if [ $MDSCOUNT -ge 2 ]; then
1845 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
1846 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
1849 $LFS path2fid $DIR/$tdir/a1/f1
1850 $LFS getstripe $DIR/$tdir/a1/f1
1852 if [ $MDSCOUNT -ge 2 ]; then
1853 $LFS path2fid $DIR/$tdir/a2/f2
1854 $LFS getstripe $DIR/$tdir/a2/f2
1857 echo "The file size should be correct after layout LFSCK scanning"
1858 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1859 [ "$cur_size" == "$saved_size" ] ||
1860 error "(7) Expect file1 size $saved_size, but got $cur_size"
1862 if [ $MDSCOUNT -ge 2 ]; then
1863 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1864 [ "$cur_size" == "$saved_size" ] ||
1865 error "(8) Expect file2 size $saved_size, but got $cur_size"
1868 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test "18c": layout LFSCK with -o re-creates a lost MDT-object under
# .lustre/lost+found/MDTxxxx when the OST-object carries no parent FID.
# NOTE: several lines of this function (its header, loop/if terminators and
# some short statements) are not visible in this listing; $expected is used
# below but assigned on such a line.
#
# Visible flow:
#   1. With fail_loc=0x1617 set on ost1, write a 2 MiB file a1/f1 (MDT
#      index 0); with >= 2 MDSes also a2/f2 (directory on MDT index 1,
#      stripe on OST index 0).
#   2. With fail_loc=0x1616 set on mds1 (and mds2), remove the files; then
#      clear fail_loc on the MDSes.
#   3. Run layout LFSCK with "-r -o"; wait for "completed" on every MDS and
#      require "completed" on every OST.
#   4. Require repaired_orphan == $expected on mds1 (and == 0 on mds2).
#   5. Look for "*-N-*" entries: lost+found/MDT0001 is checked when it
#      exists (message: should be empty), lost+found/MDT0000 must exist and
#      contain at least one.
# wait_update_facet is defined elsewhere; presumably it polls the command
# until its output equals the expected string or the timeout expires.
1872 echo "The target MDT-object is lost, and the OST-object FID is missing."
1873 echo "The LFSCK should re-create the MDT-object with new FID under the "
1874 echo "directory .lustre/lost+found/MDTxxxx."
1877 check_mount_and_prep
1878 $LFS mkdir -i 0 $DIR/$tdir/a1
1879 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
1881 echo "Inject failure, to simulate the case of missing parent FID"
1882 #define OBD_FAIL_LFSCK_NOPFID 0x1617
1883 do_facet ost1 $LCTL set_param fail_loc=0x1617
1885 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1886 $LFS getstripe $DIR/$tdir/a1/f1
1888 if [ $MDSCOUNT -ge 2 ]; then
1889 $LFS mkdir -i 1 $DIR/$tdir/a2
1890 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a2
1891 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1892 $LFS getstripe $DIR/$tdir/a2/f2
1895 cancel_lru_locks osc
1897 echo "Inject failure, to simulate the case of missing the MDT-object"
1898 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1899 do_facet mds1 $LCTL set_param fail_loc=0x1616
1900 rm -f $DIR/$tdir/a1/f1
1902 if [ $MDSCOUNT -ge 2 ]; then
1903 do_facet mds2 $LCTL set_param fail_loc=0x1616
1904 rm -f $DIR/$tdir/a2/f2
1910 do_facet mds1 $LCTL set_param fail_loc=0
1911 if [ $MDSCOUNT -ge 2 ]; then
1912 do_facet mds2 $LCTL set_param fail_loc=0
1915 cancel_lru_locks mdc
1916 cancel_lru_locks osc
1918 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1919 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1921 for k in $(seq $MDSCOUNT); do
1922 # The LFSCK status query interval is 30 seconds. For the case
1923 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1924 # time to guarantee the status sync up.
1925 wait_update_facet mds${k} "$LCTL get_param -n \
1926 mdd.$(facet_svc mds${k}).lfsck_layout |
1927 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1928 error "(2) MDS${k} is not the expected 'completed'"
1931 for k in $(seq $OSTCOUNT); do
1932 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1933 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1934 awk '/^status/ { print $2 }')
1935 [ "$cur_status" == "completed" ] ||
1936 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
1939 if [ $MDSCOUNT -ge 2 ]; then
1945 local repaired=$(do_facet mds1 $LCTL get_param -n \
1946 mdd.$(facet_svc mds1).lfsck_layout |
1947 awk '/^repaired_orphan/ { print $2 }')
1948 [ $repaired -eq $expected ] ||
1949 error "(4) Expect $expected fixed on mds1, but got: $repaired"
1951 if [ $MDSCOUNT -ge 2 ]; then
1952 repaired=$(do_facet mds2 $LCTL get_param -n \
1953 mdd.$(facet_svc mds2).lfsck_layout |
1954 awk '/^repaired_orphan/ { print $2 }')
1955 [ $repaired -eq 0 ] ||
1956 error "(5) Expect 0 fixed on mds2, but got: $repaired"
1959 ls -ail $MOUNT/.lustre/lost+found/
1961 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
1962 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
# The -name pattern is quoted so that find, not the shell, does the
# matching; unquoted, any "*-N-*" entry in the current working directory
# would be substituted into the command line before find runs.
1963 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
1965 error "(6) .lustre/lost+found/MDT0001/ should be empty"
1968 echo "There should be some stub under .lustre/lost+found/MDT0000/"
1969 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
1970 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Pattern quoted for the same reason as in the MDT0001 check.
1972 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
1973 [ ! -z "$cname" ] ||
1974 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
1976 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d: f2's layout EA slot ends up occupied by a newly created OST-object
# that nobody has modified, while f2's original OST-object is left as an
# orphan. The checks below expect layout LFSCK to report exactly one repaired
# orphan and f2 to report its original size again afterwards.
1980 echo "The target MDT-object layout EA slot is occpuied by some new"
1981 echo "created OST-object when repair dangling reference case. Such"
1982 echo "conflict OST-object has never been modified. Then when found"
1983 echo "the orphan OST-object, LFSCK will replace it with the orphan"
1987 check_mount_and_prep
# Stripe count 1, starting at OST index 0, 1M stripe size for a1.
1989 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
1990 echo "guard" > $DIR/$tdir/a1/f1
1991 echo "foo" > $DIR/$tdir/a1/f2
# Column 6 of "ls -il" is the file size (the leading inode column shifts the
# usual "ls -l" columns by one). Remember it for the later comparisons.
1992 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
# Log FIDs and layouts of both files before anything is broken.
1993 $LFS path2fid $DIR/$tdir/a1/f1
1994 $LFS getstripe $DIR/$tdir/a1/f1
1995 $LFS path2fid $DIR/$tdir/a1/f2
1996 $LFS getstripe $DIR/$tdir/a1/f2
1997 cancel_lru_locks osc
1999 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2000 echo "to reference the same OST-object (which is f1's OST-obejct)."
2001 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2002 echo "dangling reference case, but f2's old OST-object is there."
2005 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2006 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
# Use ':' as the owner:group separator; the '.' form is obsolete and newer
# GNU chown versions warn about it.
# NOTE(review): the chown is presumably what triggers the injected stripe
# change on the MDS - confirm against the fail_loc handler.
2007 chown 1:1 $DIR/$tdir/a1/f2
2008 rm -f $DIR/$tdir/a1/f1
# Disable the fault injection again.
2011 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2013 echo "stopall to cleanup object cache"
2016 setupall > /dev/null
2018 echo "The file size should be incorrect since dangling referenced"
2019 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2020 [ "$cur_size" != "$saved_size" ] ||
2021 error "(1) Expect incorrect file2 size"
2023 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2024 do_facet $SINGLEMDS $LCTL set_param fail_val=5 fail_loc=0x1602
2026 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2027 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait for MDS1 to report "scanning-phase2" before the delay injection is
# removed below.
2029 wait_update_facet mds1 "$LCTL get_param -n \
2030 mdd.$(facet_svc mds1).lfsck_layout |
2031 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 ||
2032 error "(3.0) MDS1 is not the expected 'scanning-phase2'"
2034 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Every MDT must eventually report "completed".
2036 for k in $(seq $MDSCOUNT); do
2037 # The LFSCK status query interval is 30 seconds. For the case
2038 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2039 # time to guarantee the status sync up.
2040 wait_update_facet mds${k} "$LCTL get_param -n \
2041 mdd.$(facet_svc mds${k}).lfsck_layout |
2042 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2043 error "(3) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be "completed".
2046 for k in $(seq $OSTCOUNT); do
2047 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2048 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2049 awk '/^status/ { print $2 }')
2050 [ "$cur_status" == "completed" ] ||
2051 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Exactly one orphan must be counted as repaired on $SINGLEMDS.
2054 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2055 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2056 awk '/^repaired_orphan/ { print $2 }')
2057 [ $repaired -eq 1 ] ||
2058 error "(5) Expect 1 orphan has been fixed, but got: $repaired"
2060 echo "The file size should be correct after layout LFSCK scanning"
2061 cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2062 [ "$cur_size" == "$saved_size" ] ||
2063 error "(6) Expect file2 size $saved_size, but got $cur_size"
2065 echo "The LFSCK should find back the original data."
# Dump content, FID and layout of the repaired file into the test log.
2066 cat $DIR/$tdir/a1/f2
2067 $LFS path2fid $DIR/$tdir/a1/f2
2068 $LFS getstripe $DIR/$tdir/a1/f2
2070 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e: like 18d, but the OST-object that took over f2's layout EA slot is
# modified while LFSCK is running. The checks below expect LFSCK to count one
# repaired orphan, to leave a "*-C-*" stub file under
# .lustre/lost+found/MDT0000/ that has f2's original size, and to keep the
# newly written data in f2 itself.
2074 echo "The target MDT-object layout EA slot is occpuied by some new"
2075 echo "created OST-object when repair dangling reference case. Such"
2076 echo "conflict OST-object has been modified by others. To keep the"
2077 echo "new data, the LFSCK will create a new file to refernece this"
2078 echo "old orphan OST-object."
2081 check_mount_and_prep
# Stripe count 1, starting at OST index 0, 1M stripe size for a1.
2083 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
2084 echo "guard" > $DIR/$tdir/a1/f1
2085 echo "foo" > $DIR/$tdir/a1/f2
# Column 6 of "ls -il" is the file size (the leading inode column shifts the
# usual "ls -l" columns by one). Remember it for the later comparisons.
2086 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
# Log FIDs and layouts of both files before anything is broken.
2087 $LFS path2fid $DIR/$tdir/a1/f1
2088 $LFS getstripe $DIR/$tdir/a1/f1
2089 $LFS path2fid $DIR/$tdir/a1/f2
2090 $LFS getstripe $DIR/$tdir/a1/f2
2091 cancel_lru_locks osc
2093 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2094 echo "to reference the same OST-object (which is f1's OST-obejct)."
2095 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2096 echo "dangling reference case, but f2's old OST-object is there."
2099 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2100 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
# Use ':' as the owner:group separator; the '.' form is obsolete and newer
# GNU chown versions warn about it.
# NOTE(review): the chown is presumably what triggers the injected stripe
# change on the MDS - confirm against the fail_loc handler.
2101 chown 1:1 $DIR/$tdir/a1/f2
2102 rm -f $DIR/$tdir/a1/f1
# Disable the fault injection again.
2105 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2107 echo "stopall to cleanup object cache"
2110 setupall > /dev/null
2112 echo "The file size should be incorrect since dangling referenced"
2113 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2114 [ "$cur_size" != "$saved_size" ] ||
2115 error "(1) Expect incorrect file2 size"
2117 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2118 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2120 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2121 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait for MDS1 to report "scanning-phase2"; f2 is modified while the delay
# injection is still active.
2123 wait_update_facet mds1 "$LCTL get_param -n \
2124 mdd.$(facet_svc mds1).lfsck_layout |
2125 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 ||
2126 error "(3) MDS1 is not the expected 'scanning-phase2'"
2128 # to guarantee all updates are synced.
2132 echo "Write new data to f2 to modify the new created OST-object."
2133 echo "dummy" >> $DIR/$tdir/a1/f2
2135 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Every MDT must eventually report "completed".
2137 for k in $(seq $MDSCOUNT); do
2138 # The LFSCK status query interval is 30 seconds. For the case
2139 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2140 # time to guarantee the status sync up.
2141 wait_update_facet mds${k} "$LCTL get_param -n \
2142 mdd.$(facet_svc mds${k}).lfsck_layout |
2143 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2144 error "(4) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be "completed".
2147 for k in $(seq $OSTCOUNT); do
2148 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2149 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2150 awk '/^status/ { print $2 }')
2151 [ "$cur_status" == "completed" ] ||
2152 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Exactly one orphan must be counted as repaired on $SINGLEMDS.
2155 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2156 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2157 awk '/^repaired_orphan/ { print $2 }')
2158 [ $repaired -eq 1 ] ||
2159 error "(6) Expect 1 orphan has been fixed, but got: $repaired"
2161 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2162 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2163 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The pattern is quoted so that find receives it verbatim; unquoted, the shell
# could expand it against matching names in the current directory first.
2165 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*")
2166 [ ! -z "$cname" ] ||
2167 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2169 echo "The stub file should keep the original f2 data"
2170 cur_size=$(ls -il $cname | awk '{ print $6 }')
2171 [ "$cur_size" == "$saved_size" ] ||
2172 error "(9) Expect file2 size $saved_size, but got $cur_size"
# Log FID and layout of the stub file.
2175 $LFS path2fid $cname
2176 $LFS getstripe $cname
2178 echo "The f2 should contains new data."
# Dump content, FID and layout of f2 into the test log.
2179 cat $DIR/$tdir/a1/f2
2180 $LFS path2fid $DIR/$tdir/a1/f2
2181 $LFS getstripe $DIR/$tdir/a1/f2
2183 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f: MDT-objects are lost and one OST fails to verify objects during
# the first scanning stage. The first LFSCK run is expected to end as
# "partial" on the MDTs and on OST1 but "completed" on OST2, with one orphan
# repaired per MDT; a second run is expected to complete everywhere with two
# orphans repaired per MDT.
# Skipped when fewer than two OSTs are configured.
2186 [ $OSTCOUNT -lt 2 ] &&
2187 skip "The test needs at least 2 OSTs" && return
2190 echo "The target MDT-object is lost. The LFSCK should re-create the"
2191 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2192 echo "to verify some OST-object(s) during the first stage-scanning,"
2193 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2194 echo "should not be affected."
2197 check_mount_and_prep
# a1 (1 stripe) and a2 (2 stripes) live on MDT index 0; all layouts start at
# OST index 0. Each file gets 2 x 1M of zeroes.
2198 $LFS mkdir -i 0 $DIR/$tdir/a1
2199 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
2200 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2201 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2202 $LFS mkdir -i 0 $DIR/$tdir/a2
2203 $LFS setstripe -c 2 -i 0 -s 1M $DIR/$tdir/a2
2204 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2205 $LFS getstripe $DIR/$tdir/a1/f1
2206 $LFS getstripe $DIR/$tdir/a2/f2
# With a second MDT, mirror the same setup as a3/a4 on MDT index 1.
2208 if [ $MDSCOUNT -ge 2 ]; then
2209 $LFS mkdir -i 1 $DIR/$tdir/a3
2210 $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a3
2211 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2212 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2213 $LFS mkdir -i 1 $DIR/$tdir/a4
2214 $LFS setstripe -c 2 -i 0 -s 1M $DIR/$tdir/a4
2215 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2216 $LFS getstripe $DIR/$tdir/a3/f3
2217 $LFS getstripe $DIR/$tdir/a4/f4
2220 cancel_lru_locks osc
2222 echo "Inject failure, to simulate the case of missing the MDT-object"
2223 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2224 do_facet mds1 $LCTL set_param fail_loc=0x1616
# Unlink the files while the fault injection is active on mds1.
2225 rm -f $DIR/$tdir/a1/f1
2226 rm -f $DIR/$tdir/a2/f2
2228 if [ $MDSCOUNT -ge 2 ]; then
2229 do_facet mds2 $LCTL set_param fail_loc=0x1616
2230 rm -f $DIR/$tdir/a3/f3
2231 rm -f $DIR/$tdir/a4/f4
# Disable the fault injection on every MDS that had it enabled.
2237 do_facet mds1 $LCTL set_param fail_loc=0
2238 if [ $MDSCOUNT -ge 2 ]; then
2239 do_facet mds2 $LCTL set_param fail_loc=0
2242 cancel_lru_locks mdc
2243 cancel_lru_locks osc
2245 echo "Inject failure, to simulate the OST0 fail to handle"
2246 echo "MDT0 LFSCK request during the first-stage scanning."
2247 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2248 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2250 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2251 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# First run: every MDT is expected to end up in status "partial".
2253 for k in $(seq $MDSCOUNT); do
2254 # The LFSCK status query interval is 30 seconds. For the case
2255 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2256 # time to guarantee the status sync up.
2257 wait_update_facet mds${k} "$LCTL get_param -n \
2258 mdd.$(facet_svc mds${k}).lfsck_layout |
2259 awk '/^status/ { print \\\$2 }'" "partial" 32 ||
2260 error "(2) MDS${k} is not the expected 'partial'"
# OST1 is expected to be "partial" as well ...
2263 wait_update_facet ost1 "$LCTL get_param -n \
2264 obdfilter.$(facet_svc ost1).lfsck_layout |
2265 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
2266 error "(3) OST1 is not the expected 'partial'"
# ... while OST2 is expected to be "completed".
2269 wait_update_facet ost2 "$LCTL get_param -n \
2270 obdfilter.$(facet_svc ost2).lfsck_layout |
2271 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2272 error "(4) OST2 is not the expected 'completed'"
2275 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the first run exactly one orphan per MDT must be repaired.
# NOTE(review): the error texts below print "mds{1}" / "mds{2}" literally;
# probably "mds1" / "mds2" was intended.
2277 local repaired=$(do_facet mds1 $LCTL get_param -n \
2278 mdd.$(facet_svc mds1).lfsck_layout |
2279 awk '/^repaired_orphan/ { print $2 }')
2280 [ $repaired -eq 1 ] ||
2281 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2283 if [ $MDSCOUNT -ge 2 ]; then
2284 repaired=$(do_facet mds2 $LCTL get_param -n \
2285 mdd.$(facet_svc mds2).lfsck_layout |
2286 awk '/^repaired_orphan/ { print $2 }')
2287 [ $repaired -eq 1 ] ||
2288 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
2291 echo "Trigger layout LFSCK on all devices again to cleanup"
2292 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
# Second run, without fault injection: every MDT must reach "completed".
2294 for k in $(seq $MDSCOUNT); do
2295 # The LFSCK status query interval is 30 seconds. For the case
2296 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2297 # time to guarantee the status sync up.
2298 wait_update_facet mds${k} "$LCTL get_param -n \
2299 mdd.$(facet_svc mds${k}).lfsck_layout |
2300 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2301 error "(8) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be "completed".
2304 for k in $(seq $OSTCOUNT); do
2305 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2306 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2307 awk '/^status/ { print $2 }')
2308 [ "$cur_status" == "completed" ] ||
2309 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# After the second run two orphans per MDT must be repaired.
2313 local repaired=$(do_facet mds1 $LCTL get_param -n \
2314 mdd.$(facet_svc mds1).lfsck_layout |
2315 awk '/^repaired_orphan/ { print $2 }')
2316 [ $repaired -eq 2 ] ||
2317 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2319 if [ $MDSCOUNT -ge 2 ]; then
2320 repaired=$(do_facet mds2 $LCTL get_param -n \
2321 mdd.$(facet_svc mds2).lfsck_layout |
2322 awk '/^repaired_orphan/ { print $2 }')
2323 [ $repaired -eq 2 ] ||
2324 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2327 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Test 19a: with parent-FID verification enabled on OST0000, a read that
# carries a wrong parent FID must be rejected.
2330 check_mount_and_prep
# Stripe count 1, starting at OST index 0, for the test directory.
2331 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2333 echo "foo" > $DIR/$tdir/a0
2334 echo "guard" > $DIR/$tdir/a1
2335 cancel_lru_locks osc
2337 echo "Inject failure, then client will offer wrong parent FID when read"
# Turn on lfsck_verify_pfid on ost1's OST0000 device.
2338 do_facet ost1 $LCTL set_param -n \
2339 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2340 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
# This fail_loc is set through a plain $LCTL call, i.e. on the node running
# the script rather than on a server facet.
2341 $LCTL set_param fail_loc=0x1619
2343 echo "Read RPC with wrong parent FID should be denied"
# A successful read means the verification did not reject the request.
2344 cat $DIR/$tdir/a0 && error "(3) Read should be denied!"
2345 $LCTL set_param fail_loc=0
2347 run_test 19a "OST-object inconsistency self detect"
# Test 19b: an OST-object whose parent FID points to a non-existent
# MDT-object must be left alone while self repair is disabled, and must be
# counted as repaired once lfsck_verify_pfid is enabled and the file is read.
2350 check_mount_and_prep
# Stripe count 1, starting at OST index 0, for the test directory.
2351 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2353 echo "Inject failure stub to make the OST-object to back point to"
2354 echo "non-exist MDT-object"
2356 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
# f0 is written while the fault injection is active on ost1.
2357 do_facet ost1 $LCTL set_param fail_loc=0x1611
2358 echo "foo" > $DIR/$tdir/f0
2359 cancel_lru_locks osc
2360 do_facet ost1 $LCTL set_param fail_loc=0
2362 echo "Nothing should be fixed since self detect and repair is disabled"
# The "repaired" counter is read from the lfsck_verify_pfid parameter output.
2363 local repaired=$(do_facet ost1 $LCTL get_param -n \
2364 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2365 awk '/^repaired/ { print $2 }')
2366 [ $repaired -eq 0 ] ||
2367 error "(1) Expected 0 repaired, but got $repaired"
2369 echo "Read RPC with right parent FID should be accepted,"
2370 echo "and cause parent FID on OST to be fixed"
# Enable verification, then read the file to trigger the repair.
2372 do_facet ost1 $LCTL set_param -n \
2373 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2374 cat $DIR/$tdir/f0 || error "(2) Read should not be denied!"
# Exactly one repair must be recorded now.
2376 repaired=$(do_facet ost1 $LCTL get_param -n \
2377 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2378 awk '/^repaired/ { print $2 }')
2379 [ $repaired -eq 1 ] ||
2380 error "(3) Expected 1 repaired, but got $repaired"
2382 run_test 19b "OST-object inconsistency self repair"
# Test 20: files lose their MDT-object and, for some of them, one of their
# OST-objects. Layout LFSCK is expected to re-create them as "<fid>-R-0"
# entries under .lustre/lost+found/MDT0000/; where a stripe is missing in the
# middle, the re-created file carries the LOV "hole" pattern flag. Each
# re-created file is then checked for layout, size and read/write behaviour.
# Skipped when fewer than two OSTs are configured.
2385 [ $OSTCOUNT -lt 2 ] &&
2386 skip "The test needs at least 2 OSTs" && return
2389 echo "The target MDT-object and some of its OST-object are lost."
2390 echo "The LFSCK should find out the left OST-objects and re-create"
2391 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2392 echo "with the partial OST-objects (LOV EA hole)."
2394 echo "New client can access the file with LOV EA hole via normal"
2395 echo "system tools or commands without crash the system."
2397 echo "For old client, even though it cannot access the file with"
2398 echo "LOV EA hole, it should not cause the system crash."
2401 check_mount_and_prep
2402 $LFS mkdir -i 0 $DIR/$tdir/a1
# Three stripes when more than two OSTs exist, otherwise two; 1M stripe size.
2403 if [ $OSTCOUNT -gt 2 ]; then
2404 $LFS setstripe -c 3 -i 0 -s 1M $DIR/$tdir/a1
2407 $LFS setstripe -c 2 -i 0 -s 1M $DIR/$tdir/a1
2411 # 256 blocks on the stripe0.
2412 # 1 block on the stripe1 for 2 OSTs case.
2413 # 256 blocks on the stripe1 for other cases.
2414 # 1 block on the stripe2 if OSTs > 2
# Each file is filled with $bcount blocks of 4096 bytes.
2415 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2416 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2417 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# The FIDs are needed later to build the expected lost+found names.
2419 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2420 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2421 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2424 $LFS getstripe $DIR/$tdir/a1/f0
2426 $LFS getstripe $DIR/$tdir/a1/f1
2428 $LFS getstripe $DIR/$tdir/a1/f2
# A fourth file is only used when a third stripe exists.
2430 if [ $OSTCOUNT -gt 2 ]; then
2431 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
2432 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2434 $LFS getstripe $DIR/$tdir/a1/f3
2437 cancel_lru_locks osc
2439 echo "Inject failure..."
2440 echo "To simulate f0 lost MDT-object"
2441 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2442 do_facet mds1 $LCTL set_param fail_loc=0x1616
2443 rm -f $DIR/$tdir/a1/f0
2445 echo "To simulate f1 lost MDT-object and OST-object0"
2446 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
2447 do_facet mds1 $LCTL set_param fail_loc=0x161a
2448 rm -f $DIR/$tdir/a1/f1
2450 echo "To simulate f2 lost MDT-object and OST-object1"
# With fail_loc 0x161a still set, fail_val selects which OST-object index is
# dropped together with the MDT-object (per the echo texts: 1 here, 2 below).
2451 do_facet mds1 $LCTL set_param fail_val=1
2452 rm -f $DIR/$tdir/a1/f2
2454 if [ $OSTCOUNT -gt 2 ]; then
2455 echo "To simulate f3 lost MDT-object and OST-object2"
2456 do_facet mds1 $LCTL set_param fail_val=2
2457 rm -f $DIR/$tdir/a1/f3
# The client stays unmounted while LFSCK runs; it is mounted again further
# below, before the re-created files are inspected.
2460 umount_client $MOUNT
2463 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2465 echo "Inject failure to slow down the LFSCK on OST0"
2466 #define OBD_FAIL_LFSCK_DELAY5 0x161b
2467 do_facet ost1 $LCTL set_param fail_loc=0x161b
2469 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2470 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2473 do_facet ost1 $LCTL set_param fail_loc=0
# Every MDT must eventually report "completed".
2475 for k in $(seq $MDSCOUNT); do
2476 # The LFSCK status query interval is 30 seconds. For the case
2477 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2478 # time to guarantee the status sync up.
2479 wait_update_facet mds${k} "$LCTL get_param -n \
2480 mdd.$(facet_svc mds${k}).lfsck_layout |
2481 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2482 error "(2) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be "completed".
2485 for k in $(seq $OSTCOUNT); do
2486 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2487 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2488 awk '/^status/ { print $2 }')
2489 [ "$cur_status" == "completed" ] ||
2490 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan count on mds1: 9 with more than two OSTs, else 4.
2493 local repaired=$(do_facet mds1 $LCTL get_param -n \
2494 mdd.$(facet_svc mds1).lfsck_layout |
2495 awk '/^repaired_orphan/ { print $2 }')
2496 if [ $OSTCOUNT -gt 2 ]; then
2497 [ $repaired -eq 9 ] ||
2498 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
2500 [ $repaired -eq 4 ] ||
2501 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
2504 mount_client $MOUNT || error "(5.0) Fail to start client!"
# Bit that is tested in the pattern reported by "lfs getstripe -L" to detect
# a layout with a hole.
2506 LOV_PATTERN_F_HOLE=0x40000000
2509 # ${fid0}-R-0 is the old f0
2511 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
2512 echo "Check $name, which is the old f0"
2514 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
# The getstripe -L output is given a 0x prefix so that the shell arithmetic
# below evaluates it as a hexadecimal number.
2516 local pattern=0x$($LFS getstripe -L $name)
2517 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2518 error "(5.2) NOT expect pattern flag hole, but got $pattern"
2520 local stripes=$($LFS getstripe -c $name)
2521 if [ $OSTCOUNT -gt 2 ]; then
2522 [ $stripes -eq 3 ] ||
2523 error "(5.3.1) expect the stripe count is 3, but got $stripes"
2525 [ $stripes -eq 2 ] ||
2526 error "(5.3.2) expect the stripe count is 2, but got $stripes"
# f0 lost no OST-object, so the full size and normal access are expected.
2529 local size=$(stat $name | awk '/Size:/ { print $2 }')
2530 [ $size -eq $((4096 * $bcount)) ] ||
2531 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
2533 cat $name > /dev/null || error "(5.5) cannot read $name"
2535 echo "dummy" >> $name || error "(5.6) cannot write $name"
2537 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
2539 touch $name || error "(5.8) cannot touch $name"
2541 rm -f $name || error "(5.9) cannot unlink $name"
2544 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
2546 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2547 if [ $OSTCOUNT -gt 2 ]; then
2548 echo "Check $name, it contains the old f1's stripe1 and stripe2"
2550 echo "Check $name, it contains the old f1's stripe1"
2553 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
# f1 lost stripe0, so the hole flag must be set here.
2555 pattern=0x$($LFS getstripe -L $name)
2556 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2557 error "(6.2) expect pattern flag hole, but got $pattern"
2559 stripes=$($LFS getstripe -c $name)
2560 if [ $OSTCOUNT -gt 2 ]; then
2561 [ $stripes -eq 3 ] ||
2562 error "(6.3.1) expect the stripe count is 3, but got $stripes"
2564 [ $stripes -eq 2 ] ||
2565 error "(6.3.2) expect the stripe count is 2, but got $stripes"
2568 size=$(stat $name | awk '/Size:/ { print $2 }')
2569 [ $size -eq $((4096 * $bcount)) ] ||
2570 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
2572 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# Copy block by block, continuing past read errors (conv=noerror) and padding
# short reads (conv=sync); count the "Input/output error" messages dd prints.
2574 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2575 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2578 [ $failures -eq 256 ] ||
2579 error "(6.6) expect 256 IO failures, but get $failures"
2581 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2582 [ $size -eq $((4096 * $bcount)) ] ||
2583 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Writing block 0 hits the missing stripe0 and must fail; writing block 300
# lands in an existing stripe and must succeed.
2585 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
2586 error "(6.8) write to the LOV EA hole should fail"
2588 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
2589 error "(6.9) write to normal stripe should NOT fail"
2591 echo "foo" >> $name && error "(6.10) append write $name should fail"
2593 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
2595 touch $name || error "(6.12) cannot touch $name"
2597 rm -f $name || error "(6.13) cannot unlink $name"
2600 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
2602 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
2603 if [ $OSTCOUNT -gt 2 ]; then
2604 echo "Check $name, it contains the old f2's stripe0 and stripe2"
2606 echo "Check $name, it contains the old f2's stripe0"
2609 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
2611 pattern=0x$($LFS getstripe -L $name)
2612 stripes=$($LFS getstripe -c $name)
2613 size=$(stat $name | awk '/Size:/ { print $2 }')
# More than two OSTs: f2 lost the middle stripe1, so a hole is expected and
# the checks mirror the f1 case with the failing/succeeding offsets swapped.
2614 if [ $OSTCOUNT -gt 2 ]; then
2615 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2616 error "(7.2.1) expect pattern flag hole, but got $pattern"
2618 [ $stripes -eq 3 ] ||
2619 error "(7.3.1) expect the stripe count is 3, but got $stripes"
2621 [ $size -eq $((4096 * $bcount)) ] ||
2622 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
2624 cat $name > /dev/null &&
2625 error "(7.5.1) normal read $name should fail"
2627 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2628 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2630 [ $failures -eq 256 ] ||
2631 error "(7.6) expect 256 IO failures, but get $failures"
2633 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2634 [ $size -eq $((4096 * $bcount)) ] ||
2635 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
2637 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
2638 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
2640 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
2641 error "(7.8.1) write to normal stripe should NOT fail"
2643 echo "foo" >> $name &&
2644 error "(7.8.3) append write $name should fail"
2646 chown $RUNAS_ID:$RUNAS_GID $name ||
2647 error "(7.9.1) cannot chown on $name"
2649 touch $name || error "(7.10.1) cannot touch $name"
# Two-OST case: only stripe0 is left, so a plain single-stripe file without
# the hole flag is expected.
2651 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2652 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
2654 [ $stripes -eq 1 ] ||
2655 error "(7.3.2) expect the stripe count is 1, but got $stripes"
2658 [ $size -eq $((4096 * (256 + 0))) ] ||
2659 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
2661 cat $name > /dev/null || error "(7.5.2) cannot read $name"
2663 echo "dummy" >> $name || error "(7.8.2) cannot write $name"
2665 chown $RUNAS_ID:$RUNAS_GID $name ||
2666 error "(7.9.2) cannot chown on $name"
2668 touch $name || error "(7.10.2) cannot touch $name"
2671 rm -f $name || error "(7.11) cannot unlink $name"
# f3 only exists when more than two OSTs are configured.
2673 [ $OSTCOUNT -le 2 ] && return
2676 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
2678 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2679 echo "Check $name, which contains the old f3's stripe0 and stripe1"
2681 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
2683 pattern=0x$($LFS getstripe -L $name)
2684 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2685 error "(8.2) NOT expect pattern flag hole, but got $pattern"
2687 stripes=$($LFS getstripe -c $name)
2688 # LFSCK does not know the old f3 had 3 stripes.
2689 # It only tries to find as much as possible.
2690 # The stripe count depends on the last stripe's offset.
2691 [ $stripes -eq 2 ] ||
2692 error "(8.3) expect the stripe count is 2, but got $stripes"
2694 size=$(stat $name | awk '/Size:/ { print $2 }')
2696 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
2697 error "(8.4) expect the size $((4096 * 512)), but got $size"
2699 cat $name > /dev/null || error "(8.5) cannot read $name"
2701 echo "dummy" >> $name || error "(8.6) cannot write $name"
2703 chown $RUNAS_ID:$RUNAS_GID $name ||
2704 error "(8.7) cannot chown on $name"
2706 touch $name || error "(8.8) cannot touch $name"
2708 rm -f $name || error "(8.9) cannot unlink $name"
2710 run_test 20 "Handle the orphan with dummy LOV EA slot properly"
# Test 21: starting LFSCK without a "-t" type argument is expected to start
# both the namespace and the layout component; both must report
# "scanning-phase1" right after the start.
# Skipped when the MDS is older than 2.5.59.
2713 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
2714 skip "ignore the test if MDS is older than 2.5.59" && return
2716 check_mount_and_prep
2717 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
2719 echo "Start all LFSCK components by default (-s 1)"
# NOTE(review): "-s 1" presumably throttles the scan so that the status can
# still be sampled in phase 1 - confirm against the lfsck_start options.
2720 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
2721 error "Fail to start LFSCK"
2723 echo "namespace LFSCK should be in 'scanning-phase1' status"
2724 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
2725 [ "$STATUS" == "scanning-phase1" ] ||
2726 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
2728 echo "layout LFSCK should be in 'scanning-phase1' status"
2729 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
2730 [ "$STATUS" == "scanning-phase1" ] ||
2731 error "Expect layout 'scanning-phase1', but got '$STATUS'"
2733 echo "Stop all LFSCK components by default"
# Stopping without a type argument as well.
2734 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
2735 error "Fail to stop LFSCK"
2737 run_test 21 "run all LFSCK components by default"
# Test 22a: a directory created under fault injection gets a bad ".." entry
# that refers to another directory, which is then removed. Namespace LFSCK is
# expected to report one repaired unmatched pair, after which listing the
# directory must work.
# Skipped when fewer than two MDTs are configured.
2740 [ $MDSCOUNT -lt 2 ] &&
2741 skip "We need at least 2 MDSes for this test" && return
2744 echo "The parent_A references the child directory via some name entry,"
2745 echo "but the child directory back references another parent_B via its"
# The inner double quotes are escaped so that the message really prints "..";
# unescaped, they end and restart the string and the quotes are lost.
2746 echo "\"..\" name entry. The parent_A does not exist. Then the namesapce"
2747 echo "LFSCK will repair the child directory's \"..\" name entry."
2750 check_mount_and_prep
# guard and foo are both created on MDT index 1.
2752 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2753 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2755 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2756 echo "The dummy's dotdot name entry references the guard."
2757 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
# dummy is created on MDT index 0 while the fault injection is active.
2758 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2759 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2760 error "(3) Fail to mkdir on MDT0"
2761 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Remove the directory that dummy's ".." entry now refers to.
2763 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
2765 echo "Trigger namespace LFSCK to repair unmatched pairs"
2766 $START_NAMESPACE -A -r ||
2767 error "(5) Fail to start LFSCK for namespace"
# Wait for the namespace LFSCK on $SINGLEMDS to report "completed".
2769 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2770 mdd.${MDT_DEV}.lfsck_namespace |
2771 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2773 error "(6) unexpected status"
# Exactly one unmatched pair must be counted as repaired.
2776 local repaired=$($SHOW_NAMESPACE |
2777 awk '/^unmatched_pairs_repaired/ { print $2 }')
2778 [ $repaired -eq 1 ] ||
2779 error "(7) Fail to repair unmatched pairs: $repaired"
2781 echo "'ls' should success after namespace LFSCK repairing"
2782 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
2783 error "(8) ls should success."
2785 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Test 22b: a directory created under fault injection gets a bad ".." entry
# and a bad linkEA. Before the repair, fid2path must not resolve the
# directory's FID to its real path; after namespace LFSCK has repaired one
# unmatched pair, it must.
# Skipped when fewer than two MDTs are configured.
2788 [ $MDSCOUNT -lt 2 ] &&
2789 skip "We need at least 2 MDSes for this test" && return
2792 echo "The parent_A references the child directory via the name entry_B,"
2793 echo "but the child directory back references another parent_C via its"
# The inner double quotes are escaped so that the messages really print "..";
# unescaped, they end and restart the string and the quotes are lost.
2794 echo "\"..\" name entry. The parent_C exists, but there is no the name"
2795 echo "entry_B under the parent_B. Then the namesapce LFSCK will repair"
2796 echo "the child directory's \"..\" name entry and its linkEA."
2799 check_mount_and_prep
# guard and foo are both created on MDT index 1.
2801 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2802 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2804 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2805 echo "and bad linkEA. The dummy's dotdot name entry references the"
2806 echo "guard. The dummy's linkEA references n non-exist name entry."
2807 #define OBD_FAIL_LFSCK_BAD_PARENT2 0x161f
# dummy is created on MDT index 0 while the fault injection is active.
2808 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161f
2809 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2810 error "(3) Fail to mkdir on MDT0"
2811 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Remember the FID; it is resolved again after the repair.
2813 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
2814 echo "fid2path should NOT work on the dummy's FID $dummyfid"
2815 local dummyname=$($LFS fid2path $DIR $dummyfid)
2816 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
2817 error "(4) fid2path works unexpectedly."
2819 echo "Trigger namespace LFSCK to repair unmatched pairs"
2820 $START_NAMESPACE -A -r ||
2821 error "(5) Fail to start LFSCK for namespace"
# Wait for the namespace LFSCK on $SINGLEMDS to report "completed".
2823 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2824 mdd.${MDT_DEV}.lfsck_namespace |
2825 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2827 error "(6) unexpected status"
# Exactly one unmatched pair must be counted as repaired.
2830 local repaired=$($SHOW_NAMESPACE |
2831 awk '/^unmatched_pairs_repaired/ { print $2 }')
2832 [ $repaired -eq 1 ] ||
2833 error "(7) Fail to repair unmatched pairs: $repaired"
2835 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
2836 local dummyname=$($LFS fid2path $DIR $dummyfid)
2837 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
2838 error "(8) fid2path does not work"
2840 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a: a name entry whose MDT-object is gone (dangling entry). The first
# namespace LFSCK run must count the dangling entry while 'ls' still fails;
# the second run, started with the additional -C option, must leave the entry
# usable again.
# Skipped when fewer than two MDTs are configured.
2843 [ $MDSCOUNT -lt 2 ] &&
2844 skip "We need at least 2 MDSes for this test" && return
2847 echo "The name entry is there, but the MDT-object for such name "
2848 echo "entry does not exist. The namespace LFSCK should find out "
2849 echo "and repair the inconsistency as required."
2852 check_mount_and_prep
# d0 lives on MDT index 0, its child d1 on MDT index 1.
2854 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
2855 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
2857 echo "Inject failure stub on MDT1 to simulate dangling name entry"
2858 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
# d1 is removed while the fault injection is active on mds2.
2859 do_facet mds2 $LCTL set_param fail_loc=0x1620
2860 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
2861 do_facet mds2 $LCTL set_param fail_loc=0
2863 echo "'ls' should fail because of dangling name entry"
2864 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
2866 echo "Trigger namespace LFSCK to find out dangling name entry"
2867 $START_NAMESPACE -A -r ||
2868 error "(5) Fail to start LFSCK for namespace"
# Wait for the namespace LFSCK on $SINGLEMDS to report "completed".
2870 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2871 mdd.${MDT_DEV}.lfsck_namespace |
2872 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2874 error "(6) unexpected status"
# The dangling entry must be counted even though nothing is re-created yet.
2877 local repaired=$($SHOW_NAMESPACE |
2878 awk '/^dangling_repaired/ { print $2 }')
2879 [ $repaired -eq 1 ] ||
2880 error "(7) Fail to repair dangling name entry: $repaired"
2882 echo "'ls' should fail because not re-create MDT-object by default"
2883 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
2885 echo "Trigger namespace LFSCK again to repair dangling name entry"
# NOTE(review): -C presumably asks LFSCK to re-create the missing MDT-object
# (the echo above says this is not done by default) - confirm.
2886 $START_NAMESPACE -A -r -C ||
2887 error "(9) Fail to start LFSCK for namespace"
2889 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2890 mdd.${MDT_DEV}.lfsck_namespace |
2891 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2893 error "(10) unexpected status"
# The second run must again count exactly one dangling entry.
2896 repaired=$($SHOW_NAMESPACE |
2897 awk '/^dangling_repaired/ { print $2 }')
2898 [ $repaired -eq 1 ] ||
2899 error "(11) Fail to repair dangling name entry: $repaired"
2901 echo "'ls' should success after namespace LFSCK repairing"
2902 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
2904 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b: a hard link created under fault injection refers to a
# non-existent object. Namespace LFSCK started with -C is expected to count
# one dangling repair and one multiple-linked repair, and the link must
# afterwards return the content of the original file f0.
2908 echo "The objectA has multiple hard links, one of them corresponding"
2909 echo "to the name entry_B. But there is something wrong for the name"
2910 echo "entry_B and cause entry_B to references non-exist object_C."
2911 echo "In the first-stage scanning, the LFSCK will think the entry_B"
2912 echo "as dangling, and re-create the lost object_C. When the LFSCK"
2913 echo "comes to the second-stage scanning, it will find that the"
2914 echo "former re-creating object_C is not proper, and will try to"
2915 echo "replace the object_C with the real object_A."
2918 check_mount_and_prep
# d0 on MDT index 0 with two regular files; f0 holds the data checked at the
# end of the test.
2920 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
2921 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
2922 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
2924 echo "Inject failure stub on MDT0 to simulate dangling name entry"
2925 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
# The hard link "foo" to f0 is created while the fault injection is active.
2926 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
2927 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
2928 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2930 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
2932 echo "'ls' should fail because of dangling name entry"
2933 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
2934 error "(6) ls should fail."
2936 echo "Trigger namespace LFSCK to find out dangling name entry"
2937 $START_NAMESPACE -r -C ||
2938 error "(7) Fail to start LFSCK for namespace"
# Wait for the namespace LFSCK on $SINGLEMDS to report "completed".
2940 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2941 mdd.${MDT_DEV}.lfsck_namespace |
2942 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2944 error "(8) unexpected status"
# One dangling entry must be counted as repaired ...
2947 local repaired=$($SHOW_NAMESPACE |
2948 awk '/^dangling_repaired/ { print $2 }')
2949 [ $repaired -eq 1 ] ||
2950 error "(9) Fail to repair dangling name entry: $repaired"
# ... and one multiple-linked repair must be recorded as well.
2952 repaired=$($SHOW_NAMESPACE |
2953 awk '/^multiple_linked_repaired/ { print $2 }')
2954 [ $repaired -eq 1 ] ||
2955 error "(10) Fail to drop the former created object: $repaired"
# The link must now deliver f0's content.
2957 local data=$(cat $DIR/$tdir/d0/foo)
2958 [ "$data" == "dummy" ] ||
2959 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
2961 run_test 23b "LFSCK can repair dangling name entry (2)"
# test_23c: namespace LFSCK must not replace a re-created object that has
# been modified before LFSCK finished.
#
# Steps, as driven below:
#   1. Create d0 on MDT index 0 with two files, f0 ("dummy") and f1 ("dead").
#   2. Hard-link f0 to "foo" while fail_loc 0x1621 (OBD_FAIL_LFSCK_DANGLING3)
#      is set on $SINGLEMDS, then unlink f1; 'ls' on foo is expected to fail.
#   3. Set fail_loc 0x1602 (OBD_FAIL_LFSCK_DELAY3) with fail_val=10, start
#      namespace LFSCK with "-r -C", and wait until foo stats with size 0.
#   4. Append to foo, drop the osc LRU locks, clear fail_loc/fail_val and
#      wait for LFSCK status "completed".
#   5. Expect dangling_repaired == 1 and foo NOT to read back as "dummy".
#
# Uses from the environment: DIR, tdir, LFS, LCTL, SINGLEMDS, MDT_DEV,
# START_NAMESPACE, SHOW_NAMESPACE and the test-framework helpers
# check_mount_and_prep, do_facet, wait_update_facet, cancel_lru_locks, error.
test_23c() {
	local repaired
	local data

	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. And then others"
	echo "modified the re-created object_C. When the LFSCK comes to the"
	echo "second-stage scanning, it will find that the former re-creating"
	echo "object_C maybe wrong and try to replace the object_C with the"
	echo "real object_A. But because object_C has been modified, so the"
	echo "LFSCK cannot replace it."

	check_mount_and_prep

	$LFS mkdir -i 0 "$DIR/$tdir/d0" || error "(1) Fail to mkdir d0 on MDT0"
	echo "dummy" > "$DIR/$tdir/d0/f0" || error "(2) Fail to touch on MDT0"
	echo "dead" > "$DIR/$tdir/d0/f1" || error "(3) Fail to touch on MDT0"

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
	ln "$DIR/$tdir/d0/f0" "$DIR/$tdir/d0/foo" ||
		error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rm -f "$DIR/$tdir/d0/f1" || error "(5) Fail to unlink $DIR/$tdir/d0/f1"

	echo "'ls' should fail because of dangling name entry"
	ls -ail "$DIR/$tdir/d0/foo" > /dev/null 2>&1 &&
		error "(6) ls should fail."

	#define OBD_FAIL_LFSCK_DELAY3		0x1602
	do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -r -C ||
		error "(7) Fail to start LFSCK for namespace"

	wait_update_facet client "stat $DIR/$tdir/d0/foo |
		awk '/Size/ { print \\\$2 }'" "0" 32 || {
		# Show the file that was being polled (not an unrelated path)
		# together with the LFSCK state, then fail.
		stat "$DIR/$tdir/d0/foo"
		$SHOW_NAMESPACE
		error "(8) unexpected size"
	}

	# Modify the re-created object while the fail_loc above is still set.
	echo "data" >> "$DIR/$tdir/d0/foo" || error "(9) Fail to write"
	cancel_lru_locks osc

	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK namespace state so the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(10) unexpected status"
	}

	# Default a missing counter to 0 so the numeric test stays well-formed.
	repaired=$($SHOW_NAMESPACE |
		   awk '/^dangling_repaired/ { print $2 }')
	[ "${repaired:-0}" -eq 1 ] ||
		error "(11) Fail to repair dangling name entry: $repaired"

	data=$(cat "$DIR/$tdir/d0/foo")
	[ "$data" != "dummy" ] ||
		error "(12) The $DIR/$tdir/d0/foo should not be recovered"
}
run_test 23c "LFSCK can repair dangling name entry (3)"
# Put back the MDS/OST sizing and OST count that were saved near the top of
# this script before the values were adjusted for these tests.
OSTCOUNT=$SAVED_OSTCOUNT
OSTSIZE=$SAVED_OSTSIZE
MDSSIZE=$SAVED_MDSSIZE

# Remove the lfsck flag from the debug mask again; a failure here is
# deliberately ignored.
$LCTL set_param debug=-lfsck >/dev/null || :
# Finally, clean up the system.