Whamcloud - gitweb
LU-7381 e2fsck: fix e2fsck -fD directory truncation
author: Andreas Dilger <andreas.dilger@intel.com>
Fri, 13 Nov 2015 05:35:17 +0000 (22:35 -0700)
committer: Andreas Dilger <andreas.dilger@intel.com>
Tue, 10 May 2016 05:12:34 +0000 (23:12 -0600)
When an extent-mapped directory is compacted by "e2fsck -fD" and
frees enough leaf blocks that it loses an extent tree index block,
the old e2fsck_rehash_dir->ext2fs_block_iterate3->write_dir_block()
code would not free the extent block, which would result in the
extent tree becoming corrupted when it is written out.

    Pass 1: Checking inodes, blocks, and sizes
    Inode 17825800, end of extent exceeds allowed value
            (logical block 710, physical block 570459684, len 1019)

This results in loss of a whole index block of directory leaf blocks
and thousands or millions of files in lost+found.

Fix e2fsck_rehash_dir() to call ext2fs_punch() to free the blocks
at the end of the directory instead of trying to handle this itself
while writing out the directory.  That properly handles all of the
cases of updating the extent tree as well as accounting for blocks
that are released (both leaf blocks and index blocks).

Add a test case for compacting the directory to be smaller than the
index block that originally caused the corruption.

e2fsprogs-commit: 19961cd0003564c63c33ec14e69dfec6d81a2238
Signed-off-by: Andreas Dilger <andreas.dilger@intel.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Change-Id: I2e075849423693ebf4468fd7b0f41d6b2f500c1e
Reviewed-on: http://review.whamcloud.com/17153
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
e2fsck/rehash.c
tests/f_extent_htree/expect.1 [new file with mode: 0644]
tests/f_extent_htree/expect.2 [new file with mode: 0644]
tests/f_extent_htree/image.gz [new file with mode: 0644]
tests/f_extent_htree/name [new file with mode: 0644]
tests/f_extent_htree/script [new file with mode: 0644]
tests/f_h_badnode/expect.1
tests/f_h_badnode/expect.2

diff --git a/e2fsck/rehash.c b/e2fsck/rehash.c
index b544370..60967f6 100644 (file)
 #include "e2fsck.h"
 #include "problem.h"
 
+#undef REHASH_DEBUG
+
 struct fill_dir_struct {
        char *buf;
        struct ext2_inode *inode;
        errcode_t err;
+       ext2_ino_t ino;
        e2fsck_t ctx;
        struct hash_entry *harray;
        int max_array, num_array;
@@ -639,8 +642,8 @@ static errcode_t calculate_tree(ext2_filsys fs,
 struct write_dir_struct {
        struct out_dir *outdir;
        errcode_t       err;
+       ext2_ino_t      ino;
        e2fsck_t        ctx;
-       blk64_t         cleared;
 };
 
 /*
@@ -657,28 +660,35 @@ static int write_dir_block(ext2_filsys fs,
        blk64_t blk;
        char    *dir;
 
-       if (*block_nr == 0)
+#ifdef REHASH_DEBUG
+       printf("%u: write_dir_block %lld:%lld", wd->ino, blockcnt, *block_nr);
+#endif
+       if (*block_nr == 0) {
+#ifdef REHASH_DEBUG
+               printf(" - skip\n");
+#endif
                return 0;
+       }
+       /* Don't free blocks at the end of the directory, they will be
+        * truncated by the caller. */
        if (blockcnt >= wd->outdir->num) {
-               e2fsck_read_bitmaps(wd->ctx);
-               blk = *block_nr;
-               /*
-                * In theory, we only release blocks from the end of the
-                * directory file, so it's fine to clobber a whole cluster at
-                * once.
-                */
-               if (blk % EXT2FS_CLUSTER_RATIO(fs) == 0) {
-                       ext2fs_block_alloc_stats2(fs, blk, -1);
-                       wd->cleared++;
-               }
-               *block_nr = 0;
-               return BLOCK_CHANGED;
+#ifdef REHASH_DEBUG
+               printf(" - not freed\n");
+#endif
+               return 0;
        }
-       if (blockcnt < 0)
+       if (blockcnt < 0) {
+#ifdef REHASH_DEBUG
+               printf(" - skip\n");
+#endif
                return 0;
+       }
 
        dir = wd->outdir->buf + (blockcnt * fs->blocksize);
        wd->err = ext2fs_write_dir_block3(fs, *block_nr, dir, 0);
+#ifdef REHASH_DEBUG
+       printf(" - write (%d)\n", wd->err);
+#endif
        if (wd->err)
                return BLOCK_ABORT;
        return 0;
@@ -698,10 +708,10 @@ static errcode_t write_directory(e2fsck_t ctx, ext2_filsys fs,
 
        wd.outdir = outdir;
        wd.err = 0;
+       wd.ino = ino;
        wd.ctx = ctx;
-       wd.cleared = 0;
 
-       retval = ext2fs_block_iterate3(fs, ino, 0, 0,
+       retval = ext2fs_block_iterate3(fs, ino, 0, NULL,
                                       write_dir_block, &wd);
        if (retval)
                return retval;
@@ -713,14 +723,17 @@ static errcode_t write_directory(e2fsck_t ctx, ext2_filsys fs,
                inode.i_flags &= ~EXT2_INDEX_FL;
        else
                inode.i_flags |= EXT2_INDEX_FL;
-       retval = ext2fs_inode_size_set(fs, &inode,
-                                      outdir->num * fs->blocksize);
+#ifdef REHASH_DEBUG
+       printf("%u: set inode size to %u blocks = %u bytes\n",
+              ino, outdir->num, outdir->num * fs->blocksize);
+#endif
+       retval = ext2fs_inode_size_set(fs, &inode, (ext2_off64_t)outdir->num *
+                                                  fs->blocksize);
        if (retval)
                return retval;
-       ext2fs_iblk_sub_blocks(fs, &inode, wd.cleared);
-       e2fsck_write_inode(ctx, ino, &inode, "rehash_dir");
 
-       return 0;
+       /* ext2fs_punch() also calls ext2fs_write_inode() */
+       return ext2fs_punch(fs, ino, &inode, NULL, outdir->num, ~0ULL);
 }
 
 errcode_t e2fsck_rehash_dir(e2fsck_t ctx, ext2_ino_t ino)
@@ -729,32 +742,25 @@ errcode_t e2fsck_rehash_dir(e2fsck_t ctx, ext2_ino_t ino)
        errcode_t               retval;
        struct ext2_inode       inode;
        char                    *dir_buf = 0;
-       struct fill_dir_struct  fd;
-       struct out_dir          outdir;
+       struct fill_dir_struct  fd = { NULL };
+       struct out_dir          outdir = { 0 };
 
-       outdir.max = outdir.num = 0;
-       outdir.buf = 0;
-       outdir.hashes = 0;
        e2fsck_read_inode(ctx, ino, &inode, "rehash_dir");
 
        retval = ENOMEM;
-       fd.harray = 0;
        dir_buf = malloc(inode.i_size);
        if (!dir_buf)
                goto errout;
 
        fd.max_array = inode.i_size / 32;
-       fd.num_array = 0;
        fd.harray = malloc(fd.max_array * sizeof(struct hash_entry));
        if (!fd.harray)
                goto errout;
 
+       fd.ino = ino;
        fd.ctx = ctx;
        fd.buf = dir_buf;
        fd.inode = &inode;
-       fd.err = 0;
-       fd.dir_size = 0;
-       fd.compress = 0;
        if (!(fs->super->s_feature_compat & EXT2_FEATURE_COMPAT_DIR_INDEX) ||
            (inode.i_size / fs->blocksize) < 2)
                fd.compress = 1;
diff --git a/tests/f_extent_htree/expect.1 b/tests/f_extent_htree/expect.1
new file mode 100644 (file)
index 0000000..223ca69
--- /dev/null
@@ -0,0 +1,29 @@
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 3A: Optimizing directories
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+
+test_filesys: ***** FILE SYSTEM WAS MODIFIED *****
+
+         352 inodes used (41.12%, out of 856)
+           0 non-contiguous files (0.0%)
+           1 non-contiguous directory (0.3%)
+             # of inodes with ind/dind/tind blocks: 0/0/0
+             Extent depth histogram: 342/1
+         586 blocks used (68.94%, out of 850)
+           0 bad blocks
+           0 large files
+
+         340 regular files
+           3 directories
+           0 character device files
+           0 block device files
+           0 fifos
+           0 links
+           0 symbolic links (0 fast symbolic links)
+           0 sockets
+------------
+         343 files
+Exit status is 1
diff --git a/tests/f_extent_htree/expect.2 b/tests/f_extent_htree/expect.2
new file mode 100644 (file)
index 0000000..860b491
--- /dev/null
@@ -0,0 +1,7 @@
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 352/856 files (0.3% non-contiguous), 586/850 blocks
+Exit status is 0
diff --git a/tests/f_extent_htree/image.gz b/tests/f_extent_htree/image.gz
new file mode 100644 (file)
index 0000000..284207e
Binary files /dev/null and b/tests/f_extent_htree/image.gz differ
diff --git a/tests/f_extent_htree/name b/tests/f_extent_htree/name
new file mode 100644 (file)
index 0000000..fc3812d
--- /dev/null
@@ -0,0 +1 @@
+htree extent compression
diff --git a/tests/f_extent_htree/script b/tests/f_extent_htree/script
new file mode 100644 (file)
index 0000000..60854c6
--- /dev/null
@@ -0,0 +1,69 @@
+#!/bin/bash
+
+FSCK_OPT="-fyvD"
+. $cmd_dir/run_e2fsck
+
+exit $?
+# This script depends on "mke2fs -d", which is only in master and not maint,
+# to populate the file directory tree poorly (namely that there are no
+# contiguous blocks in the directory leaf and the extent tree is large).
+
+# Once the "mke2fs -d" option is available on the "maint" branch, the
+# above few lines should be deleted, along with the "image.gz" file.
+
+TMPDIR=${TMPDIR:-"/tmp"}
+OUT=$test_name.log
+
+FSCK_OPT="-fyvD"
+SKIP_GUNZIP="true"
+
+NAMELEN=250
+SRC=$TMPDIR/$test_name.tmp
+SUB=subdir
+BASE=$SRC/$SUB/$(yes | tr -d '\n' | dd bs=$NAMELEN count=1 2> /dev/null)
+TMPFILE=${TMPFILE:-"$TMPDIR/image"}
+BSIZE=1024
+
+> $OUT
+mkdir -p $SRC/$SUB
+# calculate the number of files needed to create the directory extent tree
+# deep enough to exceed the in-inode index and spill into an index block.
+#
+# dirents per block * extents per block * (index blocks > i_blocks)
+NUM=$(((BSIZE / (NAMELEN + 8)) * (BSIZE / 12) * 2))
+# Create source files. Unfortunately hard links will be copied as links,
+# and blocks with only NULs will be turned into holes.
+if [ ! -f $BASE.1 ]; then
+       for N in $(seq $NUM); do
+               echo "foo" > $BASE.$N
+       done >> $OUT
+fi
+
+# make filesystem with enough inodes and blocks to hold all the test files
+> $TMPFILE
+NUM=$((NUM * 5 / 3))
+echo "mke2fs -b $BSIZE -O dir_index,extent -d$SRC -N$NUM $TMPFILE $NUM" >> $OUT
+$MKE2FS -b $BSIZE -O dir_index,extent -d$SRC -N$NUM $TMPFILE $NUM >> $OUT 2>&1
+rm -r $SRC
+
+# Run e2fsck to convert dir to htree before deleting the files, as mke2fs
+# doesn't do this.  Run second e2fsck to verify there is no corruption yet.
+(
+       EXP1=$test_dir/expect.pre.1
+       EXP2=$test_dir/expect.pre.2
+       OUT1=$test_name.pre.1.log
+       OUT2=$test_name.pre.2.log
+       DESCRIPTION="$(cat $test_dir/name) setup"
+       . $cmd_dir/run_e2fsck
+)
+
+# generate a list of filenames for debugfs to delete, one from each leaf block
+DELETE_LIST=$TMPDIR/delete.$$
+$DEBUGFS -c -R "htree subdir" $TMPFILE 2>> $OUT |
+       grep -A2 "Reading directory block" |
+       awk '/yyyyy/ { print "rm '$SUB'/"$4 }' > $DELETE_LIST
+$DEBUGFS -w -f $DELETE_LIST $TMPFILE >> $OUT 2>&1
+rm $DELETE_LIST
+cp $TMPFILE $TMPFILE.sav
+
+. $cmd_dir/run_e2fsck
diff --git a/tests/f_h_badnode/expect.1 b/tests/f_h_badnode/expect.1
index ce2adb3..95b1cee 100644 (file)
@@ -14,5 +14,5 @@ Pass 4: Checking reference counts
 Pass 5: Checking group summary information
 
 test_filesys: ***** FILE SYSTEM WAS MODIFIED *****
-test_filesys: 47730/100192 files (0.0% non-contiguous), 13551/31745 blocks
+test_filesys: 47730/100192 files (0.0% non-contiguous), 13550/31745 blocks
 Exit status is 1
diff --git a/tests/f_h_badnode/expect.2 b/tests/f_h_badnode/expect.2
index b9dadb7..65985d1 100644 (file)
@@ -3,5 +3,5 @@ Pass 2: Checking directory structure
 Pass 3: Checking directory connectivity
 Pass 4: Checking reference counts
 Pass 5: Checking group summary information
-test_filesys: 47730/100192 files (0.0% non-contiguous), 13551/31745 blocks
+test_filesys: 47730/100192 files (0.0% non-contiguous), 13550/31745 blocks
 Exit status is 0