From: vitaly Date: Sat, 22 Apr 2006 14:59:49 +0000 (+0000) Subject: Merge b1_5 from b1_4 (20060421_1413) X-Git-Tag: v1_7_140~1^12~3^2 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=0648d10222093fb0522cf5f0622a66c728761d52;p=fs%2Flustre-release.git Merge b1_5 from b1_4 (20060421_1413) --- diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch index 657ecf4..b6439e6 100644 --- a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch +++ b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch @@ -2,7 +2,7 @@ Index: linux-2.6.12-rc6/fs/ext3/extents.c =================================================================== --- linux-2.6.12-rc6.orig/fs/ext3/extents.c 2005-06-14 16:31:25.756503133 +0200 +++ linux-2.6.12-rc6/fs/ext3/extents.c 2005-06-14 16:31:25.836581257 +0200 -@@ -0,0 +1,2347 @@ +@@ -0,0 +1,2353 @@ +/* + * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -176,9 +176,9 @@ Index: linux-2.6.12-rc6/fs/ext3/extents.c + +static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) +{ -+ struct ext3_extent_header *neh; -+ neh = EXT_ROOT_HDR(tree); -+ neh->eh_generation++; ++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) | ++ (EXT_GENERATION(neh) + 1); +} + +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) @@ -448,8 +448,12 @@ Index: linux-2.6.12-rc6/fs/ext3/extents.c + + eh = EXT_ROOT_HDR(tree); + EXT_ASSERT(eh); -+ if (ext3_ext_check_header(eh)) ++ if (ext3_ext_check_header(eh)) { ++ /* don't free previously allocated path ++ * -- caller should take care */ ++ path = NULL; + goto err; ++ } + + i = depth = EXT_DEPTH(tree); + EXT_ASSERT(eh->eh_max); @@ -506,8 +510,10 @@ Index: linux-2.6.12-rc6/fs/ext3/extents.c + +err: + printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); -+ ext3_ext_drop_refs(path); -+ 
kfree(path); ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } + return ERR_PTR(-EIO); +} + @@ -2644,7 +2650,7 @@ Index: linux-2.6.12-rc6/include/linux/ext3_extents.h =================================================================== --- linux-2.6.12-rc6.orig/include/linux/ext3_extents.h 2005-06-14 16:31:25.780917195 +0200 +++ linux-2.6.12-rc6/include/linux/ext3_extents.h 2005-06-14 16:31:25.932284381 +0200 -@@ -0,0 +1,264 @@ +@@ -0,0 +1,262 @@ +/* + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -2742,7 +2748,7 @@ Index: linux-2.6.12-rc6/include/linux/ext3_extents.h + __u16 eh_entries; /* number of valid entries */ + __u16 eh_max; /* capacity of store in entries */ + __u16 eh_depth; /* has tree real underlaying blocks? */ -+ __u32 eh_generation; /* generation of the tree */ ++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ +}; + +#define EXT3_EXT_MAGIC 0xf30a @@ -2843,15 +2849,13 @@ Index: linux-2.6.12-rc6/include/linux/ext3_extents.h + (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) ++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) ++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ + -+#define EXT_ROOT_HDR(tree) \ -+ ((struct ext3_extent_header *) (tree)->root) -+#define EXT_BLOCK_HDR(bh) \ -+ ((struct ext3_extent_header *) (bh)->b_data) -+#define EXT_DEPTH(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) -+#define EXT_GENERATION(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) ++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) ++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) + + +#define EXT_ASSERT(__x__) if 
(!(__x__)) BUG(); diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch index 0ee8d28..9e78214 100644 --- a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch +++ b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch @@ -3,7 +3,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c =================================================================== --- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2005-02-17 22:07:57.023609040 +0300 +++ linux-2.6.5-sles9/fs/ext3/extents.c 2005-02-23 01:02:37.396435640 +0300 -@@ -0,0 +1,2349 @@ +@@ -0,0 +1,2355 @@ +/* + * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -177,9 +177,9 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + +static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) +{ -+ struct ext3_extent_header *neh; -+ neh = EXT_ROOT_HDR(tree); -+ neh->eh_generation++; ++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) | ++ (EXT_GENERATION(neh) + 1); +} + +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) @@ -449,8 +449,12 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + + eh = EXT_ROOT_HDR(tree); + EXT_ASSERT(eh); -+ if (ext3_ext_check_header(eh)) ++ if (ext3_ext_check_header(eh)) { ++ /* don't free previously allocated path ++ * -- caller should take care */ ++ path = NULL; + goto err; ++ } + + i = depth = EXT_DEPTH(tree); + EXT_ASSERT(eh->eh_max); @@ -507,8 +511,10 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + +err: + printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); -+ ext3_ext_drop_refs(path); -+ kfree(path); ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } + return ERR_PTR(-EIO); +} + @@ -2634,7 +2640,7 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h =================================================================== --- 
linux-2.6.5-sles9.orig/include/linux/ext3_extents.h 2005-02-17 22:07:57.023609040 +0300 +++ linux-2.6.5-sles9/include/linux/ext3_extents.h 2005-02-23 01:02:37.416432600 +0300 -@@ -0,0 +1,264 @@ +@@ -0,0 +1,262 @@ +/* + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -2732,7 +2738,7 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + __u16 eh_entries; /* number of valid entries */ + __u16 eh_max; /* capacity of store in entries */ + __u16 eh_depth; /* has tree real underlaying blocks? */ -+ __u32 eh_generation; /* generation of the tree */ ++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ +}; + +#define EXT3_EXT_MAGIC 0xf30a @@ -2833,15 +2839,13 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) ++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) ++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ + -+#define EXT_ROOT_HDR(tree) \ -+ ((struct ext3_extent_header *) (tree)->root) -+#define EXT_BLOCK_HDR(bh) \ -+ ((struct ext3_extent_header *) (bh)->b_data) -+#define EXT_DEPTH(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) -+#define EXT_GENERATION(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) ++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) ++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) + + +#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch index 56fe653..bd95c54 100644 --- a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch +++ 
b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch @@ -2,7 +2,7 @@ Index: linux-stage/fs/ext3/extents.c =================================================================== --- linux-stage.orig/fs/ext3/extents.c 2005-02-25 15:33:48.890198160 +0200 +++ linux-stage/fs/ext3/extents.c 2005-02-25 15:33:48.917194056 +0200 -@@ -0,0 +1,2347 @@ +@@ -0,0 +1,2353 @@ +/* + * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -176,9 +176,9 @@ Index: linux-stage/fs/ext3/extents.c + +static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) +{ -+ struct ext3_extent_header *neh; -+ neh = EXT_ROOT_HDR(tree); -+ neh->eh_generation++; ++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) | ++ (EXT_GENERATION(neh) + 1); +} + +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) @@ -448,8 +448,12 @@ Index: linux-stage/fs/ext3/extents.c + + eh = EXT_ROOT_HDR(tree); + EXT_ASSERT(eh); -+ if (ext3_ext_check_header(eh)) ++ if (ext3_ext_check_header(eh)) { ++ /* don't free previously allocated path ++ * -- caller should take care */ ++ path = NULL; + goto err; ++ } + + i = depth = EXT_DEPTH(tree); + EXT_ASSERT(eh->eh_max); @@ -506,8 +510,10 @@ Index: linux-stage/fs/ext3/extents.c + +err: + printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); -+ ext3_ext_drop_refs(path); -+ kfree(path); ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } + return ERR_PTR(-EIO); +} + @@ -2629,7 +2635,7 @@ Index: linux-stage/include/linux/ext3_extents.h =================================================================== --- linux-stage.orig/include/linux/ext3_extents.h 2005-02-25 15:33:48.891198008 +0200 +++ linux-stage/include/linux/ext3_extents.h 2005-02-25 15:33:48.944189952 +0200 -@@ -0,0 +1,264 @@ +@@ -0,0 +1,262 @@ +/* + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ 
-2727,7 +2733,7 @@ Index: linux-stage/include/linux/ext3_extents.h + __u16 eh_entries; /* number of valid entries */ + __u16 eh_max; /* capacity of store in entries */ + __u16 eh_depth; /* has tree real underlaying blocks? */ -+ __u32 eh_generation; /* generation of the tree */ ++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ +}; + +#define EXT3_EXT_MAGIC 0xf30a @@ -2828,15 +2834,13 @@ Index: linux-stage/include/linux/ext3_extents.h + (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) ++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) ++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ + -+#define EXT_ROOT_HDR(tree) \ -+ ((struct ext3_extent_header *) (tree)->root) -+#define EXT_BLOCK_HDR(bh) \ -+ ((struct ext3_extent_header *) (bh)->b_data) -+#define EXT_DEPTH(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) -+#define EXT_GENERATION(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) ++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) ++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) + + +#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch index 1d8a4af..2a64875 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch @@ -2570,7 +2570,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + int freed; + + sb = inode->i_sb; -+ if (!test_opt(sb, MBALLOC)) ++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info) + ext3_free_blocks_old(handle, inode, block, count); + else { + 
ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch index 0c2f445..70f4f8a 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch @@ -2565,7 +2565,7 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + int freed; + + sb = inode->i_sb; -+ if (!test_opt(sb, MBALLOC)) ++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info) + ext3_free_blocks_sb(handle, sb, block, count, &freed); + else + ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch index 5ff3d3b..01e7387 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch @@ -2584,7 +2584,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + int freed; + + sb = inode->i_sb; -+ if (!test_opt(sb, MBALLOC)) ++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info) + ext3_free_blocks_sb(handle, sb, block, count, &freed); + else + ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); diff --git a/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.7.patch b/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.7.patch index bb9fc1b..0d360fa 100644 --- a/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.7.patch +++ b/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.7.patch @@ -26,7 +26,7 @@ Index: linux-2.6.7/fs/ext3/namei.c int err; - if (dir->i_nlink >= EXT3_LINK_MAX) -+ if (EXT3_DIR_LINK_MAXED(dir)) ++ if (EXT3_DIR_LINK_MAX(dir)) return -EMLINK; handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + @@ -86,7 +86,7 @@ Index: linux-2.6.7/fs/ext3/namei.c int err; - if (inode->i_nlink >= EXT3_LINK_MAX) -+ if (EXT3_DIR_LINK_MAXED(inode)) ++ if (EXT3_DIR_LINK_MAX(inode)) 
return -EMLINK; handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + @@ -97,7 +97,7 @@ Index: linux-2.6.7/fs/ext3/namei.c - if (!new_inode && new_dir!=old_dir && - new_dir->i_nlink >= EXT3_LINK_MAX) + if (!new_inode && new_dir != old_dir && -+ EXT3_DIR_LINK_MAXED(new_dir)) ++ EXT3_DIR_LINK_MAX(new_dir)) goto end_rename; } if (!new_bh) { @@ -140,24 +140,3 @@ Index: linux-2.6.7/include/linux/ext3_fs.h /* * Macro-instructions used to manage several block sizes -@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 { - */ - - #ifdef CONFIG_EXT3_INDEX -- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -- EXT3_FEATURE_COMPAT_DIR_INDEX) && \ -+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ - (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) --#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) --#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) -+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \ -+ (is_dx(dir) && (dir)->i_nlink == 1)) - #else - #define is_dx(dir) 0 --#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) - #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) - #endif - diff --git a/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch b/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch index 62bf156..37cca81 100644 --- a/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch +++ b/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch @@ -20,16 +20,16 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c } static int ext3_add_nondir(handle_t *handle, -@@ -1706,7 +1712,7 @@ +@@ -1706,7 +1712,7 @@ static int ext3_add_nondir(handle_t struct ext3_dir_entry_2 * de; int err, retries = 0; - if (dir->i_nlink >= EXT3_LINK_MAX) -+ if (EXT3_DIR_LINK_MAXED(dir)) ++ if 
(EXT3_DIR_LINK_MAX(dir)) return -EMLINK; retry: -@@ -1729,7 +1735,7 @@ +@@ -1729,7 +1735,7 @@ static int ext3_mkdir(struct inode inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; dir_block = ext3_bread (handle, inode, 0, 1, &err); if (!dir_block) { @@ -38,7 +38,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c ext3_mark_inode_dirty(handle, inode); iput (inode); goto out_stop; -@@ -1761,7 +1767,7 @@ +@@ -1761,7 +1767,7 @@ static int ext3_mkdir(struct inode iput (inode); goto out_stop; } @@ -47,7 +47,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c ext3_update_dx_flag(dir); ext3_mark_inode_dirty(handle, dir); d_instantiate(dentry, inode); -@@ -2026,10 +2032,10 @@ +@@ -2026,10 +2032,10 @@ static int ext3_rmdir (struct inode retval = ext3_delete_entry(handle, dir, de, bh); if (retval) goto end_rmdir; @@ -62,7 +62,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c inode->i_version++; inode->i_nlink = 0; /* There's no need to set i_disksize: the fact that i_nlink is -@@ -2039,7 +2045,7 @@ +@@ -2039,7 +2045,7 @@ static int ext3_rmdir (struct inode ext3_orphan_add(handle, inode); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; ext3_mark_inode_dirty(handle, inode); @@ -71,7 +71,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c ext3_update_dx_flag(dir); ext3_mark_inode_dirty(handle, dir); -@@ -2090,7 +2096,7 @@ +@@ -2090,7 +2096,7 @@ static int ext3_unlink(struct inode dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; ext3_update_dx_flag(dir); ext3_mark_inode_dirty(handle, dir); @@ -80,27 +80,27 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c if (!inode->i_nlink) ext3_orphan_add(handle, inode); inode->i_ctime = dir->i_ctime; -@@ -2165,7 +2171,7 @@ +@@ -2165,7 +2171,7 @@ static int ext3_link (struct dentry struct inode *inode = old_dentry->d_inode; int err, retries = 0; - if (inode->i_nlink >= EXT3_LINK_MAX) -+ if (EXT3_DIR_LINK_MAXED(inode)) ++ if (EXT3_DIR_LINK_MAX(inode)) return -EMLINK; retry: -@@ 
-2252,8 +2258,8 @@ +@@ -2252,8 +2258,8 @@ static int ext3_rename (struct inode if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) goto end_rename; retval = -EMLINK; - if (!new_inode && new_dir!=old_dir && - new_dir->i_nlink >= EXT3_LINK_MAX) + if (!new_inode && new_dir != old_dir && -+ EXT3_DIR_LINK_MAXED(new_dir)) ++ EXT3_DIR_LINK_MAX(new_dir)) goto end_rename; } if (!new_bh) { -@@ -2310,7 +2316,7 @@ +@@ -2310,7 +2316,7 @@ static int ext3_rename (struct inode } if (new_inode) { @@ -109,7 +109,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c new_inode->i_ctime = CURRENT_TIME_SEC; } old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; -@@ -2321,11 +2327,13 @@ +@@ -2321,11 +2327,13 @@ static int ext3_rename (struct inode PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); ext3_journal_dirty_metadata(handle, dir_bh); @@ -140,24 +140,3 @@ Index: linux-2.6.7/include/linux/ext3_fs.h /* * Macro-instructions used to manage several block sizes -@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 { - */ - - #ifdef CONFIG_EXT3_INDEX -- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -- EXT3_FEATURE_COMPAT_DIR_INDEX) && \ -+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ - (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) --#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) --#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) -+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \ -+ (is_dx(dir) && (dir)->i_nlink == 1)) - #else - #define is_dx(dir) 0 --#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) - #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) - #endif - diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 
3c37955..727f180 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -41,20 +41,152 @@ Details : When doing a write from a liblustre client, the client with RPCs. In all cases it would slow down the write because these RPCs are unnecessary. +Severity : enhancement +Bugzilla : 9340 +Description: allow number of MDS service threads to be changed at module load +Details : It is now possible to change the number of MDS service threads + running. Adding "options mds mds_num_threads=N" will set the + number of threads for the next time Lustre is restarted (assuming + the "mds" module is also reloaded at that time). The default + number of threads will stay the same, 32 for most systems. + +Severity : major +Frequency : rare +Bugzilla : 10300 +Description: OST crash if filesystem is unformatted or corrupt +Details : If an OST is started on a device that has never been formatted + or if the filesystem is corrupt and cannot even mount then the + error handling cleanup routines would dereference a NULL pointer. + +Severity : medium +Frequency : rare +Bugzilla : 10047 +Description: NULL pointer deref in llap_from_page. +Details : get_cache_page_nowait can return a page with NULL (or otherwise + incorrect) mapping if the page was truncated/reclaimed while it was + searched for. Check for this condition and skip such pages when + doing readahead. Introduce extra check to llap_from_page() to + verify page->mapping->host is non-NULL (so page is not anonymous). + +Severity : minor +Frequency : Sometimes when using sys_sendfile +Bugzilla : 7020 +Description: "page not covered by a lock" warnings from ll_readpage +Details : sendfile called ll_readpage without right page locks present. + Now we introduced ll_file_sendfile that does necessary locking + around call to generic_file_sendfile() much like we do in + ll_file_read(). 
+ +Severity : medium +Frequency : with certain MDS communication failures at client mount time +Bugzilla : 10268 +Description: NULL pointer deref after failed client mount +Details : a client connection request may be delayed by the network layer + and not be sent until after the PTLRPC layer has timed out the + request. If the client fails the mount immediately it will try + to clean up before the network times out the request. Add a + reference from the request import to the obd device and delay + the cleanup until the network drops the request. + +Severity : medium +Frequency : occasionally during client (re)connect +Bugzilla : 9387 +Description: assertion failure during client (re)connect +Details : processing a client connection request may be delayed by the + client or server longer than the client connect timeout. This + causes the client to resend the connection request. If the + original connection request is replied in this interval, the + client may trip an assertion failure in ptlrpc_connect_interpret() + which thought it would be the only running connect process. + +Severity : medium +Frequency : only with obd_echo servers and clients that are rebooted +Bugzilla : 10140 +Description: kernel BUG accessing uninitialized data structure +Details : When running an obd_echo server it did not start the ping_evictor + thread, and when a client was evicted an uninitialized data + structure was accessed. Start the ping_evictor in the RPC + service startup instead of the OBD startup. + +Severity : enhancement +Bugzilla : 10393 (patchless) +Description: Remove dependency on various unexported kernel interfaces. +Details : No longer need reparent_to_init, exit_mm, exit_files, + sock_getsockopt, filemap_populate, FMODE_EXEC, put_filp.
+ +Severity : minor +Frequency : rare (only users of deprecated and unsupported LDAP config) +Bugzilla : 9337 +Description: write_conf for zeroconf mount queried LDAP incorrectly for client +Details : LDAP apparently contains 'lustreName' attributes instead of + 'name'. A simple remapping of the name is sufficient. + +Severity : major +Frequency : rare (only with non-default dump_on_timeout debug enabled) +Bugzilla : 10397 +Description: waiting_locks_callback trips kernel BUG if client is evicted +Details : Running with the dump_on_timeout debug flag turned on makes + it possible that the waiting_locks_callback() can try to dump + the Lustre kernel debug logs from an interrupt handler. Defer + this log dumping to the expired_lock_main() thread. + +Severity : enhancement +Bugzilla : 10420 +Description: Support NFS exporting on 2.6 kernels. +Details : Implement non-rawops metadata methods for NFS server to use without + changing NFS server code. + +Severity : medium +Frequency : very rare (synthetic metadata workload only) +Bugzilla : 9974 +Description: two racing renames might cause an MDS thread to deadlock +Details : Running the "racer" program may cause one MDS thread to rename + a file from being the source of a rename to being the target of + a rename at exactly the same time that another thread is doing + so, and the second thread has already enqueued these locks after + doing a lookup of the target and is trying to relock them in + order. Ensure that we don't try to re-lock the same resource. + +Severity : major +Frequency : only very large systems with liblustre clients +Bugzilla : 7304 +Description: slow eviction of liblustre clients with the "evict_by_nid" RPC +Details : Use asynchronous set_info RPCs to send the "evict_by_nid" to + all OSTs in parallel. This allows the eviction of stale liblustre + clients to proceed much faster than if they were done in series, + and also offers similar improvements for other set_info RPCs. 
+ +Severity : minor +Bugzilla : 10265 +Description: excessive CPU usage during initial read phase on client +Details : During the initial read phase on a client, it would aggressively + retry readahead on the file, consuming too much CPU and impacting + performance (since 1.4.5.8). Improve the readahead algorithm + to avoid this, and also improve some other common cases (read + of small files in particular, where "small" is files smaller than + /proc/fs/lustre/llite/*/max_read_ahead_whole_mb, 2MB by default). + +Severity : minor +Bugzilla : 10450 +Description: MDS crash when receiving packet with unknown intent. +Details : Do not LBUG in unknown intent case, just return -EFAULT + + ------------------------------------------------------------------------------ 02-14-2006 Cluster File Systems, Inc. * version 1.4.6 * WIRE PROTOCOL CHANGE. This version of Lustre networking WILL NOT - INTEROPERATE with older versions automatically. Please read the + INTEROPERATE with older versions automatically. Please read the user documentation before upgrading any part of a live system. * WARNING: Lustre networking configuration changes are required with this release. See https://bugzilla.clusterfs.com/show_bug.cgi?id=10052 for details. * bug fixes - * Support for newer kernels: 2.6.9-22.0.2.EL (RHEL 4), - 2.6.5-7.244 (SLES 9) - same as 1.4.5.2. - 2.6.12.6 vanilla (kernel.org) + * Support for newer kernels: + 2.6.9-22.0.2.EL (RHEL 4), + 2.6.5-7.244 (SLES 9) - same as 1.4.5.2. + 2.6.12.6 vanilla (kernel.org) Severity : enhancement @@ -68,6 +200,17 @@ Details : LNET is new networking infrastructure for Lustre, it includes created for this new infrastructure. Severity : enhancement +Description: Introduced Access control lists +Details : clients can set ACLs on files and directories in order to have + more fine-grained permissions than the standard Unix UGO+RWX. + The MDS must be started with the "-o acl" mount option.
+ +Severity : enhancement +Description: Introduced filesystem quotas +Details : Administrators may now establish per-user quotas on the + filesystem. + +Severity : enhancement Bugzilla : 7982 Description: Configuration change for the XT3 The PTLLND is now used to run Lustre over Portals on the XT3 diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 index 1217643..638763a 100644 --- a/lustre/autoconf/lustre-core.m4 +++ b/lustre/autoconf/lustre-core.m4 @@ -443,6 +443,49 @@ LB_LINUX_TRY_COMPILE([ ]) ]) +AC_DEFUN([LC_STRUCT_FILE_OPS_UNLOCKED_IOCTL], +[AC_MSG_CHECKING([if struct file_operations has an unlocked_ioctl field]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + struct file_operations fops; + &fops.unlocked_ioctl; +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_UNLOCKED_IOCTL, 1, [struct file_operations has an unlock ed_ioctl field]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +AC_DEFUN([LC_FILEMAP_POPULATE], +[AC_MSG_CHECKING([for exported filemap_populate]) +LB_LINUX_TRY_COMPILE([ + #include + #include +],[ + filemap_populate(NULL, 0, 0, __pgprot(0), 0, 0); +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_FILEMAP_POPULATE, 1, [Kernel exports filemap_populate]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +AC_DEFUN([LC_D_ADD_UNIQUE], +[AC_MSG_CHECKING([for d_add_unique]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + d_add_unique(NULL, NULL); +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_D_ADD_UNIQUE, 1, [Kernel has d_add_unique]) +],[ + AC_MSG_RESULT([no]) +]) +]) # # LC_PROG_LINUX @@ -469,6 +512,9 @@ LC_FUNC_DEV_SET_RDONLY LC_FUNC_FILEMAP_FDATAWRITE LC_STRUCT_STATFS LC_FUNC_PAGE_MAPPED +LC_STRUCT_FILE_OPS_UNLOCKED_IOCTL +LC_FILEMAP_POPULATE +LC_D_ADD_UNIQUE ]) # diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h index b28eb28..d35d750 100644 --- a/lustre/include/liblustre.h +++ b/lustre/include/liblustre.h @@ -99,6 +99,12 @@ typedef unsigned short umode_t; #define KERNEL_VERSION(a,b,c) ((a)*100+(b)*10+c) #define LINUX_VERSION_CODE 
KERNEL_VERSION(2,5,0) +#ifndef page_private +#define page_private(page) ((page)->private) +#define set_page_private(page, v) ((page)->private = (v)) +#endif + + static inline void inter_module_put(void *a) { return; @@ -472,6 +478,7 @@ struct iattr { time_t ia_ctime; unsigned int ia_attr_flags; }; +#define ll_iattr_struct iattr #define IT_OPEN 0x0001 #define IT_CREAT 0x0002 diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h index 51b8389..066cc20 100644 --- a/lustre/include/linux/lustre_compat25.h +++ b/lustre/include/linux/lustre_compat25.h @@ -31,6 +31,27 @@ #include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14) +struct ll_iattr_struct { + struct iattr iattr; + unsigned int ia_attr_flags; +}; +#else +#define ll_iattr_struct iattr +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) +#define UNLOCK_INODE_MUTEX(inode) do {mutex_unlock(&(inode)->i_mutex); } while(0) +#define LOCK_INODE_MUTEX(inode) do {mutex_lock(&(inode)->i_mutex); } while(0) +#define TRYLOCK_INODE_MUTEX(inode) mutex_trylock(&(inode)->i_mutex) +#define d_child d_u.d_child +#define d_rcu d_u.d_rcu +#else +#define UNLOCK_INODE_MUTEX(inode) do {up(&(inode)->i_sem); } while(0) +#define LOCK_INODE_MUTEX(inode) do {down(&(inode)->i_sem); } while(0) +#define TRYLOCK_INODE_MUTEX(inode) (!down_trylock(&(inode)->i_sem)) +#endif + #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4) #define NGROUPS_SMALL NGROUPS #define NGROUPS_PER_BLOCK ((int)(EXEC_PAGESIZE / sizeof(gid_t))) @@ -54,6 +75,15 @@ void groups_free(struct group_info *ginfo); #endif +#ifndef page_private +#define page_private(page) ((page)->private) +#define set_page_private(page, v) ((page)->private = (v)) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,15) +#define gfp_t int +#endif + #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) #define lock_dentry(___dentry) spin_lock(&(___dentry)->d_lock) @@ -103,17 +133,6 @@ void groups_free(struct group_info *ginfo); #include -static inline void 
lustre_daemonize_helper(void) -{ - LASSERT(current->signal != NULL); - current->signal->session = 1; - if (current->group_leader) - current->group_leader->signal->pgrp = 1; - else - CERROR("we aren't group leader\n"); - current->signal->tty = NULL; -} - static inline int cleanup_group_info(void) { struct group_info *ginfo; @@ -132,12 +151,12 @@ static inline int cleanup_group_info(void) do { \ page_cache_get(page); \ SetPagePrivate(page); \ - page->private = (unsigned long)llap; \ + set_page_private(page, (unsigned long)llap); \ } while (0) #define __clear_page_ll_data(page) \ do { \ ClearPagePrivate(page); \ - page->private = 0; \ + set_page_private(page, 0); \ page_cache_release(page); \ } while(0) @@ -171,6 +190,7 @@ static inline int cleanup_group_info(void) #define ILOOKUP(sb, ino, test, data) ilookup4(sb, ino, test, data); #define DCACHE_DISCONNECTED DCACHE_NFSD_DISCONNECTED #define ll_dev_t int +#define old_encode_dev(dev) (dev) /* 2.5 uses hlists for some things, like the d_hash. we'll treat them * as 2.5 and let macros drop back.. 
*/ @@ -248,15 +268,7 @@ static inline void ll_redirty_page(struct page *page) static inline void __d_drop(struct dentry *dentry) { - list_del(&dentry->d_hash); - INIT_LIST_HEAD(&dentry->d_hash); -} - -static inline void lustre_daemonize_helper(void) -{ - current->session = 1; - current->pgrp = 1; - current->tty = NULL; + list_del_init(&dentry->d_hash); } static inline int cleanup_group_info(void) @@ -282,8 +294,8 @@ static inline void cond_resched(void) #define PDE(ii) ((ii)->u.generic_ip) #endif -#define __set_page_ll_data(page, llap) page->private = (unsigned long)llap -#define __clear_page_ll_data(page) page->private = 0 +#define __set_page_ll_data(page, llap) set_page_private(page, (unsigned long)llap) +#define __clear_page_ll_data(page) set_page_private(page, 0) #define PageWriteback(page) 0 #define set_page_writeback(page) do {} while (0) #define end_page_writeback(page) do {} while (0) @@ -314,13 +326,20 @@ static inline int page_mapped(struct page *page) } #endif /* !HAVE_PAGE_MAPPED */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)) +static inline void touch_atime(struct vfsmount *mnt, struct dentry *dentry) +{ + update_atime(dentry->d_inode); +} +#endif + static inline void file_accessed(struct file *file) { #ifdef O_NOATIME if (file->f_flags & O_NOATIME) return; #endif - update_atime(file->f_dentry->d_inode); + touch_atime(file->f_vfsmnt, file->f_dentry); } #endif /* end of 2.4 compat macros */ diff --git a/lustre/include/linux/lustre_debug.h b/lustre/include/linux/lustre_debug.h index 7081b37..db872a9 100644 --- a/lustre/include/linux/lustre_debug.h +++ b/lustre/include/linux/lustre_debug.h @@ -31,11 +31,11 @@ #define LL_CDEBUG_PAGE(mask, page, fmt, arg...) \ CDEBUG(mask, "page %p map %p index %lu flags %lx count %u priv %0lx: "\ fmt, page, page->mapping, page->index, (long)page->flags, \ - page_count(page), page->private, ## arg) + page_count(page), page_private(page), ## arg) #else #define LL_CDEBUG_PAGE(mask, page, fmt, arg...) 
\ CDEBUG(mask, "page %p index %lu priv %0lx: "\ - fmt, page, page->index, page->private, ## arg) + fmt, page, page->index, page_private(page), ## arg) #endif #endif diff --git a/lustre/include/linux/lustre_fsfilt.h b/lustre/include/linux/lustre_fsfilt.h index 5358084..7bc0602 100644 --- a/lustre/include/linux/lustre_fsfilt.h +++ b/lustre/include/linux/lustre_fsfilt.h @@ -148,6 +148,8 @@ static inline __u8 *fsfilt_uuid(struct obd_device *obd, struct super_block *sb) do { \ if (time_before(jiffies, start + 15 * HZ)) \ break; \ + else if (time_before(jiffies, start + 30 * HZ)) \ + CDEBUG(D_VFSTRACE,"slow %s %lus\n", msg,(jiffies-start)/HZ);\ else if (time_before(jiffies, start + timeout / 2 * HZ)) \ CWARN("slow %s %lus\n", msg, (jiffies - start) / HZ); \ else \ diff --git a/lustre/include/linux/lvfs.h b/lustre/include/linux/lvfs.h index 0316cf7..816925a 100644 --- a/lustre/include/linux/lvfs.h +++ b/lustre/include/linux/lvfs.h @@ -107,9 +107,9 @@ static inline struct dentry *ll_lookup_one_len(const char *fid_name, { struct dentry *dchild; - down(&dparent->d_inode->i_sem); + LOCK_INODE_MUTEX(dparent->d_inode); dchild = lookup_one_len(fid_name, dparent, fid_namelen); - up(&dparent->d_inode->i_sem); + UNLOCK_INODE_MUTEX(dparent->d_inode); if (IS_ERR(dchild) || dchild->d_inode == NULL) return dchild; diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index cc1ed9a..438402c 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -224,11 +224,13 @@ static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags) #define OBD_CONNECT_TRANSNO 0x800ULL /* replay is sending initial transno */ #define OBD_CONNECT_IBITS 0x1000ULL /* support for inodebits locks */ #define OBD_CONNECT_JOIN 0x2000ULL /* files can be concatenated */ +#define OBD_CONNECT_NODEVOH 0x8000ULL /* No open handle for special nodes */ /* also update obd_connect_names[] for lprocfs_rd_connect_flags() */ #define 
MDS_CONNECT_SUPPORTED (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \ OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \ - OBD_CONNECT_IBITS | OBD_CONNECT_JOIN) + OBD_CONNECT_IBITS | OBD_CONNECT_JOIN | \ + OBD_CONNECT_NODEVOH) #define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \ OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX) @@ -713,9 +715,7 @@ extern void lustre_swab_mds_rec_setattr (struct mds_rec_setattr *sa); #define FMODE_READ 00000001 #define FMODE_WRITE 00000002 #endif -#ifndef FMODE_EXEC -#define FMODE_EXEC 00000004 -#endif +#define MDS_FMODE_EXEC 00000004 #define MDS_OPEN_CREAT 00000100 #define MDS_OPEN_EXCL 00000200 #define MDS_OPEN_TRUNC 00001000 diff --git a/lustre/include/lustre/lustre_user.h b/lustre/include/lustre/lustre_user.h index 85a0268..1d226ea 100644 --- a/lustre/include/lustre/lustre_user.h +++ b/lustre/include/lustre/lustre_user.h @@ -31,6 +31,8 @@ #define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long) #endif +struct obd_statfs; + #define LL_IOC_GETFLAGS _IOR ('f', 151, long) #define LL_IOC_SETFLAGS _IOW ('f', 152, long) #define LL_IOC_CLRFLAGS _IOW ('f', 153, long) @@ -50,6 +52,7 @@ #define LL_STATFS_LOV 2 #define IOC_MDC_TYPE 'i' +#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) #define IOC_MDC_GETSTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_mds_md *) #define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_mds_data *) diff --git a/lustre/include/lustre_ha.h b/lustre/include/lustre_ha.h index 5083b94..8377728 100644 --- a/lustre/include/lustre_ha.h +++ b/lustre/include/lustre_ha.h @@ -22,6 +22,6 @@ int ptlrpc_set_import_active(struct obd_import *imp, int active); void ptlrpc_activate_import(struct obd_import *imp); void ptlrpc_deactivate_import(struct obd_import *imp); void ptlrpc_invalidate_import(struct obd_import *imp); -void ptlrpc_fail_import(struct obd_import *imp, int generation); +void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt); #endif diff --git 
a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h index 16521d8..d172dec 100644 --- a/lustre/include/lustre_import.h +++ b/lustre/include/lustre_import.h @@ -65,15 +65,16 @@ struct obd_import { struct obd_device *imp_obd; cfs_waitq_t imp_recovery_waitq; - __u64 imp_last_replay_transno; + atomic_t imp_inflight; atomic_t imp_replay_inflight; enum lustre_imp_state imp_state; int imp_generation; __u32 imp_conn_cnt; - __u64 imp_max_transno; + int imp_last_generation_checked; + __u64 imp_last_replay_transno; __u64 imp_peer_committed_transno; - struct obd_uuid imp_target_uuid; /* XXX -> lustre_name */ + __u64 imp_last_transno_checked; struct lustre_handle imp_remote_handle; cfs_time_t imp_next_ping; /* jiffies */ @@ -93,6 +94,8 @@ struct obd_import { __u32 imp_connect_op; struct obd_connect_data imp_connect_data; __u64 imp_connect_flags_orig; + + struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */ }; typedef void (*obd_import_callback)(struct obd_import *imp, void *closure, diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index 7a9292e..857c29d 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -85,14 +85,14 @@ */ #define LDLM_NUM_THREADS min((int)(smp_num_cpus * smp_num_cpus * 8), 64) -#define LDLM_NBUFS 64 +#define LDLM_NBUFS (64 * smp_num_cpus) #define LDLM_BUFSIZE (8 * 1024) #define LDLM_MAXREQSIZE (5 * 1024) #define LDLM_MAXREPSIZE (1024) -#define MDT_MAX_THREADS 32UL -#define MDT_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \ - MDT_MAX_THREADS), 2UL) +#define MDS_MAX_THREADS 512UL +#define MDS_DEF_THREADS max(2UL, min_t(unsigned long, 32, \ + num_physpages * smp_num_cpus >> (26 - PAGE_SHIFT))) #define MDS_NBUFS (64 * smp_num_cpus) #define MDS_BUFSIZE (8 * 1024) /* Assume file name length = FNAME_MAX = 256 (true for ext3). @@ -398,7 +398,7 @@ CDEB_TYPE(level, "@@@ " fmt \ REQ_FLAGS_FMT"/%x/%x rc %d/%d\n" , ## args, req, req->rq_xid, \ req->rq_transno, \ req->rq_reqmsg ? 
req->rq_reqmsg->opc : -1, \ - req->rq_import ? (char *)req->rq_import->imp_target_uuid.uuid : "", \ + req->rq_import ? obd2cli_tgt(req->rq_import->imp_obd) : "", \ req->rq_import ? \ (char *)req->rq_import->imp_connection->c_remote_uuid.uuid : "", \ (req->rq_import && req->rq_import->imp_client) ? \ @@ -707,7 +707,7 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc, char *name, int id); int ptlrpc_unregister_service(struct ptlrpc_service *service); int liblustre_check_services (void *arg); -void ptlrpc_daemonize(void); +void ptlrpc_daemonize(char *name); int ptlrpc_service_health_check(struct ptlrpc_service *); @@ -774,6 +774,13 @@ int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid); /* ptlrpc/pinger.c */ int ptlrpc_pinger_add_import(struct obd_import *imp); int ptlrpc_pinger_del_import(struct obd_import *imp); +#ifdef __KERNEL__ +void ping_evictor_start(void); +void ping_evictor_stop(void); +#else +#define ping_evictor_start() do {} while (0) +#define ping_evictor_stop() do {} while (0) +#endif /* ptlrpc/ptlrpcd.c */ void ptlrpcd_wake(struct ptlrpc_request *req); diff --git a/lustre/include/obd.h b/lustre/include/obd.h index cbbc10d..1f03420 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -22,8 +22,8 @@ #define IOC_MDC_TYPE 'i' #define IOC_MDC_MIN_NR 20 -#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) /* Moved to lustre_user.h +#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) #define IOC_MDC_GETSTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_mds_md *) */ #define IOC_MDC_MAX_NR 50 @@ -157,7 +157,7 @@ struct brw_page { enum async_flags { ASYNC_READY = 0x1, /* ap_make_ready will not be called before this page is added to an rpc */ - ASYNC_URGENT = 0x2, + ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */ ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called to give the caller a chance to update or cancel the size of the io */ @@ -305,9 
+305,11 @@ struct mds_server_data; #define OSC_MAX_DIRTY_MB_MAX 2048 /* totally arbitrary */ struct mdc_rpc_lock; +struct obd_import; struct client_obd { - struct obd_import *cl_import; struct semaphore cl_sem; + struct obd_uuid cl_target_uuid; + struct obd_import *cl_import; /* ptlrpc connection state */ int cl_conn_count; /* max_mds_easize is purely a performance thing so we don't have to * call obd_size_diskmd() all the time. */ @@ -374,8 +376,8 @@ struct client_obd { /* used by quotacheck */ int cl_qchk_stat; /* quotacheck stat of the peer */ - struct ptlrpc_request_pool *cl_rq_pool; /* emergency pool of requests */ }; +#define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid) #define CL_NOT_QUOTACHECKED 1 /* client->cl_qchk_stat init value */ @@ -678,17 +680,19 @@ struct obd_device { #define OBD_LLOG_FL_SENDNOW 0x0001 +enum obd_cleanup_stage { /* Special case hack for MDS LOVs */ -#define OBD_CLEANUP_EARLY 0 + OBD_CLEANUP_EARLY, /* Precleanup stage 1, we must make sure all exports (other than the self-export) get destroyed. */ -#define OBD_CLEANUP_EXPORTS 1 + OBD_CLEANUP_EXPORTS, /* Precleanup stage 2, do other type-specific cleanup requiring the self-export. */ -#define OBD_CLEANUP_SELF_EXP 2 + OBD_CLEANUP_SELF_EXP, /* FIXME we should eliminate the "precleanup" function and make them stages of the "cleanup" function. 
*/ -#define OBD_CLEANUP_OBD 3 + OBD_CLEANUP_OBD, +}; struct obd_ops { struct module *o_owner; @@ -696,12 +700,14 @@ struct obd_ops { void *karg, void *uarg); int (*o_get_info)(struct obd_export *, __u32 keylen, void *key, __u32 *vallen, void *val); - int (*o_set_info)(struct obd_export *, __u32 keylen, void *key, - __u32 vallen, void *val); + int (*o_set_info_async)(struct obd_export *, __u32 keylen, void *key, + __u32 vallen, void *val, + struct ptlrpc_request_set *set); int (*o_attach)(struct obd_device *dev, obd_count len, void *data); int (*o_detach)(struct obd_device *dev); int (*o_setup) (struct obd_device *dev, obd_count len, void *data); - int (*o_precleanup)(struct obd_device *dev, int cleanup_stage); + int (*o_precleanup)(struct obd_device *dev, + enum obd_cleanup_stage cleanup_stage); int (*o_cleanup)(struct obd_device *dev); int (*o_process_config)(struct obd_device *dev, obd_count len, void *data); diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h index a8a9f75..78ec204 100644 --- a/lustre/include/obd_class.h +++ b/lustre/include/obd_class.h @@ -77,15 +77,6 @@ void oig_complete_one(struct obd_io_group *oig, struct oig_callback_context *occ, int rc); void oig_release(struct obd_io_group *oig); int oig_wait(struct obd_io_group *oig); -/* ping evictor */ -#ifdef __KERNEL__ -void ping_evictor_start(void); -void ping_evictor_stop(void); -#else -#define ping_evictor_start() do {} while (0) -#define ping_evictor_stop() do {} while (0) -#endif - char *obd_export_nid2str(struct obd_export *exp); @@ -98,6 +89,7 @@ int class_attach(struct lustre_cfg *lcfg); int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg); int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg); int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg); +struct obd_device *class_incref(struct obd_device *obd); void class_decref(struct obd_device *obd); /* Passed as data param to class_config_parse_llog */ @@ -142,11 +134,10 @@ void 
__class_export_put(struct obd_export *); struct obd_export *class_new_export(struct obd_device *obddev, struct obd_uuid *cluuid); void class_unlink_export(struct obd_export *exp); -void class_update_export_timer(struct obd_export *exp, time_t extra_delay); struct obd_import *class_import_get(struct obd_import *); void class_import_put(struct obd_import *); -struct obd_import *class_new_import(void); +struct obd_import *class_new_import(struct obd_device *obd); void class_destroy_import(struct obd_import *exp); struct obd_type *class_get_type(char *name); @@ -260,16 +251,18 @@ static inline int obd_get_info(struct obd_export *exp, __u32 keylen, RETURN(rc); } -static inline int obd_set_info(struct obd_export *exp, obd_count keylen, - void *key, obd_count vallen, void *val) +static inline int obd_set_info_async(struct obd_export *exp, obd_count keylen, + void *key, obd_count vallen, void *val, + struct ptlrpc_request_set *set) { int rc; ENTRY; - EXP_CHECK_OP(exp, set_info); - OBD_COUNTER_INCREMENT(exp->exp_obd, set_info); + EXP_CHECK_OP(exp, set_info_async); + OBD_COUNTER_INCREMENT(exp->exp_obd, set_info_async); - rc = OBP(exp->exp_obd, set_info)(exp, keylen, key, vallen, val); + rc = OBP(exp->exp_obd, set_info_async)(exp, keylen, key, vallen, val, + set); RETURN(rc); } @@ -285,7 +278,8 @@ static inline int obd_setup(struct obd_device *obd, int datalen, void *data) RETURN(rc); } -static inline int obd_precleanup(struct obd_device *obd, int cleanup_stage) +static inline int obd_precleanup(struct obd_device *obd, + enum obd_cleanup_stage cleanup_stage) { int rc; ENTRY; @@ -1180,7 +1174,6 @@ static inline void obdo_free(struct obdo *oa) * // XXX do not look into _superhack with remaining eye * // XXX if this were any uglier, I'd get my own show on MTV */ extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); -extern void (*ptlrpc_abort_inflight_superhack)(struct obd_import *imp); /* sysctl.c */ extern void obd_sysctl_init (void); diff --git 
a/lustre/include/obd_support.h b/lustre/include/obd_support.h index cbf2c7a..2fa9852 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -36,7 +36,6 @@ extern unsigned int obd_timeout; /* seconds */ extern unsigned int ldlm_timeout; extern unsigned int obd_health_check_timeout; extern char obd_lustre_upcall[128]; -extern unsigned int obd_sync_filter; extern cfs_waitq_t obd_race_waitq; #define OBD_FAIL_MDS 0x100 @@ -151,6 +150,7 @@ extern cfs_waitq_t obd_race_waitq; #define OBD_FAIL_PTLRPC_BULK_GET_NET 0x503 #define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 #define OBD_FAIL_PTLRPC_DROP_RPC 0x505 +#define OBD_FAIL_PTLRPC_DELAY_SEND 0x506 #define OBD_FAIL_OBD_PING_NET 0x600 #define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64-smp.config index 5295a33..9b1c043 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64-smp.config @@ -26,9 +26,9 @@ CONFIG_KMOD=y # # Processor type and features # -CONFIG_MK8=y +# CONFIG_MK8 # CONFIG_IA32E is not set -# CONFIG_GENERIC_CPU is not set +CONFIG_GENERIC_CPU=y CONFIG_X86_L1_CACHE_BYTES=64 CONFIG_X86_L1_CACHE_SHIFT=6 CONFIG_X86_TSC=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64.config b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64.config index 527d397..ea03f03 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64.config @@ -26,9 +26,9 @@ CONFIG_KMOD=y # # Processor type and features # -CONFIG_MK8=y +# CONFIG_MK8 is not set # CONFIG_IA32E is not set -# CONFIG_GENERIC_CPU is not set +CONFIG_GENERIC_CPU=y CONFIG_X86_L1_CACHE_BYTES=64 CONFIG_X86_L1_CACHE_SHIFT=6 CONFIG_X86_TSC=y diff --git 
a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64-smp.config index f621ca1..aa67bfe 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64-smp.config @@ -438,6 +438,8 @@ CONFIG_SCSI_LOGGING=y CONFIG_SCSI_SPI_ATTRS=m CONFIG_SCSI_FC_ATTRS=m CONFIG_SCSI_ISCSI_ATTRS=m +CONFIG_SAS_CLASS=m +# CONFIG_SAS_DEBUG is not set # # SCSI low-level drivers @@ -452,6 +454,7 @@ CONFIG_AIC7XXX_RESET_DELAY_MS=15000 # CONFIG_AIC7XXX_DEBUG_ENABLE is not set CONFIG_AIC7XXX_DEBUG_MASK=0 # CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set +# CONFIG_SCSI_AIC94XX is not set CONFIG_SCSI_AIC7XXX_OLD=m CONFIG_SCSI_AIC79XX=m CONFIG_AIC79XX_CMDS_PER_DEVICE=4 @@ -463,6 +466,7 @@ CONFIG_AIC79XX_DEBUG_MASK=0 CONFIG_MEGARAID_NEWGEN=y CONFIG_MEGARAID_MM=m CONFIG_MEGARAID_MAILBOX=m +CONFIG_MEGARAID_SAS=m CONFIG_SCSI_SATA=y CONFIG_SCSI_SATA_AHCI=m CONFIG_SCSI_SATA_SVW=m @@ -539,10 +543,14 @@ CONFIG_DM_MULTIPATH_EMC=m # # Fusion MPT device support # -CONFIG_FUSION=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m CONFIG_FUSION_MAX_SGE=40 CONFIG_FUSION_CTL=m CONFIG_FUSION_LAN=m +CONFIG_FUSION_OLD_MODULE_COMPAT=m # # IEEE 1394 (FireWire) support @@ -965,9 +973,11 @@ CONFIG_NS83820=m # CONFIG_YELLOWFIN is not set CONFIG_R8169=m CONFIG_R8169_NAPI=y +CONFIG_SKY2=m CONFIG_SK98LIN=m CONFIG_VIA_VELOCITY=m CONFIG_TIGON3=m +CONFIG_BNX2=m # # Ethernet (10000 Mbit) @@ -1213,6 +1223,12 @@ CONFIG_ISDN_CAPI_CAPIDRV=m # Active AVM cards # CONFIG_CAPI_AVM=y +CONFIG_ISDN_DRV_AVMB1_B1PCI=m +CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y +CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m +CONFIG_ISDN_DRV_AVMB1_AVM_CS=m +CONFIG_ISDN_DRV_AVMB1_T1PCI=m +CONFIG_ISDN_DRV_AVMB1_C4=m # # Active Eicon DIVA Server cards @@ -1318,6 +1334,7 @@ CONFIG_SERIAL_8250_RSA=y # CONFIG_SERIAL_CORE=y CONFIG_SERIAL_CORE_CONSOLE=y +# 
CONFIG_SERIAL_JSM is not set CONFIG_UNIX98_PTYS=y # CONFIG_LEGACY_PTYS is not set CONFIG_CRASH=m @@ -1865,9 +1882,20 @@ CONFIG_USB_SPEEDTOUCH=m # CONFIG_USB_GADGET is not set # +# InfiniBand support +# +# CONFIG_INFINIBAND is not set + +# +# EDAC - error detection and reporting (RAS) +# +# CONFIG_EDAC is not set + +# # Firmware Drivers # CONFIG_EDD=m +CONFIG_DELL_RBU=m # # File systems @@ -1972,6 +2000,7 @@ CONFIG_NFSD_TCP=y CONFIG_LOCKD=m CONFIG_LOCKD_V4=y CONFIG_EXPORTFS=m +CONFIG_NFS_COMMON=y CONFIG_SUNRPC=m CONFIG_SUNRPC_GSS=m CONFIG_RPCSEC_GSS_KRB5=m diff --git a/lustre/kernel_patches/kernel_configs/uml-2.6.10-fc3.config b/lustre/kernel_patches/kernel_configs/uml-2.6.10-fc3.config index e7685c2..0ec6b4a 100644 --- a/lustre/kernel_patches/kernel_configs/uml-2.6.10-fc3.config +++ b/lustre/kernel_patches/kernel_configs/uml-2.6.10-fc3.config @@ -481,6 +481,7 @@ CONFIG_FS_POSIX_ACL=y # CONFIG_MINIX_FS is not set # CONFIG_ROMFS_FS is not set CONFIG_QUOTA=y +CONFIG_QFMT_V1=m CONFIG_QFMT_V2=y CONFIG_QUOTACTL=y CONFIG_DNOTIFY=y diff --git a/lustre/kernel_patches/patches/export-filemap_populate.patch b/lustre/kernel_patches/patches/export-filemap_populate.patch deleted file mode 100644 index 8f78a79..0000000 --- a/lustre/kernel_patches/patches/export-filemap_populate.patch +++ /dev/null @@ -1,25 +0,0 @@ -Index: linux-2.6.7/mm/filemap.c -=================================================================== ---- linux-2.6.7.orig/mm/filemap.c 2004-11-15 12:02:35.000000000 +0800 -+++ linux-2.6.7/mm/filemap.c 2004-11-15 12:04:38.000000000 +0800 -@@ -1409,6 +1409,7 @@ - - return 0; - } -+EXPORT_SYMBOL_GPL(filemap_populate); - - static struct vm_operations_struct generic_file_vm_ops = { - .nopage = filemap_nopage, -Index: linux-2.6.7/include/linux/mm.h -=================================================================== ---- linux-2.6.7.orig/include/linux/mm.h 2004-11-15 12:02:43.000000000 +0800 -+++ linux-2.6.7/include/linux/mm.h 2004-11-15 12:04:23.000000000 +0800 -@@ -661,6 
+661,8 @@ - - /* generic vm_area_ops exported for stackable file systems */ - struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *); -+int filemap_populate(struct vm_area_struct *, unsigned long, unsigned long, -+ pgprot_t, unsigned long, int); - - /* mm/page-writeback.c */ - int write_one_page(struct page *page, int wait); diff --git a/lustre/kernel_patches/patches/export_symbols-2.6-rhel4.patch b/lustre/kernel_patches/patches/export_symbols-2.6-rhel4.patch index a2b07f8..0561e65 100644 --- a/lustre/kernel_patches/patches/export_symbols-2.6-rhel4.patch +++ b/lustre/kernel_patches/patches/export_symbols-2.6-rhel4.patch @@ -42,18 +42,6 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/ext2_fs_sb.h /* * second extended-fs super-block data in memory */ -Index: linux-2.6.9-5.0.3.EL/net/core/sock.c -=================================================================== ---- linux-2.6.9-5.0.3.EL.orig/net/core/sock.c 2005-02-26 13:24:35.490810168 +0200 -+++ linux-2.6.9-5.0.3.EL/net/core/sock.c 2005-02-26 13:53:13.801587224 +0200 -@@ -602,6 +602,7 @@ - return -EFAULT; - return 0; - } -+EXPORT_SYMBOL(sock_getsockopt); - - static kmem_cache_t *sk_cachep; - Index: linux-2.6.9-5.0.3.EL/fs/namespace.c =================================================================== --- linux-2.6.9-5.0.3.EL.orig/fs/namespace.c 2005-02-26 13:47:31.282658016 +0200 @@ -79,23 +67,6 @@ Index: linux-2.6.9-5.0.3.EL/kernel/exit.c void __set_special_pids(pid_t session, pid_t pgrp) { struct task_struct *curr = current; -@@ -428,6 +430,8 @@ - __exit_files(tsk); - } - -+EXPORT_SYMBOL(exit_files); -+ - static inline void __put_fs_struct(struct fs_struct *fs) - { - /* No need to hold fs->lock if we are killing it */ -@@ -516,6 +516,7 @@ - { - __exit_mm(tsk); - } -+EXPORT_SYMBOL(exit_mm); - - static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) - { Index: linux-2.6.9-5.0.3.EL/fs/dcache.c =================================================================== --- 
linux-2.6.9-5.0.3.EL.orig/fs/dcache.c 2005-02-26 13:49:04.365507272 +0200 @@ -108,50 +79,3 @@ Index: linux-2.6.9-5.0.3.EL/fs/dcache.c void d_genocide(struct dentry *root) { -Index: linux-2.6.9-5.0.3.EL/mm/filemap.c -=================================================================== ---- linux-2.6.9-5.0.3.EL.orig/mm/filemap.c 2005-02-26 13:24:35.502808344 +0200 -+++ linux-2.6.9-5.0.3.EL/mm/filemap.c 2005-02-26 13:53:59.787596288 +0200 -@@ -1473,7 +1473,7 @@ - return NULL; - } - --static int filemap_populate(struct vm_area_struct *vma, -+int filemap_populate(struct vm_area_struct *vma, - unsigned long addr, - unsigned long len, - pgprot_t prot, -@@ -1520,6 +1520,7 @@ - - return 0; - } -+EXPORT_SYMBOL_GPL(filemap_populate); - - struct vm_operations_struct generic_file_vm_ops = { - .nopage = filemap_nopage, -Index: linux-2.6.9-5.0.3.EL/fs/file_table.c -=================================================================== ---- linux-2.6.9-5.0.3.EL.orig/fs/file_table.c 2005-02-26 13:24:35.512806824 +0200 -+++ linux-2.6.9-5.0.3.EL/fs/file_table.c 2005-02-26 13:53:13.811585704 +0200 -@@ -196,6 +196,7 @@ - file_free(file); - } - } -+EXPORT_SYMBOL(put_filp); - - void file_move(struct file *file, struct list_head *list) - { -Index: linux-2.6.9-5.0.3.EL/include/linux/mm.h -=================================================================== ---- linux-2.6.9-5.0.3.EL.orig/include/linux/mm.h 2005-02-26 13:49:05.823285656 +0200 -+++ linux-2.6.9-5.0.3.EL/include/linux/mm.h 2005-02-26 13:53:54.181448552 +0200 -@@ -721,6 +721,9 @@ - - /* generic vm_area_ops exported for stackable file systems */ - struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *); -+int filemap_populate(struct vm_area_struct *vma, unsigned long addr, -+ unsigned long len, pgprot_t prot, unsigned long pgoff, -+ int nonblock); - - /* mm/page-writeback.c */ - int write_one_page(struct page *page, int wait); diff --git a/lustre/kernel_patches/patches/export_symbols-2.6-suse.patch 
b/lustre/kernel_patches/patches/export_symbols-2.6-suse.patch index fbaf63d..8360ce4 100644 --- a/lustre/kernel_patches/patches/export_symbols-2.6-suse.patch +++ b/lustre/kernel_patches/patches/export_symbols-2.6-suse.patch @@ -55,12 +55,3 @@ Index: linux-2.6.5-12.1/kernel/exit.c void __set_special_pids(pid_t session, pid_t pgrp) { struct task_struct *curr = current; -@@ -429,6 +431,8 @@ - __exit_files(tsk); - } - -+EXPORT_SYMBOL(exit_files); -+ - static inline void __put_fs_struct(struct fs_struct *fs) - { - /* No need to hold fs->lock if we are killing it */ diff --git a/lustre/kernel_patches/patches/export_symbols-2.6.12.patch b/lustre/kernel_patches/patches/export_symbols-2.6.12.patch index c08e30f..e21fcf4 100644 --- a/lustre/kernel_patches/patches/export_symbols-2.6.12.patch +++ b/lustre/kernel_patches/patches/export_symbols-2.6.12.patch @@ -25,18 +25,6 @@ Index: linux-2.6.12-rc6/include/linux/fs.h #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) extern int vfs_readlink(struct dentry *, char __user *, int, const char *); -Index: linux-2.6.12-rc6/net/core/sock.c -=================================================================== ---- linux-2.6.12-rc6.orig/net/core/sock.c 2005-06-06 17:22:29.000000000 +0200 -+++ linux-2.6.12-rc6/net/core/sock.c 2005-06-14 15:53:58.349304101 +0200 -@@ -613,6 +613,7 @@ - return -EFAULT; - return 0; - } -+EXPORT_SYMBOL(sock_getsockopt); - - /** - * sk_alloc - All socket objects are allocated here Index: linux-2.6.12-rc6/fs/namespace.c =================================================================== --- linux-2.6.12-rc6.orig/fs/namespace.c 2005-06-14 15:53:17.868835847 +0200 @@ -62,23 +50,6 @@ Index: linux-2.6.12.5/kernel/exit.c void __set_special_pids(pid_t session, pid_t pgrp) { struct task_struct *curr = current; -@@ -432,6 +434,8 @@ - __exit_files(tsk); - } - -+EXPORT_SYMBOL(exit_files); -+ - static inline void __put_fs_struct(struct fs_struct *fs) - { - /* No need to hold fs->lock if we are 
killing it */ -@@ -515,6 +515,7 @@ - task_unlock(tsk); - mmput(mm); - } -+EXPORT_SYMBOL(exit_mm); - - static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) - { Index: linux-2.6.12-rc6/fs/dcache.c =================================================================== --- linux-2.6.12-rc6.orig/fs/dcache.c 2005-06-14 15:53:19.812195198 +0200 @@ -91,15 +62,3 @@ Index: linux-2.6.12-rc6/fs/dcache.c void d_genocide(struct dentry *root) { -Index: linux-2.6.12-rc6/fs/file_table.c -=================================================================== ---- linux-2.6.12-rc6.orig/fs/file_table.c 2005-06-06 17:22:29.000000000 +0200 -+++ linux-2.6.12-rc6/fs/file_table.c 2005-06-14 15:53:58.396179101 +0200 -@@ -197,6 +197,7 @@ - file_free(file); - } - } -+EXPORT_SYMBOL(put_filp); - - void file_move(struct file *file, struct list_head *list) - { diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.21-chaos.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.21-chaos.patch index 588916f..72f5dd5 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.4.21-chaos.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.21-chaos.patch @@ -179,9 +179,9 @@ Index: linux-2.4.21-rhel/fs/ext3/extents.c + +static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) +{ -+ struct ext3_extent_header *neh; -+ neh = EXT_ROOT_HDR(tree); -+ neh->eh_generation++; ++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) | ++ (EXT_GENERATION(neh) + 1); +} + +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) @@ -2591,7 +2591,7 @@ Index: linux-2.4.21-rhel/include/linux/ext3_extents.h =================================================================== --- linux-2.4.21-rhel.orig/include/linux/ext3_extents.h 2005-03-02 22:42:20.659360368 +0300 +++ linux-2.4.21-rhel/include/linux/ext3_extents.h 2005-03-04 02:34:52.000000000 +0300 -@@ -0,0 +1,263 @@ +@@ -0,0 
+1,261 @@ +/* + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -2689,7 +2689,7 @@ Index: linux-2.4.21-rhel/include/linux/ext3_extents.h + __u16 eh_entries; /* number of valid entries */ + __u16 eh_max; /* capacity of store in entries */ + __u16 eh_depth; /* has tree real underlaying blocks? */ -+ __u32 eh_generation; /* generation of the tree */ ++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ +}; + +#define EXT3_EXT_MAGIC 0xf30a @@ -2790,15 +2790,13 @@ Index: linux-2.4.21-rhel/include/linux/ext3_extents.h + (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) ++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) ++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ + -+#define EXT_ROOT_HDR(tree) \ -+ ((struct ext3_extent_header *) (tree)->root) -+#define EXT_BLOCK_HDR(bh) \ -+ ((struct ext3_extent_header *) (bh)->b_data) -+#define EXT_DEPTH(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) -+#define EXT_GENERATION(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) ++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) ++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) + + +#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.21-suse2.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.21-suse2.patch index 305ef8e..940b916 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.4.21-suse2.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.21-suse2.patch @@ -179,9 +179,9 @@ Index: linux-2.4.21-suse2/fs/ext3/extents.c + +static inline void ext3_ext_tree_changed(struct ext3_extents_tree 
*tree) +{ -+ struct ext3_extent_header *neh; -+ neh = EXT_ROOT_HDR(tree); -+ neh->eh_generation++; ++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) | ++ (EXT_GENERATION(neh) + 1); +} + +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) @@ -2589,7 +2589,7 @@ Index: linux-2.4.21-suse2/include/linux/ext3_extents.h =================================================================== --- linux-2.4.21-suse2.orig/include/linux/ext3_extents.h 2003-01-30 13:24:37.000000000 +0300 +++ linux-2.4.21-suse2/include/linux/ext3_extents.h 2004-11-02 20:34:00.000000000 +0300 -@@ -0,0 +1,264 @@ +@@ -0,0 +1,261 @@ +/* + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -2687,7 +2687,7 @@ Index: linux-2.4.21-suse2/include/linux/ext3_extents.h + __u16 eh_entries; /* number of valid entries */ + __u16 eh_max; /* capacity of store in entries */ + __u16 eh_depth; /* has tree real underlaying blocks? 
*/ -+ __u32 eh_generation; /* generation of the tree */ ++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ +}; + +#define EXT3_EXT_MAGIC 0xf30a @@ -2788,15 +2788,13 @@ Index: linux-2.4.21-suse2/include/linux/ext3_extents.h + (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) ++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) ++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ + -+#define EXT_ROOT_HDR(tree) \ -+ ((struct ext3_extent_header *) (tree)->root) -+#define EXT_BLOCK_HDR(bh) \ -+ ((struct ext3_extent_header *) (bh)->b_data) -+#define EXT_DEPTH(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) -+#define EXT_GENERATION(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) ++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) ++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) + + +#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); @@ -2853,7 +2851,6 @@ Index: linux-2.4.21-suse2/include/linux/ext3_extents.h + + +#endif /* _LINUX_EXT3_EXTENTS */ -+ Index: linux-2.4.21-suse2/include/linux/ext3_fs_i.h =================================================================== --- linux-2.4.21-suse2.orig/include/linux/ext3_fs_i.h 2004-11-02 20:31:37.000000000 +0300 diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.24.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.24.patch index 8e84625..571fb0f 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.4.24.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.24.patch @@ -179,9 +179,9 @@ Index: linux-2.4.24/fs/ext3/extents.c + +static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) +{ -+ struct ext3_extent_header *neh; -+ 
neh = EXT_ROOT_HDR(tree); -+ neh->eh_generation++; ++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) | ++ (EXT_GENERATION(neh) + 1); +} + +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) @@ -2577,7 +2577,7 @@ Index: linux-2.4.24/include/linux/ext3_extents.h =================================================================== --- linux-2.4.24.orig/include/linux/ext3_extents.h 2003-01-30 13:24:37.000000000 +0300 +++ linux-2.4.24/include/linux/ext3_extents.h 2004-11-02 20:32:17.000000000 +0300 -@@ -0,0 +1,263 @@ +@@ -0,0 +1,261 @@ +/* + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -2675,7 +2675,7 @@ Index: linux-2.4.24/include/linux/ext3_extents.h + __u16 eh_entries; /* number of valid entries */ + __u16 eh_max; /* capacity of store in entries */ + __u16 eh_depth; /* has tree real underlaying blocks? */ -+ __u32 eh_generation; /* generation of the tree */ ++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ +}; + +#define EXT3_EXT_MAGIC 0xf30a @@ -2776,15 +2776,13 @@ Index: linux-2.4.24/include/linux/ext3_extents.h + (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) ++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) ++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ + -+#define EXT_ROOT_HDR(tree) \ -+ ((struct ext3_extent_header *) (tree)->root) -+#define EXT_BLOCK_HDR(bh) \ -+ ((struct ext3_extent_header *) (bh)->b_data) -+#define EXT_DEPTH(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) -+#define EXT_GENERATION(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) ++#define 
EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) ++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) + + +#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.29.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.29.patch index d77d9a7..125f747 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.4.29.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.29.patch @@ -179,9 +179,9 @@ Index: linux-2.4.29/fs/ext3/extents.c + +static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) +{ -+ struct ext3_extent_header *neh; -+ neh = EXT_ROOT_HDR(tree); -+ neh->eh_generation++; ++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) | ++ (EXT_GENERATION(neh) + 1); +} + +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) @@ -2578,7 +2578,7 @@ Index: linux-2.4.29/include/linux/ext3_extents.h =================================================================== --- linux-2.4.29.orig/include/linux/ext3_extents.h 2005-05-03 16:52:08.724069800 +0300 +++ linux-2.4.29/include/linux/ext3_extents.h 2005-05-03 16:52:08.819055360 +0300 -@@ -0,0 +1,263 @@ +@@ -0,0 +1,261 @@ +/* + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -2676,7 +2676,7 @@ Index: linux-2.4.29/include/linux/ext3_extents.h + __u16 eh_entries; /* number of valid entries */ + __u16 eh_max; /* capacity of store in entries */ + __u16 eh_depth; /* has tree real underlaying blocks? 
*/ -+ __u32 eh_generation; /* generation of the tree */ ++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ +}; + +#define EXT3_EXT_MAGIC 0xf30a @@ -2777,15 +2777,13 @@ Index: linux-2.4.29/include/linux/ext3_extents.h + (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) ++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) ++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ + -+#define EXT_ROOT_HDR(tree) \ -+ ((struct ext3_extent_header *) (tree)->root) -+#define EXT_BLOCK_HDR(bh) \ -+ ((struct ext3_extent_header *) (bh)->b_data) -+#define EXT_DEPTH(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) -+#define EXT_GENERATION(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) ++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) ++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) + + +#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); diff --git a/lustre/kernel_patches/patches/ext3-extents-2.6.12.patch b/lustre/kernel_patches/patches/ext3-extents-2.6.12.patch index 657ecf4..b6439e6 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.6.12.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.6.12.patch @@ -2,7 +2,7 @@ Index: linux-2.6.12-rc6/fs/ext3/extents.c =================================================================== --- linux-2.6.12-rc6.orig/fs/ext3/extents.c 2005-06-14 16:31:25.756503133 +0200 +++ linux-2.6.12-rc6/fs/ext3/extents.c 2005-06-14 16:31:25.836581257 +0200 -@@ -0,0 +1,2347 @@ +@@ -0,0 +1,2353 @@ +/* + * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -176,9 +176,9 @@ Index: linux-2.6.12-rc6/fs/ext3/extents.c + +static 
inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) +{ -+ struct ext3_extent_header *neh; -+ neh = EXT_ROOT_HDR(tree); -+ neh->eh_generation++; ++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) | ++ (EXT_GENERATION(neh) + 1); +} + +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) @@ -448,8 +448,12 @@ Index: linux-2.6.12-rc6/fs/ext3/extents.c + + eh = EXT_ROOT_HDR(tree); + EXT_ASSERT(eh); -+ if (ext3_ext_check_header(eh)) ++ if (ext3_ext_check_header(eh)) { ++ /* don't free previously allocated path ++ * -- caller should take care */ ++ path = NULL; + goto err; ++ } + + i = depth = EXT_DEPTH(tree); + EXT_ASSERT(eh->eh_max); @@ -506,8 +510,10 @@ Index: linux-2.6.12-rc6/fs/ext3/extents.c + +err: + printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); -+ ext3_ext_drop_refs(path); -+ kfree(path); ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } + return ERR_PTR(-EIO); +} + @@ -2644,7 +2650,7 @@ Index: linux-2.6.12-rc6/include/linux/ext3_extents.h =================================================================== --- linux-2.6.12-rc6.orig/include/linux/ext3_extents.h 2005-06-14 16:31:25.780917195 +0200 +++ linux-2.6.12-rc6/include/linux/ext3_extents.h 2005-06-14 16:31:25.932284381 +0200 -@@ -0,0 +1,264 @@ +@@ -0,0 +1,262 @@ +/* + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -2742,7 +2748,7 @@ Index: linux-2.6.12-rc6/include/linux/ext3_extents.h + __u16 eh_entries; /* number of valid entries */ + __u16 eh_max; /* capacity of store in entries */ + __u16 eh_depth; /* has tree real underlaying blocks? 
*/ -+ __u32 eh_generation; /* generation of the tree */ ++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ +}; + +#define EXT3_EXT_MAGIC 0xf30a @@ -2843,15 +2849,13 @@ Index: linux-2.6.12-rc6/include/linux/ext3_extents.h + (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) ++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) ++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ + -+#define EXT_ROOT_HDR(tree) \ -+ ((struct ext3_extent_header *) (tree)->root) -+#define EXT_BLOCK_HDR(bh) \ -+ ((struct ext3_extent_header *) (bh)->b_data) -+#define EXT_DEPTH(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) -+#define EXT_GENERATION(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) ++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) ++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) + + +#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); diff --git a/lustre/kernel_patches/patches/ext3-extents-2.6.5.patch b/lustre/kernel_patches/patches/ext3-extents-2.6.5.patch index 0ee8d28..9e78214 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.6.5.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.6.5.patch @@ -3,7 +3,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c =================================================================== --- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2005-02-17 22:07:57.023609040 +0300 +++ linux-2.6.5-sles9/fs/ext3/extents.c 2005-02-23 01:02:37.396435640 +0300 -@@ -0,0 +1,2349 @@ +@@ -0,0 +1,2355 @@ +/* + * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -177,9 +177,9 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + +static 
inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) +{ -+ struct ext3_extent_header *neh; -+ neh = EXT_ROOT_HDR(tree); -+ neh->eh_generation++; ++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) | ++ (EXT_GENERATION(neh) + 1); +} + +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) @@ -449,8 +449,12 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + + eh = EXT_ROOT_HDR(tree); + EXT_ASSERT(eh); -+ if (ext3_ext_check_header(eh)) ++ if (ext3_ext_check_header(eh)) { ++ /* don't free previously allocated path ++ * -- caller should take care */ ++ path = NULL; + goto err; ++ } + + i = depth = EXT_DEPTH(tree); + EXT_ASSERT(eh->eh_max); @@ -507,8 +511,10 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + +err: + printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); -+ ext3_ext_drop_refs(path); -+ kfree(path); ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } + return ERR_PTR(-EIO); +} + @@ -2634,7 +2640,7 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h =================================================================== --- linux-2.6.5-sles9.orig/include/linux/ext3_extents.h 2005-02-17 22:07:57.023609040 +0300 +++ linux-2.6.5-sles9/include/linux/ext3_extents.h 2005-02-23 01:02:37.416432600 +0300 -@@ -0,0 +1,264 @@ +@@ -0,0 +1,262 @@ +/* + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -2732,7 +2738,7 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + __u16 eh_entries; /* number of valid entries */ + __u16 eh_max; /* capacity of store in entries */ + __u16 eh_depth; /* has tree real underlaying blocks? 
*/ -+ __u32 eh_generation; /* generation of the tree */ ++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ +}; + +#define EXT3_EXT_MAGIC 0xf30a @@ -2833,15 +2839,13 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) ++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) ++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ + -+#define EXT_ROOT_HDR(tree) \ -+ ((struct ext3_extent_header *) (tree)->root) -+#define EXT_BLOCK_HDR(bh) \ -+ ((struct ext3_extent_header *) (bh)->b_data) -+#define EXT_DEPTH(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) -+#define EXT_GENERATION(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) ++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) ++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) + + +#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); diff --git a/lustre/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch b/lustre/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch index 56fe653..bd95c54 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch @@ -2,7 +2,7 @@ Index: linux-stage/fs/ext3/extents.c =================================================================== --- linux-stage.orig/fs/ext3/extents.c 2005-02-25 15:33:48.890198160 +0200 +++ linux-stage/fs/ext3/extents.c 2005-02-25 15:33:48.917194056 +0200 -@@ -0,0 +1,2347 @@ +@@ -0,0 +1,2353 @@ +/* + * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -176,9 +176,9 @@ Index: linux-stage/fs/ext3/extents.c + +static 
inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) +{ -+ struct ext3_extent_header *neh; -+ neh = EXT_ROOT_HDR(tree); -+ neh->eh_generation++; ++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) | ++ (EXT_GENERATION(neh) + 1); +} + +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) @@ -448,8 +448,12 @@ Index: linux-stage/fs/ext3/extents.c + + eh = EXT_ROOT_HDR(tree); + EXT_ASSERT(eh); -+ if (ext3_ext_check_header(eh)) ++ if (ext3_ext_check_header(eh)) { ++ /* don't free previously allocated path ++ * -- caller should take care */ ++ path = NULL; + goto err; ++ } + + i = depth = EXT_DEPTH(tree); + EXT_ASSERT(eh->eh_max); @@ -506,8 +510,10 @@ Index: linux-stage/fs/ext3/extents.c + +err: + printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); -+ ext3_ext_drop_refs(path); -+ kfree(path); ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } + return ERR_PTR(-EIO); +} + @@ -2629,7 +2635,7 @@ Index: linux-stage/include/linux/ext3_extents.h =================================================================== --- linux-stage.orig/include/linux/ext3_extents.h 2005-02-25 15:33:48.891198008 +0200 +++ linux-stage/include/linux/ext3_extents.h 2005-02-25 15:33:48.944189952 +0200 -@@ -0,0 +1,264 @@ +@@ -0,0 +1,262 @@ +/* + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -2727,7 +2733,7 @@ Index: linux-stage/include/linux/ext3_extents.h + __u16 eh_entries; /* number of valid entries */ + __u16 eh_max; /* capacity of store in entries */ + __u16 eh_depth; /* has tree real underlaying blocks? 
*/ -+ __u32 eh_generation; /* generation of the tree */ ++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ +}; + +#define EXT3_EXT_MAGIC 0xf30a @@ -2828,15 +2834,13 @@ Index: linux-stage/include/linux/ext3_extents.h + (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) ++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) ++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ + -+#define EXT_ROOT_HDR(tree) \ -+ ((struct ext3_extent_header *) (tree)->root) -+#define EXT_BLOCK_HDR(bh) \ -+ ((struct ext3_extent_header *) (bh)->b_data) -+#define EXT_DEPTH(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) -+#define EXT_GENERATION(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) ++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) ++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) + + +#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch index 1d8a4af..2a64875 100644 --- a/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch @@ -2570,7 +2570,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + int freed; + + sb = inode->i_sb; -+ if (!test_opt(sb, MBALLOC)) ++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info) + ext3_free_blocks_old(handle, inode, block, count); + else { + ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch index 0c2f445..70f4f8a 100644 --- 
a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch @@ -2565,7 +2565,7 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + int freed; + + sb = inode->i_sb; -+ if (!test_opt(sb, MBALLOC)) ++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info) + ext3_free_blocks_sb(handle, sb, block, count, &freed); + else + ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch index 5ff3d3b..01e7387 100644 --- a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch @@ -2584,7 +2584,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + int freed; + + sb = inode->i_sb; -+ if (!test_opt(sb, MBALLOC)) ++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info) + ext3_free_blocks_sb(handle, sb, block, count, &freed); + else + ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); diff --git a/lustre/kernel_patches/patches/ext3-nlinks-2.4.20-hp_pnnl.patch b/lustre/kernel_patches/patches/ext3-nlinks-2.4.20-hp_pnnl.patch index 40bbaa5..3273075 100644 --- a/lustre/kernel_patches/patches/ext3-nlinks-2.4.20-hp_pnnl.patch +++ b/lustre/kernel_patches/patches/ext3-nlinks-2.4.20-hp_pnnl.patch @@ -26,7 +26,7 @@ Index: linux/fs/ext3/namei.c int err; - if (dir->i_nlink >= EXT3_LINK_MAX) -+ if (EXT3_DIR_LINK_MAXED(dir)) ++ if (EXT3_DIR_LINK_MAX(dir)) return -EMLINK; handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + @@ -98,7 +98,7 @@ Index: linux/fs/ext3/namei.c return -EPERM; - if (inode->i_nlink >= EXT3_LINK_MAX) { -+ if (EXT3_DIR_LINK_MAXED(inode)) ++ if (EXT3_DIR_LINK_MAX(inode)) return -EMLINK; - } @@ -111,7 +111,7 @@ Index: linux/fs/ext3/namei.c - if (!new_inode && new_dir!=old_dir && - new_dir->i_nlink >= EXT3_LINK_MAX) + if (!new_inode && new_dir != old_dir && -+ 
EXT3_DIR_LINK_MAXED(new_dir)) ++ EXT3_DIR_LINK_MAX(new_dir)) goto end_rename; } if (!new_bh) { @@ -154,24 +154,3 @@ Index: linux/include/linux/ext3_fs.h /* * Macro-instructions used to manage several block sizes -@@ -580,14 +580,15 @@ - */ - - #ifdef CONFIG_EXT3_INDEX -- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -- EXT3_FEATURE_COMPAT_DIR_INDEX) && \ -+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ - (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) --#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) --#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) -+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \ -+ (is_dx(dir) && (dir)->i_nlink == 1)) - #else - #define is_dx(dir) 0 --#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) - #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) - #endif - diff --git a/lustre/kernel_patches/patches/ext3-nlinks-2.4.21-chaos.patch b/lustre/kernel_patches/patches/ext3-nlinks-2.4.21-chaos.patch index 4543943..4c3ebb8 100644 --- a/lustre/kernel_patches/patches/ext3-nlinks-2.4.21-chaos.patch +++ b/lustre/kernel_patches/patches/ext3-nlinks-2.4.21-chaos.patch @@ -26,7 +26,7 @@ Index: 69chaos/fs/ext3/namei.c int err; - if (dir->i_nlink >= EXT3_LINK_MAX) -+ if (EXT3_DIR_LINK_MAXED(dir)) ++ if (EXT3_DIR_LINK_MAX(dir)) return -EMLINK; handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + @@ -98,7 +98,7 @@ Index: 69chaos/fs/ext3/namei.c return -EPERM; - if (inode->i_nlink >= EXT3_LINK_MAX) { -+ if (EXT3_DIR_LINK_MAXED(inode)) ++ if (EXT3_DIR_LINK_MAX(inode)) return -EMLINK; - } @@ -111,7 +111,7 @@ Index: 69chaos/fs/ext3/namei.c - if (!new_inode && new_dir!=old_dir && - new_dir->i_nlink >= EXT3_LINK_MAX) + if (!new_inode && new_dir != old_dir && -+ 
EXT3_DIR_LINK_MAXED(new_dir)) ++ EXT3_DIR_LINK_MAX(new_dir)) goto end_rename; } if (!new_bh) { @@ -154,24 +154,3 @@ Index: 69chaos/include/linux/ext3_fs.h /* * Macro-instructions used to manage several block sizes -@@ -582,14 +582,15 @@ - */ - - #ifdef CONFIG_EXT3_INDEX -- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -- EXT3_FEATURE_COMPAT_DIR_INDEX) && \ -+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ - (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) --#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) --#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) -+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \ -+ (is_dx(dir) && (dir)->i_nlink == 1)) - #else - #define is_dx(dir) 0 --#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) - #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) - #endif - diff --git a/lustre/kernel_patches/patches/ext3-nlinks-2.4.24.patch b/lustre/kernel_patches/patches/ext3-nlinks-2.4.24.patch index 245d83e..621d1b3 100644 --- a/lustre/kernel_patches/patches/ext3-nlinks-2.4.24.patch +++ b/lustre/kernel_patches/patches/ext3-nlinks-2.4.24.patch @@ -24,7 +24,7 @@ int err; - if (dir->i_nlink >= EXT3_LINK_MAX) -+ if (EXT3_DIR_LINK_MAXED(dir)) ++ if (EXT3_DIR_LINK_MAX(dir)) return -EMLINK; handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + @@ -96,7 +96,7 @@ return -EPERM; - if (inode->i_nlink >= EXT3_LINK_MAX) { -+ if (EXT3_DIR_LINK_MAXED(inode)) ++ if (EXT3_DIR_LINK_MAX(inode)) return -EMLINK; - } @@ -109,7 +109,7 @@ - if (!new_inode && new_dir!=old_dir && - new_dir->i_nlink >= EXT3_LINK_MAX) + if (!new_inode && new_dir != old_dir && -+ EXT3_DIR_LINK_MAXED(new_dir)) ++ EXT3_DIR_LINK_MAX(new_dir)) goto end_rename; } if (!new_bh) { @@ -150,24 +150,3 @@ /* * 
Macro-instructions used to manage several block sizes -@@ -581,14 +581,15 @@ - */ - - #ifdef CONFIG_EXT3_INDEX -- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -- EXT3_FEATURE_COMPAT_DIR_INDEX) && \ -+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ - (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) --#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) --#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) -+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \ -+ (is_dx(dir) && (dir)->i_nlink == 1)) - #else - #define is_dx(dir) 0 --#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) - #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) - #endif - diff --git a/lustre/kernel_patches/patches/ext3-nlinks-2.6.7.patch b/lustre/kernel_patches/patches/ext3-nlinks-2.6.7.patch index bb9fc1b..0d360fa 100644 --- a/lustre/kernel_patches/patches/ext3-nlinks-2.6.7.patch +++ b/lustre/kernel_patches/patches/ext3-nlinks-2.6.7.patch @@ -26,7 +26,7 @@ Index: linux-2.6.7/fs/ext3/namei.c int err; - if (dir->i_nlink >= EXT3_LINK_MAX) -+ if (EXT3_DIR_LINK_MAXED(dir)) ++ if (EXT3_DIR_LINK_MAX(dir)) return -EMLINK; handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + @@ -86,7 +86,7 @@ Index: linux-2.6.7/fs/ext3/namei.c int err; - if (inode->i_nlink >= EXT3_LINK_MAX) -+ if (EXT3_DIR_LINK_MAXED(inode)) ++ if (EXT3_DIR_LINK_MAX(inode)) return -EMLINK; handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + @@ -97,7 +97,7 @@ Index: linux-2.6.7/fs/ext3/namei.c - if (!new_inode && new_dir!=old_dir && - new_dir->i_nlink >= EXT3_LINK_MAX) + if (!new_inode && new_dir != old_dir && -+ EXT3_DIR_LINK_MAXED(new_dir)) ++ EXT3_DIR_LINK_MAX(new_dir)) goto end_rename; } if (!new_bh) { @@ -140,24 +140,3 @@ Index: 
linux-2.6.7/include/linux/ext3_fs.h /* * Macro-instructions used to manage several block sizes -@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 { - */ - - #ifdef CONFIG_EXT3_INDEX -- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -- EXT3_FEATURE_COMPAT_DIR_INDEX) && \ -+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ - (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) --#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) --#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) -+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \ -+ (is_dx(dir) && (dir)->i_nlink == 1)) - #else - #define is_dx(dir) 0 --#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) - #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) - #endif - diff --git a/lustre/kernel_patches/patches/ext3-nlinks-2.6.9.patch b/lustre/kernel_patches/patches/ext3-nlinks-2.6.9.patch index 62bf156..37cca81 100644 --- a/lustre/kernel_patches/patches/ext3-nlinks-2.6.9.patch +++ b/lustre/kernel_patches/patches/ext3-nlinks-2.6.9.patch @@ -20,16 +20,16 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c } static int ext3_add_nondir(handle_t *handle, -@@ -1706,7 +1712,7 @@ +@@ -1706,7 +1712,7 @@ static int ext3_add_nondir(handle_t struct ext3_dir_entry_2 * de; int err, retries = 0; - if (dir->i_nlink >= EXT3_LINK_MAX) -+ if (EXT3_DIR_LINK_MAXED(dir)) ++ if (EXT3_DIR_LINK_MAX(dir)) return -EMLINK; retry: -@@ -1729,7 +1735,7 @@ +@@ -1729,7 +1735,7 @@ static int ext3_mkdir(struct inode inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; dir_block = ext3_bread (handle, inode, 0, 1, &err); if (!dir_block) { @@ -38,7 +38,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c ext3_mark_inode_dirty(handle, inode); iput (inode); goto 
out_stop; -@@ -1761,7 +1767,7 @@ +@@ -1761,7 +1767,7 @@ static int ext3_mkdir(struct inode iput (inode); goto out_stop; } @@ -47,7 +47,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c ext3_update_dx_flag(dir); ext3_mark_inode_dirty(handle, dir); d_instantiate(dentry, inode); -@@ -2026,10 +2032,10 @@ +@@ -2026,10 +2032,10 @@ static int ext3_rmdir (struct inode retval = ext3_delete_entry(handle, dir, de, bh); if (retval) goto end_rmdir; @@ -62,7 +62,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c inode->i_version++; inode->i_nlink = 0; /* There's no need to set i_disksize: the fact that i_nlink is -@@ -2039,7 +2045,7 @@ +@@ -2039,7 +2045,7 @@ static int ext3_rmdir (struct inode ext3_orphan_add(handle, inode); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; ext3_mark_inode_dirty(handle, inode); @@ -71,7 +71,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c ext3_update_dx_flag(dir); ext3_mark_inode_dirty(handle, dir); -@@ -2090,7 +2096,7 @@ +@@ -2090,7 +2096,7 @@ static int ext3_unlink(struct inode dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; ext3_update_dx_flag(dir); ext3_mark_inode_dirty(handle, dir); @@ -80,27 +80,27 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c if (!inode->i_nlink) ext3_orphan_add(handle, inode); inode->i_ctime = dir->i_ctime; -@@ -2165,7 +2171,7 @@ +@@ -2165,7 +2171,7 @@ static int ext3_link (struct dentry struct inode *inode = old_dentry->d_inode; int err, retries = 0; - if (inode->i_nlink >= EXT3_LINK_MAX) -+ if (EXT3_DIR_LINK_MAXED(inode)) ++ if (EXT3_DIR_LINK_MAX(inode)) return -EMLINK; retry: -@@ -2252,8 +2258,8 @@ +@@ -2252,8 +2258,8 @@ static int ext3_rename (struct inode if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) goto end_rename; retval = -EMLINK; - if (!new_inode && new_dir!=old_dir && - new_dir->i_nlink >= EXT3_LINK_MAX) + if (!new_inode && new_dir != old_dir && -+ EXT3_DIR_LINK_MAXED(new_dir)) ++ EXT3_DIR_LINK_MAX(new_dir)) goto end_rename; } if (!new_bh) { -@@ 
-2310,7 +2316,7 @@ +@@ -2310,7 +2316,7 @@ static int ext3_rename (struct inode } if (new_inode) { @@ -109,7 +109,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c new_inode->i_ctime = CURRENT_TIME_SEC; } old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; -@@ -2321,11 +2327,13 @@ +@@ -2321,11 +2327,13 @@ static int ext3_rename (struct inode PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); ext3_journal_dirty_metadata(handle, dir_bh); @@ -140,24 +140,3 @@ Index: linux-2.6.7/include/linux/ext3_fs.h /* * Macro-instructions used to manage several block sizes -@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 { - */ - - #ifdef CONFIG_EXT3_INDEX -- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -- EXT3_FEATURE_COMPAT_DIR_INDEX) && \ -+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ - (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) --#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) --#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) -+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \ -+ (is_dx(dir) && (dir)->i_nlink == 1)) - #else - #define is_dx(dir) 0 --#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) - #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) - #endif - diff --git a/lustre/kernel_patches/patches/iallocsem_consistency.patch b/lustre/kernel_patches/patches/iallocsem_consistency.patch new file mode 100644 index 0000000..916ba88 --- /dev/null +++ b/lustre/kernel_patches/patches/iallocsem_consistency.patch @@ -0,0 +1,48 @@ +Index: linux-2.6.9/fs/attr.c +=================================================================== +--- linux-2.6.9/fs.orig/attr.c 2006-03-10 17:20:39.000000000 +0200 ++++ 
linux-2.6.9/fs/attr.c 2006-04-09 01:21:44.000000000 +0300 +@@ -177,6 +177,9 @@ + if (!attr->ia_valid) + return 0; + ++ if (ia_valid & ATTR_SIZE) ++ down_write(&dentry->d_inode->i_alloc_sem); ++ + if (inode->i_op && inode->i_op->setattr) { + audit_notify_watch(inode, MAY_WRITE); + error = security_inode_setattr(dentry, attr); +@@ -194,6 +197,10 @@ + error = inode_setattr(inode, attr); + } + } ++ ++ if (ia_valid & ATTR_SIZE) ++ up_write(&dentry->d_inode->i_alloc_sem); ++ + if (!error) { + unsigned long dn_mask = setattr_mask(ia_valid); + if (dn_mask) +Index: linux-2.6.9/fs/open.c +=================================================================== +--- linux-2.6.9/fs.orig/open.c 2006-04-09 01:18:08.000000000 +0300 ++++ linux-2.6.9/fs/open.c 2006-04-09 01:22:29.000000000 +0300 +@@ -205,16 +205,16 @@ + newattrs.ia_size = length; + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; + down(&dentry->d_inode->i_sem); +- down_write(&dentry->d_inode->i_alloc_sem); + if (called_from_open) + newattrs.ia_valid |= ATTR_FROM_OPEN; + if (op->setattr_raw) { + newattrs.ia_valid |= ATTR_RAW; + newattrs.ia_ctime = CURRENT_TIME; ++ down_write(&dentry->d_inode->i_alloc_sem); + err = op->setattr_raw(dentry->d_inode, &newattrs); ++ up_write(&dentry->d_inode->i_alloc_sem); + } else + err = notify_change(dentry, &newattrs); +- up_write(&dentry->d_inode->i_alloc_sem); + up(&dentry->d_inode->i_sem); + return err; + } diff --git a/lustre/kernel_patches/patches/nfs-cifs-intent-2.6-fc3.patch b/lustre/kernel_patches/patches/nfs-cifs-intent-2.6-fc3.patch index 47c152c..c75d7e8 100644 --- a/lustre/kernel_patches/patches/nfs-cifs-intent-2.6-fc3.patch +++ b/lustre/kernel_patches/patches/nfs-cifs-intent-2.6-fc3.patch @@ -1,8 +1,8 @@ -Index: uml/fs/cifs/dir.c +Index: linux-2.6.10/fs/cifs/dir.c =================================================================== ---- uml.orig/fs/cifs/dir.c 2004-12-24 16:35:01.000000000 -0500 -+++ uml/fs/cifs/dir.c 2005-04-13 23:43:03.681625568 -0400 -@@ -199,23 +199,23 @@ +--- 
linux-2.6.10.orig/fs/cifs/dir.c ++++ linux-2.6.10/fs/cifs/dir.c +@@ -199,23 +199,23 @@ cifs_create(struct inode *inode, struct } if(nd) { @@ -32,11 +32,11 @@ Index: uml/fs/cifs/dir.c disposition = FILE_OPEN_IF; else { cFYI(1,("Create flag not set in create function")); -Index: uml/fs/nfs/nfs4proc.c +Index: linux-2.6.10/fs/nfs/nfs4proc.c =================================================================== ---- uml.orig/fs/nfs/nfs4proc.c 2004-12-24 16:35:23.000000000 -0500 -+++ uml/fs/nfs/nfs4proc.c 2005-04-13 23:43:26.409770503 -0400 -@@ -775,17 +775,17 @@ +--- linux-2.6.10.orig/fs/nfs/nfs4proc.c ++++ linux-2.6.10/fs/nfs/nfs4proc.c +@@ -775,17 +775,17 @@ nfs4_atomic_open(struct inode *dir, stru struct nfs4_state *state; if (nd->flags & LOOKUP_CREATE) { @@ -57,11 +57,20 @@ Index: uml/fs/nfs/nfs4proc.c put_rpccred(cred); if (IS_ERR(state)) return (struct inode *)state; -Index: uml/fs/nfs/dir.c +Index: linux-2.6.10/fs/nfs/dir.c =================================================================== ---- uml.orig/fs/nfs/dir.c 2005-04-13 23:42:21.792883770 -0400 -+++ uml/fs/nfs/dir.c 2005-04-13 23:43:03.685625066 -0400 -@@ -791,7 +791,7 @@ +--- linux-2.6.10.orig/fs/nfs/dir.c ++++ linux-2.6.10/fs/nfs/dir.c +@@ -718,7 +718,7 @@ int nfs_is_exclusive_create(struct inode + return 0; + if (!nd || (nd->flags & LOOKUP_CONTINUE) || !(nd->flags & LOOKUP_CREATE)) + return 0; +- return (nd->intent.open.flags & O_EXCL) != 0; ++ return (nd->intent.it_flags & O_EXCL) != 0; + } + + static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +@@ -791,7 +791,7 @@ static int is_atomic_open(struct inode * if (nd->flags & LOOKUP_DIRECTORY) return 0; /* Are we trying to write to a read only partition? 
*/ @@ -70,7 +79,7 @@ Index: uml/fs/nfs/dir.c return 0; return 1; } -@@ -812,7 +812,7 @@ +@@ -812,7 +812,7 @@ static struct dentry *nfs_atomic_lookup( dentry->d_op = NFS_PROTO(dir)->dentry_ops; /* Let vfs_create() deal with O_EXCL */ @@ -79,7 +88,7 @@ Index: uml/fs/nfs/dir.c goto no_entry; /* Open the file on the server */ -@@ -820,7 +820,7 @@ +@@ -820,7 +820,7 @@ static struct dentry *nfs_atomic_lookup( /* Revalidate parent directory attribute cache */ nfs_revalidate_inode(NFS_SERVER(dir), dir); @@ -88,7 +97,7 @@ Index: uml/fs/nfs/dir.c nfs_begin_data_update(dir); inode = nfs4_atomic_open(dir, dentry, nd); nfs_end_data_update(dir); -@@ -836,7 +836,7 @@ +@@ -836,7 +836,7 @@ static struct dentry *nfs_atomic_lookup( break; /* This turned out not to be a regular file */ case -ELOOP: @@ -97,7 +106,7 @@ Index: uml/fs/nfs/dir.c goto no_open; /* case -EISDIR: */ /* case -EINVAL: */ -@@ -875,7 +875,7 @@ +@@ -875,7 +875,7 @@ static int nfs_open_revalidate(struct de /* NFS only supports OPEN on regular files */ if (!S_ISREG(inode->i_mode)) goto no_open; @@ -106,3 +115,13 @@ Index: uml/fs/nfs/dir.c /* We cannot do exclusive creation on a positive dentry */ if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) goto no_open; +@@ -1043,7 +1043,8 @@ static int nfs_create(struct inode *dir, + attr.ia_valid = ATTR_MODE; + + if (nd && (nd->flags & LOOKUP_CREATE)) +- open_flags = nd->intent.open.flags; ++ open_flags = nd->intent.it_flags; ++ + + /* + * The 0 argument passed into the create function should one day diff --git a/lustre/kernel_patches/patches/nfs-cifs-intent-2.6-suse.patch b/lustre/kernel_patches/patches/nfs-cifs-intent-2.6-suse.patch index 77d5b30..0adb06c 100644 --- a/lustre/kernel_patches/patches/nfs-cifs-intent-2.6-suse.patch +++ b/lustre/kernel_patches/patches/nfs-cifs-intent-2.6-suse.patch @@ -2,6 +2,15 @@ Index: linux-2.6.5-7.108/fs/nfs/dir.c =================================================================== --- linux-2.6.5-7.108.orig/fs/nfs/dir.c 2004-09-15 
19:26:43.012732408 +0300 +++ linux-2.6.5-7.108/fs/nfs/dir.c 2004-09-15 20:03:32.882781096 +0300 +@@ -709,7 +709,7 @@ + return 0; + if (!nd || (nd->flags & LOOKUP_CONTINUE) || !(nd->flags & LOOKUP_CREATE)) + return 0; +- return (nd->intent.open.flags & O_EXCL) != 0; ++ return (nd->intent.it_flags & O_EXCL) != 0; + } + + static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) @@ -782,7 +782,7 @@ if (nd->flags & LOOKUP_DIRECTORY) return 0; @@ -47,6 +56,15 @@ Index: linux-2.6.5-7.108/fs/nfs/dir.c if (openflags & O_CREAT) { /* If this is a negative dentry, just drop it */ if (!inode) +@@ -1026,7 +1026,7 @@ + attr.ia_valid = ATTR_MODE; + + if (nd && (nd->flags & LOOKUP_CREATE)) +- open_flags = nd->intent.open.flags; ++ open_flags = nd->intent.it_flags; + + /* + * The 0 argument passed into the create function should one day Index: linux-2.6.5-7.108/fs/nfs/nfs4proc.c =================================================================== --- linux-2.6.5-7.108.orig/fs/nfs/nfs4proc.c 2004-04-04 06:37:39.000000000 +0300 diff --git a/lustre/kernel_patches/patches/nfs-cifs-intent-2.6.12.patch b/lustre/kernel_patches/patches/nfs-cifs-intent-2.6.12.patch index 41e5ecb..ff06d68 100644 --- a/lustre/kernel_patches/patches/nfs-cifs-intent-2.6.12.patch +++ b/lustre/kernel_patches/patches/nfs-cifs-intent-2.6.12.patch @@ -2,6 +2,15 @@ Index: linux-2.6.12-rc6/fs/nfs/dir.c =================================================================== --- linux-2.6.12-rc6.orig/fs/nfs/dir.c 2005-06-14 14:22:14.585699648 +0200 +++ linux-2.6.12-rc6/fs/nfs/dir.c 2005-06-14 14:26:39.884524523 +0200 +@@ -727,7 +727,7 @@ + return 0; + if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0) + return 0; +- return (nd->intent.open.flags & O_EXCL) != 0; ++ return (nd->intent.it_flags & O_EXCL) != 0; + } + + static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) @@ -783,7 +783,7 @@ if (nd->flags & LOOKUP_DIRECTORY) 
return 0; @@ -47,6 +56,15 @@ Index: linux-2.6.12-rc6/fs/nfs/dir.c /* We cannot do exclusive creation on a positive dentry */ if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) goto no_open; +@@ -1028,7 +1028,7 @@ + attr.ia_valid = ATTR_MODE; + + if (nd && (nd->flags & LOOKUP_CREATE)) +- open_flags = nd->intent.open.flags; ++ open_flags = nd->intent.it_flags; + + lock_kernel(); + nfs_begin_data_update(dir); Index: linux-2.6.12-rc6/fs/nfs/nfs4proc.c =================================================================== --- linux-2.6.12-rc6.orig/fs/nfs/nfs4proc.c 2005-06-06 17:22:29.000000000 +0200 diff --git a/lustre/kernel_patches/patches/tcp-zero-copy-2.6.12.6.patch b/lustre/kernel_patches/patches/tcp-zero-copy-2.6.12.6.patch new file mode 100644 index 0000000..a0245be --- /dev/null +++ b/lustre/kernel_patches/patches/tcp-zero-copy-2.6.12.6.patch @@ -0,0 +1,459 @@ +diff -Nur linux-2.6.12.6-orig/include/linux/skbuff.h linux-2.6.12.6/include/linux/skbuff.h +--- linux-2.6.12.6-orig/include/linux/skbuff.h 2006-03-14 19:40:26.000000000 +0800 ++++ linux-2.6.12.6/include/linux/skbuff.h 2006-03-16 17:04:51.000000000 +0800 +@@ -128,6 +128,30 @@ + __u16 size; + }; + ++/* Support for callback when skb data has been released */ ++typedef struct zccd /* Zero Copy Callback Descriptor */ ++{ /* (embed as first member of custom struct) */ ++ atomic_t zccd_count; /* reference count */ ++ void (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */ ++} zccd_t; ++ ++static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *)) ++{ ++ atomic_set (&d->zccd_count, 1); ++ d->zccd_destructor = callback; ++} ++ ++static inline void zccd_get (zccd_t *d) /* take a reference */ ++{ ++ atomic_inc (&d->zccd_count); ++} ++ ++static inline void zccd_put (zccd_t *d) /* release a reference */ ++{ ++ if (atomic_dec_and_test (&d->zccd_count)) ++ (d->zccd_destructor)(d); ++} ++ + /* This data is invariant across clones and lives at + * the end of the header data, ie. 
at skb->end. + */ +@@ -137,6 +161,13 @@ + unsigned short tso_size; + unsigned short tso_segs; + struct sk_buff *frag_list; ++ zccd_t *zccd; /* zero copy descriptor */ ++ zccd_t *zccd2; /* 2nd zero copy descriptor */ ++ /* NB we expect zero-copy data to be at least 1 packet, so ++ * having 2 zccds means we don't unneccessarily split the packet ++ * where consecutive zero-copy sends abutt. ++ */ ++ + skb_frag_t frags[MAX_SKB_FRAGS]; + }; + +diff -Nur linux-2.6.12.6-orig/include/net/tcp.h linux-2.6.12.6/include/net/tcp.h +--- linux-2.6.12.6-orig/include/net/tcp.h 2005-06-18 03:48:29.000000000 +0800 ++++ linux-2.6.12.6/include/net/tcp.h 2006-03-16 17:05:02.000000000 +0800 +@@ -783,6 +783,9 @@ + extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size); + extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); ++extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, zccd_t *zccd); ++ + + extern int tcp_ioctl(struct sock *sk, + int cmd, +@@ -879,6 +882,9 @@ + struct msghdr *msg, + size_t len, int nonblock, + int flags, int *addr_len); ++extern int tcp_recvpackets(struct sock *sk, ++ struct sk_buff_head *packets, ++ int len, int nonblock); + + extern int tcp_listen_start(struct sock *sk); + +diff -Nur linux-2.6.12.6-orig/net/core/dev.c linux-2.6.12.6/net/core/dev.c +--- linux-2.6.12.6-orig/net/core/dev.c 2005-06-18 03:48:29.000000000 +0800 ++++ linux-2.6.12.6/net/core/dev.c 2006-03-16 17:04:36.000000000 +0800 +@@ -1176,6 +1176,9 @@ + ninfo->tso_segs = skb_shinfo(skb)->tso_segs; + ninfo->nr_frags = 0; + ninfo->frag_list = NULL; ++ ninfo->zccd = NULL; /* copied data => no user zero copy descriptor */ ++ ninfo->zccd2 = NULL; ++ + + /* Offset between the two in bytes */ + offset = data - skb->head; +diff -Nur linux-2.6.12.6-orig/net/core/skbuff.c linux-2.6.12.6/net/core/skbuff.c +--- linux-2.6.12.6-orig/net/core/skbuff.c 2005-06-18 
03:48:29.000000000 +0800 ++++ linux-2.6.12.6/net/core/skbuff.c 2006-03-16 17:04:41.000000000 +0800 +@@ -159,6 +159,9 @@ + skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->frag_list = NULL; ++ skb_shinfo(skb)->zccd = NULL; /* skbuffs kick off with NO user zero copy descriptors */ ++ skb_shinfo(skb)->zccd2 = NULL; ++ + out: + return skb; + nodata: +@@ -247,6 +250,10 @@ + if (!skb->cloned || + !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, + &skb_shinfo(skb)->dataref)) { ++ if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */ ++ zccd_put (skb_shinfo(skb)->zccd); /* release hold */ ++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */ ++ zccd_put (skb_shinfo(skb)->zccd2); /* release hold */ + if (skb_shinfo(skb)->nr_frags) { + int i; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) +@@ -529,6 +536,14 @@ + n->data_len = skb->data_len; + n->len = skb->len; + ++ if (skb_shinfo(skb)->zccd != NULL) /* user zero copy descriptor? */ ++ zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */ ++ skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd; ++ ++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd user zero copy descriptor? */ ++ zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */ ++ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2; ++ + if (skb_shinfo(skb)->nr_frags) { + int i; + +@@ -571,6 +586,9 @@ + u8 *data; + int size = nhead + (skb->end - skb->head) + ntail; + long off; ++ zccd_t *zccd = skb_shinfo(skb)->zccd; /* stash user zero copy descriptor */ ++ zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */ ++ + + if (skb_shared(skb)) + BUG(); +@@ -592,6 +610,11 @@ + if (skb_shinfo(skb)->frag_list) + skb_clone_fraglist(skb); + ++ if (zccd != NULL) /* user zero copy descriptor? */ ++ zccd_get (zccd); /* extra ref (pages are shared) */ ++ if (zccd2 != NULL) /* 2nd user zero copy descriptor? 
*/ ++ zccd_get (zccd2); /* extra ref (pages are shared) */ ++ + skb_release_data(skb); + + off = (data + nhead) - skb->head; +@@ -606,6 +629,8 @@ + skb->cloned = 0; + skb->nohdr = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); ++ skb_shinfo(skb)->zccd = zccd; ++ skb_shinfo(skb)->zccd2 = zccd2; + return 0; + + nodata: +diff -Nur linux-2.6.12.6-orig/net/ipv4/tcp.c linux-2.6.12.6/net/ipv4/tcp.c +--- linux-2.6.12.6-orig/net/ipv4/tcp.c 2005-06-18 03:48:29.000000000 +0800 ++++ linux-2.6.12.6/net/ipv4/tcp.c 2006-03-16 17:04:57.000000000 +0800 +@@ -630,8 +630,10 @@ + } + } + ++/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */ + static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, +- size_t psize, int flags) ++ size_t psize, int flags, zccd_t *zccd) ++ + { + struct tcp_sock *tp = tcp_sk(sk); + int mss_now; +@@ -678,6 +680,17 @@ + copy = size; + + i = skb_shinfo(skb)->nr_frags; ++ ++ if (zccd != NULL && /* this is a zcc I/O */ ++ skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */ ++ skb_shinfo(skb)->zccd2 != NULL && ++ skb_shinfo(skb)->zccd != zccd && /* not the same one */ ++ skb_shinfo(skb)->zccd2 != zccd) ++ { ++ tcp_mark_push (tp, skb); ++ goto new_segment; ++ } ++ + can_coalesce = skb_can_coalesce(skb, i, page, offset); + if (!can_coalesce && i >= MAX_SKB_FRAGS) { + tcp_mark_push(tp, skb); +@@ -694,6 +707,20 @@ + skb_fill_page_desc(skb, i, page, offset, copy); + } + ++ if (zccd != NULL && /* this is a zcc I/O */ ++ skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */ ++ skb_shinfo(skb)->zccd2 != zccd) ++ { ++ zccd_get (zccd); /* bump ref count */ ++ ++ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL); ++ ++ if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */ ++ skb_shinfo(skb)->zccd = zccd; ++ else ++ skb_shinfo(skb)->zccd2 = zccd; ++ } ++ + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; +@@ -762,12 +789,37 @@ + + lock_sock(sk); + TCP_CHECK_TIMER(sk); +- res 
= do_tcp_sendpages(sk, &page, offset, size, flags); ++ res = do_tcp_sendpages(sk, &page, offset, size, flags,NULL); ++ TCP_CHECK_TIMER(sk); ++ release_sock(sk); ++ return res; ++} ++ ++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, zccd_t *zccd) ++{ ++ ssize_t res; ++ struct sock *sk = sock->sk; ++ ++#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) ++ ++ if (!(sk->sk_route_caps & NETIF_F_SG) || /* caller shouldn't waste her time */ ++ !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */ ++ BUG (); ++ ++#undef TCP_ZC_CSUM_FLAGS ++ ++ lock_sock(sk); ++ TCP_CHECK_TIMER(sk); ++ ++ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd); ++ + TCP_CHECK_TIMER(sk); + release_sock(sk); + return res; + } + ++ + #define TCP_PAGE(sk) (sk->sk_sndmsg_page) + #define TCP_OFF(sk) (sk->sk_sndmsg_off) + +@@ -1530,6 +1582,202 @@ + goto out; + } + ++int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets, ++ int len, int nonblock) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ int copied; ++ long timeo; ++ ++ BUG_TRAP (len > 0); ++ /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/ ++ ++ lock_sock(sk); ++ ++ TCP_CHECK_TIMER(sk); ++ ++ copied = -ENOTCONN; ++ if (sk->sk_state == TCP_LISTEN) ++ goto out; ++ ++ copied = 0; ++ timeo = sock_rcvtimeo(sk, nonblock); ++ ++ do { ++ struct sk_buff * skb; ++ u32 offset; ++ unsigned long used; ++ int exhausted; ++ int eaten; ++ ++ /* Are we at urgent data? Stop if we have read anything. */ ++ if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq) ++ break; ++ ++ /* We need to check signals first, to get correct SIGURG ++ * handling. FIXME: Need to check this doesnt impact 1003.1g ++ * and move it down to the bottom of the loop ++ */ ++ if (signal_pending(current)) { ++ if (copied) ++ break; ++ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; ++ break; ++ } ++ ++ /* Next get a buffer. 
*/ ++ ++ skb = skb_peek(&sk->sk_receive_queue); ++ ++ if (skb == NULL) /* nothing ready */ ++ { ++ if (copied) { ++ if (sk->sk_err || ++ sk->sk_state == TCP_CLOSE || ++ (sk->sk_shutdown & RCV_SHUTDOWN) || ++ !timeo || ++ (0)) ++ break; ++ } else { ++ if (sock_flag(sk, SOCK_DONE)) ++ break; ++ ++ if (sk->sk_err) { ++ copied = sock_error(sk); ++ break; ++ } ++ ++ if (sk->sk_shutdown & RCV_SHUTDOWN) ++ break; ++ ++ if (sk->sk_state == TCP_CLOSE) { ++ if (!(sock_flag(sk, SOCK_DONE))) { ++ /* This occurs when user tries to read ++ * from never connected socket. ++ */ ++ copied = -ENOTCONN; ++ break; ++ } ++ break; ++ } ++ ++ if (!timeo) { ++ copied = -EAGAIN; ++ break; ++ } ++ } ++ ++ cleanup_rbuf(sk, copied); ++ sk_wait_data(sk, &timeo); ++ continue; ++ } ++ ++ BUG_TRAP (atomic_read (&skb->users) == 1); ++ ++ exhausted = eaten = 0; ++ ++ offset = tp->copied_seq - TCP_SKB_CB(skb)->seq; ++ if (skb->h.th->syn) ++ offset--; ++ ++ used = skb->len - offset; ++ ++ if (tp->urg_data) { ++ u32 urg_offset = tp->urg_seq - tp->copied_seq; ++ if (urg_offset < used) { ++ if (!urg_offset) { /* at urgent date */ ++ if (!(sock_flag(sk, SOCK_URGINLINE))) { ++ tp->copied_seq++; /* discard the single byte of urgent data */ ++ offset++; ++ used--; ++ } ++ } else /* truncate read */ ++ used = urg_offset; ++ } ++ } ++ ++ BUG_TRAP (used >= 0); ++ if (len < used) ++ used = len; ++ ++ if (used == 0) ++ exhausted = 1; ++ else ++ { ++ if (skb_is_nonlinear (skb)) ++ { ++ int rc = skb_linearize (skb, GFP_KERNEL); ++ ++ printk ("tcp_recvpackets(): linearising: %d\n", rc); ++ ++ if (rc) ++ { ++ if (!copied) ++ copied = rc; ++ break; ++ } ++ } ++ ++ if ((offset + used) == skb->len) /* consuming the whole packet */ ++ { ++ __skb_unlink (skb, &sk->sk_receive_queue); ++ dst_release (skb->dst); ++ skb_orphan (skb); ++ __skb_pull (skb, offset); ++ __skb_queue_tail (packets, skb); ++ exhausted = eaten = 1; ++ } ++ else /* consuming only part of the packet */ ++ { ++ struct sk_buff *skb2 = skb_clone (skb, 
GFP_KERNEL); ++ ++ if (skb2 == NULL) ++ { ++ if (!copied) ++ copied = -ENOMEM; ++ break; ++ } ++ ++ dst_release (skb2->dst); ++ __skb_pull (skb2, offset); ++ __skb_trim (skb2, used); ++ __skb_queue_tail (packets, skb2); ++ } ++ ++ tp->copied_seq += used; ++ copied += used; ++ len -= used; ++ } ++ ++ if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) { ++ tp->urg_data = 0; ++ tcp_fast_path_check(sk, tp); ++ } ++ ++ if (!exhausted) ++ continue; ++ ++ if (skb->h.th->fin) ++ { ++ tp->copied_seq++; ++ if (!eaten) ++ sk_eat_skb (sk, skb); ++ break; ++ } ++ ++ if (!eaten) ++ sk_eat_skb (sk, skb); ++ ++ } while (len > 0); ++ ++ out: ++ /* Clean up data we have read: This will do ACK frames. */ ++ cleanup_rbuf(sk, copied); ++ TCP_CHECK_TIMER(sk); ++ release_sock(sk); ++ return copied; ++} ++ + /* + * State processing on a close. This implements the state shift for + * sending our FIN frame. Note that we only send a FIN for some +@@ -2380,6 +2628,8 @@ + EXPORT_SYMBOL(tcp_recvmsg); + EXPORT_SYMBOL(tcp_sendmsg); + EXPORT_SYMBOL(tcp_sendpage); ++EXPORT_SYMBOL(tcp_sendpage_zccd); ++EXPORT_SYMBOL(tcp_recvpackets); + EXPORT_SYMBOL(tcp_setsockopt); + EXPORT_SYMBOL(tcp_shutdown); + EXPORT_SYMBOL(tcp_statistics); diff --git a/lustre/kernel_patches/patches/tcp-zero-copy-2.6.5-7.244.patch b/lustre/kernel_patches/patches/tcp-zero-copy-2.6.5-7.244.patch new file mode 100644 index 0000000..06baac2 --- /dev/null +++ b/lustre/kernel_patches/patches/tcp-zero-copy-2.6.5-7.244.patch @@ -0,0 +1,545 @@ +diff -Nur linux-2.6.5-7.244-orig/include/linux/skbuff.h linux-2.6.5-7.244/include/linux/skbuff.h +--- linux-2.6.5-7.244-orig/include/linux/skbuff.h 2005-12-13 07:50:31.000000000 +0800 ++++ linux-2.6.5-7.244/include/linux/skbuff.h 2006-03-13 16:31:30.000000000 +0800 +@@ -135,6 +135,30 @@ + __u16 size; + }; + ++/* Support for callback when skb data has been released */ ++typedef struct zccd /* Zero Copy Callback Descriptor */ ++{ /* (embed as first member of custom struct) */ ++ atomic_t 
zccd_count; /* reference count */ ++ void (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */ ++} zccd_t; ++ ++static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *)) ++{ ++ atomic_set (&d->zccd_count, 1); ++ d->zccd_destructor = callback; ++} ++ ++static inline void zccd_get (zccd_t *d) /* take a reference */ ++{ ++ atomic_inc (&d->zccd_count); ++} ++ ++static inline void zccd_put (zccd_t *d) /* release a reference */ ++{ ++ if (atomic_dec_and_test (&d->zccd_count)) ++ (d->zccd_destructor)(d); ++} ++ + /* This data is invariant across clones and lives at + * the end of the header data, ie. at skb->end. + */ +@@ -144,6 +168,12 @@ + unsigned short tso_size; + unsigned short tso_segs; + struct sk_buff *frag_list; ++ zccd_t *zccd; /* zero copy descriptor */ ++ zccd_t *zccd2; /* 2nd zero copy descriptor */ ++ /* NB we expect zero-copy data to be at least 1 packet, so ++ * having 2 zccds means we don't unneccessarily split the packet ++ * where consecutive zero-copy sends abutt. ++ */ + skb_frag_t frags[MAX_SKB_FRAGS]; + }; + +diff -Nur linux-2.6.5-7.244-orig/include/net/sock.h linux-2.6.5-7.244/include/net/sock.h +--- linux-2.6.5-7.244-orig/include/net/sock.h 2005-12-13 07:50:33.000000000 +0800 ++++ linux-2.6.5-7.244/include/net/sock.h 2006-03-13 16:32:36.000000000 +0800 +@@ -413,6 +413,18 @@ + (__skb)->next = NULL; \ + } while(0) + ++#define sk_wait_event(__sk, __timeo, __condition) \ ++({ int rc; \ ++ release_sock(__sk); \ ++ rc = __condition; \ ++ if (!rc) { \ ++ *(__timeo) = schedule_timeout(*(__timeo)); \ ++ rc = __condition; \ ++ } \ ++ lock_sock(__sk); \ ++ rc; \ ++}) ++ + /* IP protocol blocks we attach to sockets. 
+ * socket layer -> transport layer interface + * transport -> network interface is defined by struct inet_proto +@@ -1037,6 +1049,20 @@ + sk->sk_stamp = *stamp; + } + ++/** ++ * sk_eat_skb - Release a skb if it is no longer needed ++ * @sk - socket to eat this skb from ++ * @skb - socket buffer to eat ++ * ++ * This routine must be called with interrupts disabled or with the socket ++ * locked so that the sk_buff queue operation is ok. ++*/ ++static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) ++{ ++ __skb_unlink(skb, &sk->sk_receive_queue); ++ __kfree_skb(skb); ++} ++ + extern atomic_t netstamp_needed; + extern void sock_enable_timestamp(struct sock *sk); + extern void sock_disable_timestamp(struct sock *sk); +diff -Nur linux-2.6.5-7.244-orig/include/net/tcp.h linux-2.6.5-7.244/include/net/tcp.h +--- linux-2.6.5-7.244-orig/include/net/tcp.h 2005-12-13 07:50:21.000000000 +0800 ++++ linux-2.6.5-7.244/include/net/tcp.h 2006-03-13 16:31:37.000000000 +0800 +@@ -764,6 +764,9 @@ + extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size); + extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); ++extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, zccd_t *zccd); ++ + + extern int tcp_ioctl(struct sock *sk, + int cmd, +@@ -861,6 +864,10 @@ + size_t len, int nonblock, + int flags, int *addr_len); + ++extern int tcp_recvpackets(struct sock *sk, ++ struct sk_buff_head *packets, ++ int len, int nonblock); ++ + extern int tcp_listen_start(struct sock *sk); + + extern void tcp_parse_options(struct sk_buff *skb, +diff -Nur linux-2.6.5-7.244-orig/net/core/dev.c linux-2.6.5-7.244/net/core/dev.c +--- linux-2.6.5-7.244-orig/net/core/dev.c 2005-12-13 07:50:38.000000000 +0800 ++++ linux-2.6.5-7.244/net/core/dev.c 2006-03-13 16:31:56.000000000 +0800 +@@ -1322,6 +1322,9 @@ + ninfo->tso_segs = skb_shinfo(skb)->tso_segs; + 
ninfo->nr_frags = 0; + ninfo->frag_list = NULL; ++ ninfo->zccd = NULL; /* copied data => no user zero copy descriptor */ ++ ninfo->zccd2 = NULL; ++ + + /* Offset between the two in bytes */ + offset = data - skb->head; +diff -Nur linux-2.6.5-7.244-orig/net/core/skbuff.c linux-2.6.5-7.244/net/core/skbuff.c +--- linux-2.6.5-7.244-orig/net/core/skbuff.c 2004-04-04 11:37:37.000000000 +0800 ++++ linux-2.6.5-7.244/net/core/skbuff.c 2006-03-13 16:31:46.000000000 +0800 +@@ -152,6 +152,9 @@ + skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->frag_list = NULL; ++ skb_shinfo(skb)->zccd = NULL; /* skbuffs kick off with NO user zero copy descriptors */ ++ skb_shinfo(skb)->zccd2 = NULL; ++ + out: + return skb; + nodata: +@@ -186,6 +189,10 @@ + { + if (!skb->cloned || + atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) { ++ if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */ ++ zccd_put (skb_shinfo(skb)->zccd); /* release hold */ ++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */ ++ zccd_put (skb_shinfo(skb)->zccd2); /* release hold */ + if (skb_shinfo(skb)->nr_frags) { + int i; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) +@@ -449,6 +456,14 @@ + n->data_len = skb->data_len; + n->len = skb->len; + ++ if (skb_shinfo(skb)->zccd != NULL) /* user zero copy descriptor? */ ++ zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */ ++ skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd; ++ ++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd user zero copy descriptor? 
*/ ++ zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */ ++ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2; ++ + if (skb_shinfo(skb)->nr_frags) { + int i; + +@@ -493,6 +508,9 @@ + u8 *data; + int size = nhead + (skb->end - skb->head) + ntail; + long off; ++ zccd_t *zccd = skb_shinfo(skb)->zccd; /* stash user zero copy descriptor */ ++ zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */ ++ + + if (skb_shared(skb)) + BUG(); +@@ -514,6 +532,11 @@ + if (skb_shinfo(skb)->frag_list) + skb_clone_fraglist(skb); + ++ if (zccd != NULL) /* user zero copy descriptor? */ ++ zccd_get (zccd); /* extra ref (pages are shared) */ ++ if (zccd2 != NULL) /* 2nd user zero copy descriptor? */ ++ zccd_get (zccd2); /* extra ref (pages are shared) */ ++ + skb_release_data(skb); + + off = (data + nhead) - skb->head; +@@ -527,6 +550,9 @@ + skb->nh.raw += off; + skb->cloned = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); ++ skb_shinfo(skb)->zccd = zccd; ++ skb_shinfo(skb)->zccd2 = zccd2; ++ + return 0; + + nodata: +diff -Nur linux-2.6.5-7.244-orig/net/core/sock.c linux-2.6.5-7.244/net/core/sock.c +--- linux-2.6.5-7.244-orig/net/core/sock.c 2005-12-13 07:50:10.000000000 +0800 ++++ linux-2.6.5-7.244/net/core/sock.c 2006-03-13 16:32:44.000000000 +0800 +@@ -917,6 +917,31 @@ + } while((skb = sk->sk_backlog.head) != NULL); + } + ++/** ++ * sk_wait_data - wait for data to arrive at sk_receive_queue ++ * sk - sock to wait on ++ * timeo - for how long ++ * ++ * Now socket state including sk->sk_err is changed only under lock, ++ * hence we may omit checks after joining wait queue. ++ * We check receive queue before schedule() only as optimization; ++ * it is very likely that release_sock() added new data. 
++ */ ++int sk_wait_data(struct sock *sk, long *timeo) ++{ ++ int rc; ++ DEFINE_WAIT(wait); ++ ++ prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); ++ set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); ++ rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue)); ++ clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); ++ finish_wait(sk->sk_sleep, &wait); ++ return rc; ++} ++ ++EXPORT_SYMBOL(sk_wait_data); ++ + /* + * Set of default routines for initialising struct proto_ops when + * the protocol does not support a particular function. In certain +diff -Nur linux-2.6.5-7.244-orig/net/ipv4/tcp.c linux-2.6.5-7.244/net/ipv4/tcp.c +--- linux-2.6.5-7.244-orig/net/ipv4/tcp.c 2005-12-13 07:50:28.000000000 +0800 ++++ linux-2.6.5-7.244/net/ipv4/tcp.c 2006-03-13 16:32:04.000000000 +0800 +@@ -799,7 +799,7 @@ + } + + ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, +- size_t psize, int flags); ++ size_t psize, int flags,zccd_t *zccd); + + static inline int can_coalesce(struct sk_buff *skb, int i, struct page *page, + int off) +@@ -881,8 +881,9 @@ + return err; + } + ++/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */ + ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, +- size_t psize, int flags) ++ size_t psize, int flags,zccd_t *zccd) + { + struct tcp_opt *tp = tcp_sk(sk); + int mss_now; +@@ -929,6 +930,17 @@ + copy = size; + + i = skb_shinfo(skb)->nr_frags; ++ ++ if (zccd != NULL && /* this is a zcc I/O */ ++ skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */ ++ skb_shinfo(skb)->zccd2 != NULL && ++ skb_shinfo(skb)->zccd != zccd && /* not the same one */ ++ skb_shinfo(skb)->zccd2 != zccd) ++ { ++ tcp_mark_push (tp, skb); ++ goto new_segment; ++ } ++ + if (can_coalesce(skb, i, page, offset)) { + skb_shinfo(skb)->frags[i - 1].size += copy; + } else if (i < MAX_SKB_FRAGS) { +@@ -939,6 +951,20 @@ + goto new_segment; + } + ++ if (zccd != NULL && /* this is a zcc I/O 
*/ ++ skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */ ++ skb_shinfo(skb)->zccd2 != zccd) ++ { ++ zccd_get (zccd); /* bump ref count */ ++ ++ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL); ++ ++ if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */ ++ skb_shinfo(skb)->zccd = zccd; ++ else ++ skb_shinfo(skb)->zccd2 = zccd; ++ } ++ + skb->len += copy; + skb->data_len += copy; + skb->ip_summed = CHECKSUM_HW; +@@ -1003,12 +1029,36 @@ + + lock_sock(sk); + TCP_CHECK_TIMER(sk); +- res = do_tcp_sendpages(sk, &page, offset, size, flags); ++ res = do_tcp_sendpages(sk, &page, offset, size, flags,NULL); + TCP_CHECK_TIMER(sk); + release_sock(sk); + return res; + } + ++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, zccd_t *zccd) ++{ ++ ssize_t res; ++ struct sock *sk = sock->sk; ++ ++#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) ++ ++ if (!(sk->sk_route_caps & NETIF_F_SG) || /* caller shouldn't waste her time */ ++ !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */ ++ BUG (); ++ ++#undef TCP_ZC_CSUM_FLAGS ++ ++ lock_sock(sk); ++ TCP_CHECK_TIMER(sk); ++ ++ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd); ++ TCP_CHECK_TIMER(sk); ++ release_sock(sk); ++ return res; ++} ++ ++ + #define TCP_PAGE(sk) (inet_sk(sk)->sndmsg_page) + #define TCP_OFF(sk) (inet_sk(sk)->sndmsg_off) + +@@ -1849,6 +1899,202 @@ + err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); + goto out; + } ++ ++int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets, ++int len, int nonblock) ++{ ++ struct tcp_opt *tp = tcp_sk(sk); ++ int copied; ++ long timeo; ++ ++ BUG_TRAP (len > 0); ++ /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/ ++ ++ lock_sock(sk); ++ ++ TCP_CHECK_TIMER(sk); ++ ++ copied = -ENOTCONN; ++ if (sk->sk_state == TCP_LISTEN) ++ goto out; ++ ++ copied = 0; ++ timeo = sock_rcvtimeo(sk, nonblock); ++ ++ do { ++ struct sk_buff * skb; 
++ u32 offset; ++ unsigned long used; ++ int exhausted; ++ int eaten; ++ ++ /* Are we at urgent data? Stop if we have read anything. */ ++ if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq) ++ break; ++ ++ /* We need to check signals first, to get correct SIGURG ++ * handling. FIXME: Need to check this doesnt impact 1003.1g ++ * and move it down to the bottom of the loop ++ */ ++ if (signal_pending(current)) { ++ if (copied) ++ break; ++ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; ++ break; ++ } ++ ++ /* Next get a buffer. */ ++ ++ skb = skb_peek(&sk->sk_receive_queue); ++ ++ if (skb == NULL) /* nothing ready */ ++ { ++ if (copied) { ++ if (sk->sk_err || ++ sk->sk_state == TCP_CLOSE || ++ (sk->sk_shutdown & RCV_SHUTDOWN) || ++ !timeo || ++ (0)) ++ break; ++ } else { ++ if (sock_flag(sk, SOCK_DONE)) ++ break; ++ ++ if (sk->sk_err) { ++ copied = sock_error(sk); ++ break; ++ } ++ ++ if (sk->sk_shutdown & RCV_SHUTDOWN) ++ break; ++ ++ if (sk->sk_state == TCP_CLOSE) { ++ if (!(sock_flag(sk, SOCK_DONE))) { ++ /* This occurs when user tries to read ++ * from never connected socket. 
++ */ ++ copied = -ENOTCONN; ++ break; ++ } ++ break; ++ } ++ ++ if (!timeo) { ++ copied = -EAGAIN; ++ break; ++ } ++ } ++ ++ cleanup_rbuf(sk, copied); ++ sk_wait_data(sk, &timeo); ++ continue; ++ } ++ ++ BUG_TRAP (atomic_read (&skb->users) == 1); ++ ++ exhausted = eaten = 0; ++ ++ offset = tp->copied_seq - TCP_SKB_CB(skb)->seq; ++ if (skb->h.th->syn) ++ offset--; ++ ++ used = skb->len - offset; ++ ++ if (tp->urg_data) { ++ u32 urg_offset = tp->urg_seq - tp->copied_seq; ++ if (urg_offset < used) { ++ if (!urg_offset) { /* at urgent date */ ++ if (!(sock_flag(sk, SOCK_URGINLINE))) { ++ tp->copied_seq++; /* discard the single byte of urgent data */ ++ offset++; ++ used--; ++ } ++ } else /* truncate read */ ++ used = urg_offset; ++ } ++ } ++ ++ BUG_TRAP (used >= 0); ++ if (len < used) ++ used = len; ++ ++ if (used == 0) ++ exhausted = 1; ++ else ++ { ++ if (skb_is_nonlinear (skb)) ++ { ++ int rc = skb_linearize (skb, GFP_KERNEL); ++ ++ printk ("tcp_recvpackets(): linearising: %d\n", rc); ++ ++ if (rc) ++ { ++ if (!copied) ++ copied = rc; ++ break; ++ } ++ } ++ ++ if ((offset + used) == skb->len) /* consuming the whole packet */ ++ { ++ __skb_unlink (skb, &sk->sk_receive_queue); ++ dst_release (skb->dst); ++ skb_orphan (skb); ++ __skb_pull (skb, offset); ++ __skb_queue_tail (packets, skb); ++ exhausted = eaten = 1; ++ } ++ else /* consuming only part of the packet */ ++ { ++ struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL); ++ ++ if (skb2 == NULL) ++ { ++ if (!copied) ++ copied = -ENOMEM; ++ break; ++ } ++ ++ dst_release (skb2->dst); ++ __skb_pull (skb2, offset); ++ __skb_trim (skb2, used); ++ __skb_queue_tail (packets, skb2); ++ } ++ ++ tp->copied_seq += used; ++ copied += used; ++ len -= used; ++ } ++ ++ if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) { ++ tp->urg_data = 0; ++ tcp_fast_path_check(sk, tp); ++ } ++ ++ if (!exhausted) ++ continue; ++ ++ if (skb->h.th->fin) ++ { ++ tp->copied_seq++; ++ if (!eaten) ++ sk_eat_skb (sk, skb); ++ break; ++ } ++ ++ if 
(!eaten) ++ sk_eat_skb (sk, skb); ++ ++ } while (len > 0); ++ ++ out: ++ /* Clean up data we have read: This will do ACK frames. */ ++ cleanup_rbuf(sk, copied); ++ TCP_CHECK_TIMER(sk); ++ release_sock(sk); ++ return copied; ++} + + /* + * State processing on a close. This implements the state shift for +@@ -2872,6 +3118,8 @@ + EXPORT_SYMBOL(tcp_recvmsg); + EXPORT_SYMBOL(tcp_sendmsg); + EXPORT_SYMBOL(tcp_sendpage); ++EXPORT_SYMBOL(tcp_sendpage_zccd); ++EXPORT_SYMBOL(tcp_recvpackets); + EXPORT_SYMBOL(tcp_setsockopt); + EXPORT_SYMBOL(tcp_shutdown); + EXPORT_SYMBOL(tcp_sockets_allocated); diff --git a/lustre/kernel_patches/patches/tcp-zero-copy-2.6.9-rhel4.patch b/lustre/kernel_patches/patches/tcp-zero-copy-2.6.9-rhel4.patch index 799b89f1..2b6a0da 100644 --- a/lustre/kernel_patches/patches/tcp-zero-copy-2.6.9-rhel4.patch +++ b/lustre/kernel_patches/patches/tcp-zero-copy-2.6.9-rhel4.patch @@ -214,8 +214,8 @@ + +#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) + -+ if (!(sk->sk_route_caps & NETIF_F_SG) || /* caller shouldn't waste her time */ -+ !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */ ++ if (!(sk->sk_route_caps & NETIF_F_SG) || /* caller shouldn't waste */ ++ !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))/* time on double mapping */ + BUG (); + +#undef TCP_ZC_CSUM_FLAGS diff --git a/lustre/kernel_patches/patches/vfs_intent-2.6-fc3.patch b/lustre/kernel_patches/patches/vfs_intent-2.6-fc3.patch new file mode 100644 index 0000000..694d097 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs_intent-2.6-fc3.patch @@ -0,0 +1,773 @@ +Index: linux-2.6.10/fs/exec.c +=================================================================== +--- linux-2.6.10.orig/fs/exec.c ++++ linux-2.6.10/fs/exec.c +@@ -124,9 +124,10 @@ asmlinkage long sys_uselib(const char __ + struct file * file; + struct nameidata nd; + int error; ++ intent_init(&nd.intent, IT_OPEN); + +- nd.intent.open.flags = FMODE_READ; +- error = __user_walk(library, 
LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); ++ nd.intent.it_flags = FMODE_READ|FMODE_EXEC; ++ error = __user_walk_it(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); + if (error) + goto out; + +@@ -138,7 +139,7 @@ asmlinkage long sys_uselib(const char __ + if (error) + goto exit; + +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; +@@ -485,8 +486,9 @@ struct file *open_exec(const char *name) + int err; + struct file *file; + +- nd.intent.open.flags = FMODE_READ; +- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); ++ intent_init(&nd.intent, IT_OPEN); ++ nd.intent.it_flags = FMODE_READ|FMODE_EXEC; ++ err = path_lookup(name, LOOKUP_FOLLOW, &nd); + file = ERR_PTR(err); + + if (!err) { +@@ -499,7 +501,7 @@ struct file *open_exec(const char *name) + err = -EACCES; + file = ERR_PTR(err); + if (!err) { +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent); + if (!IS_ERR(file)) { + err = deny_write_access(file); + if (err) { +Index: linux-2.6.10/fs/inode.c +=================================================================== +--- linux-2.6.10.orig/fs/inode.c ++++ linux-2.6.10/fs/inode.c +@@ -233,6 +233,7 @@ void __iget(struct inode * inode) + inodes_stat.nr_unused--; + } + ++EXPORT_SYMBOL(__iget); + /** + * clear_inode - clear an inode + * @inode: inode to clear +Index: linux-2.6.10/fs/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/namei.c ++++ linux-2.6.10/fs/namei.c +@@ -288,8 +288,19 @@ int deny_write_access(struct file * file + return 0; + } + ++void intent_release(struct lookup_intent *it) ++{ ++ if (!it) ++ return; ++ if (it->it_magic != INTENT_MAGIC) ++ return; ++ if (it->it_op_release) ++ it->it_op_release(it); ++} ++ + void path_release(struct nameidata *nd) + { ++ intent_release(&nd->intent); + dput(nd->dentry); + mntput(nd->mnt); + 
} +@@ -379,7 +390,10 @@ static struct dentry * real_lookup(struc + { + struct dentry * result; + struct inode *dir = parent->d_inode; ++ int counter = 0; + ++again: ++ counter++; + down(&dir->i_sem); + /* + * First re-do the cached lookup just in case it was created +@@ -418,7 +432,10 @@ static struct dentry * real_lookup(struc + if (result->d_op && result->d_op->d_revalidate) { + if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) { + dput(result); +- result = ERR_PTR(-ENOENT); ++ if (counter > 10) ++ result = ERR_PTR(-ESTALE); ++ if (!IS_ERR(result)) ++ goto again; + } + } + return result; +@@ -448,7 +465,9 @@ walk_init_root(const char *name, struct + static inline int __vfs_follow_link(struct nameidata *nd, const char *link) + { + int res = 0; ++ struct lookup_intent it = nd->intent; + char *name; ++ + if (IS_ERR(link)) + goto fail; + +@@ -458,6 +477,9 @@ static inline int __vfs_follow_link(stru + /* weird __emul_prefix() stuff did it */ + goto out; + } ++ intent_init(&nd->intent, it.it_op); ++ nd->intent.it_flags = it.it_flags; ++ nd->intent.it_create_mode = it.it_create_mode; + res = link_path_walk(link, nd); + out: + if (nd->depth || res || nd->last_type!=LAST_NORM) +@@ -666,6 +688,33 @@ fail: + return PTR_ERR(dentry); + } + ++static int revalidate_special(struct nameidata *nd) ++{ ++ struct dentry *dentry = nd->dentry; ++ int err, counter = 0; ++ ++ revalidate_again: ++ if (!dentry->d_op || !dentry->d_op->d_revalidate) ++ return 0; ++ if (!dentry->d_op->d_revalidate(dentry, nd)) { ++ struct dentry *new; ++ if ((err = permission(dentry->d_parent->d_inode, MAY_EXEC, nd))) ++ return err; ++ new = real_lookup(dentry->d_parent, &dentry->d_name, nd); ++ if (IS_ERR(new)) ++ return PTR_ERR(new); ++ d_invalidate(dentry); ++ dput(dentry); ++ nd->dentry = dentry = new; ++ counter++; ++ if (counter < 10) ++ goto revalidate_again; ++ printk("excessive revalidate_it loops\n"); ++ return -ESTALE; ++ } ++ return 0; ++} ++ + /* + * Name resolution. 
+ * +@@ -767,8 +816,12 @@ int fastcall link_path_walk(const char * + goto out_dput; + + if (inode->i_op->follow_link) { ++ int save_flags = nd->flags; + mntget(next.mnt); ++ nd->flags |= LOOKUP_LINK_NOTLAST; + err = do_follow_link(next.dentry, nd); ++ if (!(save_flags & LOOKUP_LINK_NOTLAST)) ++ nd->flags &= ~LOOKUP_LINK_NOTLAST; + dput(next.dentry); + mntput(next.mnt); + if (err) +@@ -807,14 +860,34 @@ last_component: + inode = nd->dentry->d_inode; + /* fallthrough */ + case 1: ++ nd->flags |= LOOKUP_LAST; ++ err = revalidate_special(nd); ++ nd->flags &= ~LOOKUP_LAST; ++ if (!nd->dentry->d_inode) ++ err = -ENOENT; ++ if (err) { ++ path_release(nd); ++ goto return_err; ++ } ++ if (lookup_flags & LOOKUP_DIRECTORY) { ++ err = -ENOTDIR; ++ if (!nd->dentry->d_inode->i_op || ++ !nd->dentry->d_inode->i_op->lookup){ ++ path_release(nd); ++ goto return_err; ++ } ++ } + goto return_reval; + } ++ + if (nd->dentry->d_op && nd->dentry->d_op->d_hash) { + err = nd->dentry->d_op->d_hash(nd->dentry, &this); + if (err < 0) + break; + } ++ nd->flags |= LOOKUP_LAST; + err = do_lookup(nd, &this, &next, atomic); ++ nd->flags &= ~LOOKUP_LAST; + if (err) + break; + follow_mount(&next.mnt, &next.dentry); +@@ -1032,7 +1105,7 @@ struct dentry * lookup_hash(struct qstr + } + + /* SMP-safe */ +-struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) ++struct dentry * lookup_one_len_it(const char * name, struct dentry * base, int len, struct nameidata *nd) + { + unsigned long hash; + struct qstr this; +@@ -1052,11 +1125,16 @@ struct dentry * lookup_one_len(const cha + } + this.hash = end_name_hash(hash); + +- return lookup_hash(&this, base); ++ return __lookup_hash(&this, base, nd); + access: + return ERR_PTR(-EACCES); + } + ++struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) ++{ ++ return lookup_one_len_it(name, base, len, NULL); ++} ++ + /* + * namei() + * +@@ -1068,7 +1146,7 @@ access: + * that namei follows links, while lnamei does 
not. + * SMP-safe + */ +-int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) ++int fastcall __user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd) + { + char *tmp = getname(name); + int err = PTR_ERR(tmp); +@@ -1080,6 +1158,12 @@ int fastcall __user_walk(const char __us + return err; + } + ++int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) ++{ ++ intent_init(&nd->intent, IT_LOOKUP); ++ return __user_walk_it(name, flags, nd); ++} ++ + /* + * It's inline, so penalty for filesystems that don't use sticky bit is + * minimal. +@@ -1363,8 +1447,8 @@ int open_namei(const char * pathname, in + acc_mode |= MAY_APPEND; + + /* Fill in the open() intent data */ +- nd->intent.open.flags = flag; +- nd->intent.open.create_mode = mode; ++ nd->intent.it_flags = flag; ++ nd->intent.it_create_mode = mode; + + /* + * The simplest case - just a plain lookup. +@@ -1379,6 +1463,7 @@ int open_namei(const char * pathname, in + /* + * Create - we need to know the parent. 
+ */ ++ nd->intent.it_op |= IT_CREAT; + error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd); + if (error) + return error; +@@ -1395,7 +1480,9 @@ int open_namei(const char * pathname, in + dir = nd->dentry; + nd->flags &= ~LOOKUP_PARENT; + down(&dir->d_inode->i_sem); ++ nd->flags |= LOOKUP_LAST; + dentry = __lookup_hash(&nd->last, nd->dentry, nd); ++ nd->flags &= ~LOOKUP_LAST; + + do_last: + error = PTR_ERR(dentry); +@@ -1508,7 +1595,9 @@ do_link: + } + dir = nd->dentry; + down(&dir->d_inode->i_sem); ++ nd->flags |= LOOKUP_LAST; + dentry = __lookup_hash(&nd->last, nd->dentry, nd); ++ nd->flags &= ~LOOKUP_LAST; + putname(nd->last.name); + goto do_last; + } +Index: linux-2.6.10/fs/namespace.c +=================================================================== +--- linux-2.6.10.orig/fs/namespace.c ++++ linux-2.6.10/fs/namespace.c +@@ -62,6 +62,7 @@ struct vfsmount *alloc_vfsmnt(const char + INIT_LIST_HEAD(&mnt->mnt_mounts); + INIT_LIST_HEAD(&mnt->mnt_list); + INIT_LIST_HEAD(&mnt->mnt_fslink); ++ INIT_LIST_HEAD(&mnt->mnt_lustre_list); + if (name) { + int size = strlen(name)+1; + char *newname = kmalloc(size, GFP_KERNEL); +@@ -113,6 +114,7 @@ static inline int check_mnt(struct vfsmo + + static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd) + { ++ memset(old_nd, 0, sizeof(*old_nd)); + old_nd->dentry = mnt->mnt_mountpoint; + old_nd->mnt = mnt->mnt_parent; + mnt->mnt_parent = mnt; +@@ -176,6 +178,9 @@ void __mntput(struct vfsmount *mnt) + { + struct super_block *sb = mnt->mnt_sb; + dput(mnt->mnt_root); ++ spin_lock(&dcache_lock); ++ list_del(&mnt->mnt_lustre_list); ++ spin_unlock(&dcache_lock); + free_vfsmnt(mnt); + deactivate_super(sb); + } +@@ -402,6 +407,8 @@ static int do_umount(struct vfsmount *mn + */ + + lock_kernel(); ++ if (sb->s_op->umount_lustre) ++ sb->s_op->umount_lustre(sb); + if( (flags&MNT_FORCE) && sb->s_op->umount_begin) + sb->s_op->umount_begin(sb); + unlock_kernel(); +@@ -627,6 +634,7 @@ static int 
do_loopback(struct nameidata + return err; + if (!old_name || !*old_name) + return -EINVAL; ++ intent_init(&old_nd.intent, IT_LOOKUP); + err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); + if (err) + return err; +@@ -701,6 +709,7 @@ static int do_move_mount(struct nameidat + return -EPERM; + if (!old_name || !*old_name) + return -EINVAL; ++ intent_init(&old_nd.intent, IT_LOOKUP); + err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); + if (err) + return err; +@@ -1012,6 +1021,7 @@ long do_mount(char * dev_name, char * di + int retval = 0; + int mnt_flags = 0; + ++ intent_init(&nd.intent, IT_LOOKUP); + /* Discard magic */ + if ((flags & MS_MGC_MSK) == MS_MGC_VAL) + flags &= ~MS_MGC_MSK; +Index: linux-2.6.10/fs/open.c +=================================================================== +--- linux-2.6.10.orig/fs/open.c ++++ linux-2.6.10/fs/open.c +@@ -216,12 +216,12 @@ static inline long do_sys_truncate(const + struct nameidata nd; + struct inode * inode; + int error; +- ++ intent_init(&nd.intent, IT_GETATTR); + error = -EINVAL; + if (length < 0) /* sorry, but loff_t says... */ + goto out; + +- error = user_path_walk(path, &nd); ++ error = user_path_walk_it(path, &nd); + if (error) + goto out; + inode = nd.dentry->d_inode; +@@ -475,6 +475,7 @@ asmlinkage long sys_access(const char __ + int old_fsuid, old_fsgid; + kernel_cap_t old_cap; + int res; ++ intent_init(&nd.intent, IT_GETATTR); + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? 
*/ + return -EINVAL; +@@ -499,13 +500,14 @@ asmlinkage long sys_access(const char __ + else + current->cap_effective = current->cap_permitted; + +- res = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); ++ res = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); + if (!res) { + res = permission(nd.dentry->d_inode, mode, &nd); + /* SuS v2 requires we report a read only fs too */ + if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) + && !special_file(nd.dentry->d_inode->i_mode)) + res = -EROFS; ++ + path_release(&nd); + } + +@@ -520,8 +522,9 @@ asmlinkage long sys_chdir(const char __u + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent, IT_GETATTR); + +- error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); ++ error = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); + if (error) + goto out; + +@@ -573,8 +576,9 @@ asmlinkage long sys_chroot(const char __ + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent, IT_GETATTR); + +- error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); ++ error = __user_walk_it(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); + if (error) + goto out; + +@@ -758,8 +762,10 @@ asmlinkage long sys_fchown(unsigned int + struct file *filp_open(const char * filename, int flags, int mode) + { + int namei_flags, error; ++ struct file * temp_filp; + struct nameidata nd; + ++ intent_init(&nd.intent, IT_OPEN); + namei_flags = flags; + if ((namei_flags+1) & O_ACCMODE) + namei_flags++; +@@ -767,15 +773,26 @@ struct file *filp_open(const char * file + namei_flags |= 2; + + error = open_namei(filename, namei_flags, mode, &nd); +- if (!error) +- return dentry_open(nd.dentry, nd.mnt, flags); +- ++ if (!error) { ++ temp_filp = dentry_open_it(nd.dentry, nd.mnt, flags, &nd.intent); ++ return temp_filp; ++ } + return ERR_PTR(error); + } + +-EXPORT_SYMBOL(filp_open); + + struct file *dentry_open(struct dentry *dentry, struct vfsmount 
*mnt, int flags) ++ { ++ ++ struct lookup_intent it; ++ intent_init(&it, IT_LOOKUP); ++ ++ return dentry_open_it(dentry, mnt, flags, &it); ++} ++ ++EXPORT_SYMBOL(dentry_open); ++ ++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, int flags,struct lookup_intent *it) + { + struct file * f; + struct inode *inode; +@@ -787,6 +805,7 @@ struct file *dentry_open(struct dentry * + goto cleanup_dentry; + f->f_flags = flags; + f->f_mode = ((flags+1) & O_ACCMODE) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; ++ f->f_it = it; + inode = dentry->d_inode; + if (f->f_mode & FMODE_WRITE) { + error = get_write_access(inode); +@@ -805,6 +824,7 @@ struct file *dentry_open(struct dentry * + error = f->f_op->open(inode,f); + if (error) + goto cleanup_all; ++ intent_release(it); + } + f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + +@@ -830,13 +850,12 @@ cleanup_all: + cleanup_file: + put_filp(f); + cleanup_dentry: ++ intent_release(it); + dput(dentry); + mntput(mnt); + return ERR_PTR(error); + } + +-EXPORT_SYMBOL(dentry_open); +- + /* + * Find an empty file descriptor entry, and mark it busy. 
+ */ +Index: linux-2.6.10/fs/stat.c +=================================================================== +--- linux-2.6.10.orig/fs/stat.c ++++ linux-2.6.10/fs/stat.c +@@ -38,7 +38,7 @@ void generic_fillattr(struct inode *inod + + EXPORT_SYMBOL(generic_fillattr); + +-int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) ++int vfs_getattr_it(struct vfsmount *mnt, struct dentry *dentry, struct lookup_intent *it, struct kstat *stat) + { + struct inode *inode = dentry->d_inode; + int retval; +@@ -47,6 +47,8 @@ int vfs_getattr(struct vfsmount *mnt, st + if (retval) + return retval; + ++ if (inode->i_op->getattr_it) ++ return inode->i_op->getattr_it(mnt, dentry, it, stat); + if (inode->i_op->getattr) + return inode->i_op->getattr(mnt, dentry, stat); + +@@ -63,14 +65,20 @@ int vfs_getattr(struct vfsmount *mnt, st + + EXPORT_SYMBOL(vfs_getattr); + ++int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) ++{ ++ return vfs_getattr_it(mnt, dentry, NULL, stat); ++} ++ + int vfs_stat(char __user *name, struct kstat *stat) + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent, IT_GETATTR); + +- error = user_path_walk(name, &nd); ++ error = user_path_walk_it(name, &nd); + if (!error) { +- error = vfs_getattr(nd.mnt, nd.dentry, stat); ++ error = vfs_getattr_it(nd.mnt, nd.dentry, &nd.intent, stat); + path_release(&nd); + } + return error; +@@ -82,10 +90,11 @@ int vfs_lstat(char __user *name, struct + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent, IT_GETATTR); + +- error = user_path_walk_link(name, &nd); ++ error = user_path_walk_link_it(name, &nd); + if (!error) { +- error = vfs_getattr(nd.mnt, nd.dentry, stat); ++ error = vfs_getattr_it(nd.mnt, nd.dentry, &nd.intent, stat); + path_release(&nd); + } + return error; +@@ -97,9 +106,12 @@ int vfs_fstat(unsigned int fd, struct ks + { + struct file *f = fget(fd); + int error = -EBADF; ++ struct nameidata nd; ++ intent_init(&nd.intent, IT_GETATTR); + + if 
(f) { +- error = vfs_getattr(f->f_vfsmnt, f->f_dentry, stat); ++ error = vfs_getattr_it(f->f_vfsmnt, f->f_dentry, &nd.intent, stat); ++ intent_release(&nd.intent); + fput(f); + } + return error; +Index: linux-2.6.10/include/linux/dcache.h +=================================================================== +--- linux-2.6.10.orig/include/linux/dcache.h ++++ linux-2.6.10/include/linux/dcache.h +@@ -4,6 +4,7 @@ + #ifdef __KERNEL__ + + #include ++#include + #include + #include + #include +@@ -37,6 +38,8 @@ struct qstr { + const unsigned char *name; + }; + ++#include ++ + struct dentry_stat_t { + int nr_dentry; + int nr_unused; +Index: linux-2.6.10/include/linux/fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/fs.h ++++ linux-2.6.10/include/linux/fs.h +@@ -78,6 +78,7 @@ extern int dir_notify_enable; + + #define FMODE_READ 1 + #define FMODE_WRITE 2 ++#define FMODE_EXEC 4 + + /* Internal kernel extensions */ + #define FMODE_LSEEK 4 +@@ -262,6 +263,8 @@ typedef void (dio_iodone_t)(struct inode + #define ATTR_ATTR_FLAG 1024 + #define ATTR_KILL_SUID 2048 + #define ATTR_KILL_SGID 4096 ++#define ATTR_RAW 8192 /* file system, not vfs will massage attrs */ ++#define ATTR_FROM_OPEN 16384 /* called from open path, ie O_TRUNC */ + + /* + * This is the Inode Attributes structure, used for notify_change(). 
It +@@ -465,6 +468,7 @@ struct inode { + struct block_device *i_bdev; + struct cdev *i_cdev; + int i_cindex; ++ void *i_filterdata; + + __u32 i_generation; + +@@ -600,6 +604,7 @@ struct file { + spinlock_t f_ep_lock; + #endif /* #ifdef CONFIG_EPOLL */ + struct address_space *f_mapping; ++ struct lookup_intent *f_it; + }; + extern spinlock_t files_lock; + #define file_list_lock() spin_lock(&files_lock); +@@ -950,7 +955,9 @@ struct inode_operations { + void (*truncate) (struct inode *); + int (*permission) (struct inode *, int, struct nameidata *); + int (*setattr) (struct dentry *, struct iattr *); ++ int (*setattr_raw) (struct inode *, struct iattr *); + int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); ++ int (*getattr_it) (struct vfsmount *, struct dentry *, struct lookup_intent *, struct kstat *); + int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); + ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); + ssize_t (*listxattr) (struct dentry *, char *, size_t); +@@ -990,6 +997,7 @@ struct super_operations { + int (*remount_fs) (struct super_block *, int *, char *); + void (*clear_inode) (struct inode *); + void (*umount_begin) (struct super_block *); ++ void (*umount_lustre) (struct super_block *); + + int (*show_options)(struct seq_file *, struct vfsmount *); + }; +@@ -1181,6 +1189,7 @@ extern int unregister_filesystem(struct + extern struct vfsmount *kern_mount(struct file_system_type *); + extern int may_umount_tree(struct vfsmount *); + extern int may_umount(struct vfsmount *); ++struct vfsmount *do_kern_mount(const char *type, int flags, const char *name, void *data); + extern long do_mount(char *, char *, char *, unsigned long, void *); + + extern int vfs_statfs(struct super_block *, struct kstatfs *); +@@ -1245,6 +1254,7 @@ static inline int break_lease(struct ino + extern int do_truncate(struct dentry *, loff_t start); + extern struct file *filp_open(const char *, int, int); + extern struct file 
* dentry_open(struct dentry *, struct vfsmount *, int); ++extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct lookup_intent *); + extern int filp_close(struct file *, fl_owner_t id); + extern char * getname(const char __user *); + +Index: linux-2.6.10/include/linux/mount.h +=================================================================== +--- linux-2.6.10.orig/include/linux/mount.h ++++ linux-2.6.10/include/linux/mount.h +@@ -36,6 +36,8 @@ struct vfsmount + struct list_head mnt_list; + struct list_head mnt_fslink; /* link in fs-specific expiry list */ + struct namespace *mnt_namespace; /* containing namespace */ ++ struct list_head mnt_lustre_list; /* GNS mount list */ ++ unsigned long mnt_last_used; /* for GNS auto-umount (jiffies) */ + }; + + static inline struct vfsmount *mntget(struct vfsmount *mnt) +Index: linux-2.6.10/include/linux/namei.h +=================================================================== +--- linux-2.6.10.orig/include/linux/namei.h ++++ linux-2.6.10/include/linux/namei.h +@@ -2,14 +2,48 @@ + #define _LINUX_NAMEI_H + + #include ++#include + + struct vfsmount; ++struct nameidata; + +-struct open_intent { +- int flags; +- int create_mode; ++/* intent opcodes */ ++#define IT_OPEN (1) ++#define IT_CREAT (1<<1) ++#define IT_READDIR (1<<2) ++#define IT_GETATTR (1<<3) ++#define IT_LOOKUP (1<<4) ++#define IT_UNLINK (1<<5) ++#define IT_TRUNC (1<<6) ++#define IT_GETXATTR (1<<7) ++ ++struct lustre_intent_data { ++ int it_disposition; ++ int it_status; ++ __u64 it_lock_handle; ++ void *it_data; ++ int it_lock_mode; + }; + ++#define INTENT_MAGIC 0x19620323 ++struct lookup_intent { ++ int it_magic; ++ void (*it_op_release)(struct lookup_intent *); ++ int it_op; ++ int it_flags; ++ int it_create_mode; ++ union { ++ struct lustre_intent_data lustre; ++ } d; ++}; ++ ++static inline void intent_init(struct lookup_intent *it, int op) ++{ ++ memset(it, 0, sizeof(*it)); ++ it->it_magic = INTENT_MAGIC; ++ it->it_op = op; ++} ++ 
+ enum { MAX_NESTED_LINKS = 8 }; + + struct nameidata { +@@ -21,10 +55,7 @@ struct nameidata { + unsigned depth; + char *saved_names[MAX_NESTED_LINKS + 1]; + +- /* Intent data */ +- union { +- struct open_intent open; +- } intent; ++ struct lookup_intent intent; + }; + + /* +@@ -47,6 +78,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LA + #define LOOKUP_NOALT 32 + #define LOOKUP_ATOMIC 64 + #define LOOKUP_REVAL 128 ++#define LOOKUP_LAST (0x1000) ++#define LOOKUP_LINK_NOTLAST (0x2000) + + /* + * Intent data +@@ -56,6 +89,12 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LA + #define LOOKUP_ACCESS (0x0400) + + extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *)); ++extern int FASTCALL(__user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd)); ++#define user_path_walk_it(name,nd) \ ++ __user_walk_it(name, LOOKUP_FOLLOW, nd) ++#define user_path_walk_link_it(name,nd) \ ++ __user_walk_it(name, 0, nd) ++extern void intent_release(struct lookup_intent *); + #define user_path_walk(name,nd) \ + __user_walk(name, LOOKUP_FOLLOW, nd) + #define user_path_walk_link(name,nd) \ +@@ -68,7 +107,6 @@ extern void path_release_on_umount(struc + + extern struct dentry * lookup_one_len(const char *, struct dentry *, int); + extern struct dentry * lookup_hash(struct qstr *, struct dentry *); +- + extern int follow_down(struct vfsmount **, struct dentry **); + extern int follow_up(struct vfsmount **, struct dentry **); + diff --git a/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch b/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch index 695423b..1d87227 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch @@ -569,28 +569,6 @@ Index: linux-2.6.5-12.1/fs/stat.c fput(f); } -Index: linux-2.6.5-12.1/fs/nfs/dir.c -=================================================================== ---- linux-2.6.5-12.1.orig/fs/nfs/dir.c 2004-05-10 12:21:53.000000000 -0400 -+++ 
linux-2.6.5-12.1/fs/nfs/dir.c 2004-06-03 18:31:28.000000000 -0400 -@@ -709,7 +709,7 @@ - return 0; - if (!nd || (nd->flags & LOOKUP_CONTINUE) || !(nd->flags & LOOKUP_CREATE)) - return 0; -- return (nd->intent.open.flags & O_EXCL) != 0; -+ return (nd->intent.it_flags & O_EXCL) != 0; - } - - static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) -@@ -1026,7 +1026,7 @@ - attr.ia_valid = ATTR_MODE; - - if (nd && (nd->flags & LOOKUP_CREATE)) -- open_flags = nd->intent.open.flags; -+ open_flags = nd->intent.it_flags; - - /* - * The 0 argument passed into the create function should one day Index: linux-2.6.5-12.1/fs/inode.c =================================================================== --- linux-2.6.5-12.1.orig/fs/inode.c 2004-05-10 12:21:56.000000000 -0400 diff --git a/lustre/kernel_patches/patches/vfs_intent-2.6.12.patch b/lustre/kernel_patches/patches/vfs_intent-2.6.12.patch index 80db906..6edb8bd 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.6.12.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.6.12.patch @@ -580,28 +580,6 @@ Index: linux-2.6.12.5/fs/stat.c fput(f); } return error; -Index: linux-2.6.12.5/fs/nfs/dir.c -=================================================================== ---- linux-2.6.12.5.orig/fs/nfs/dir.c 2005-08-17 17:51:28.000000000 +0200 -+++ linux-2.6.12.5/fs/nfs/dir.c 2005-08-17 17:51:44.000000000 +0200 -@@ -727,7 +727,7 @@ - return 0; - if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0) - return 0; -- return (nd->intent.open.flags & O_EXCL) != 0; -+ return (nd->intent.it_flags & O_EXCL) != 0; - } - - static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) -@@ -1028,7 +1028,7 @@ - attr.ia_valid = ATTR_MODE; - - if (nd && (nd->flags & LOOKUP_CREATE)) -- open_flags = nd->intent.open.flags; -+ open_flags = nd->intent.it_flags; - - lock_kernel(); - nfs_begin_data_update(dir); Index: linux-2.6.12.5/fs/inode.c 
=================================================================== --- linux-2.6.12.5.orig/fs/inode.c 2005-08-17 17:51:28.000000000 +0200 diff --git a/lustre/kernel_patches/series/2.6-fc3.series b/lustre/kernel_patches/series/2.6-fc3.series index 361da69..90ada9a 100644 --- a/lustre/kernel_patches/series/2.6-fc3.series +++ b/lustre/kernel_patches/series/2.6-fc3.series @@ -1,7 +1,7 @@ uml-2.6.10-fc3.patch lustre_version.patch fc3_to_rhel4_updates.patch -vfs_intent-2.6-rhel4.patch +vfs_intent-2.6-fc3.patch vfs_nointent-2.6-rhel4.patch vfs_races-2.6-fc3.patch ext3-wantedi-misc-2.6-suse.patch diff --git a/lustre/kernel_patches/series/2.6-rhel4.series b/lustre/kernel_patches/series/2.6-rhel4.series index 6053eb7..5c48af2 100644 --- a/lustre/kernel_patches/series/2.6-rhel4.series +++ b/lustre/kernel_patches/series/2.6-rhel4.series @@ -20,3 +20,4 @@ linux-2.6-binutils-2.16.patch compile-fixes-2.6.9-rhel4-22.patch vm-tunables-rhel4.patch tcp-zero-copy-2.6.9-rhel4.patch +iallocsem_consistency.patch diff --git a/lustre/kernel_patches/series/2.6-suse-newer.series b/lustre/kernel_patches/series/2.6-suse-newer.series index 1c5d31f..4068bed 100644 --- a/lustre/kernel_patches/series/2.6-suse-newer.series +++ b/lustre/kernel_patches/series/2.6-suse-newer.series @@ -7,3 +7,5 @@ uml-exprt-clearuser.patch qsnet-suse-2.6.patch fsprivate-2.6.patch dcache-qstr-api-fix-2.6-suse.patch +iallocsem_consistency.patch +tcp-zero-copy-2.6.5-7.244.patch diff --git a/lustre/kernel_patches/series/2.6-suse.series b/lustre/kernel_patches/series/2.6-suse.series index 790361c..7a39b32 100644 --- a/lustre/kernel_patches/series/2.6-suse.series +++ b/lustre/kernel_patches/series/2.6-suse.series @@ -13,4 +13,3 @@ header-guards-2.6-suse.patch md_path_lookup-2.6-suse.patch ext3-super-ntohl.patch export-show_task-2.6-vanilla.patch -export-filemap_populate.patch diff --git a/lustre/kernel_patches/series/2.6.12-vanilla.series b/lustre/kernel_patches/series/2.6.12-vanilla.series index 9ecb127..cb41054 100644 
--- a/lustre/kernel_patches/series/2.6.12-vanilla.series +++ b/lustre/kernel_patches/series/2.6.12-vanilla.series @@ -17,3 +17,4 @@ export-show_task-2.6-vanilla.patch sd_iostats-2.6-rhel4.patch fsprivate-2.6.patch export_symbol_numa.patch +tcp-zero-copy-2.6.12.6.patch diff --git a/lustre/ldiskfs/quotafmt_test.c b/lustre/ldiskfs/quotafmt_test.c index 5f6bc7c..0eb0647 100644 --- a/lustre/ldiskfs/quotafmt_test.c +++ b/lustre/ldiskfs/quotafmt_test.c @@ -45,13 +45,13 @@ static int quotfmt_initialize(struct lustre_quota_info *lqi, int namelen = strlen(name); /* remove the stale test quotafile */ - down(&parent_inode->i_sem); + LOCK_INODE_MUTEX(parent_inode); de = lookup_one_len(name, tgt->obd_lvfs_ctxt.pwd, namelen); if (!IS_ERR(de) && de->d_inode) vfs_unlink(parent_inode, de); if (!IS_ERR(de)) dput(de); - up(&parent_inode->i_sem); + UNLOCK_INODE_MUTEX(parent_inode); /* create quota file */ fp = filp_open(name, O_CREAT | O_EXCL, 0644); @@ -99,7 +99,7 @@ static int quotfmt_finalize(struct lustre_quota_info *lqi, filp_close(lqi->qi_files[i], 0); /* unlink quota file */ - down(&parent_inode->i_sem); + LOCK_INODE_MUTEX(parent_inode); de = lookup_one_len(name, tgt->obd_lvfs_ctxt.pwd, namelen); if (IS_ERR(de) || de->d_inode == NULL) { @@ -116,7 +116,7 @@ static int quotfmt_finalize(struct lustre_quota_info *lqi, dput: if (!IS_ERR(de)) dput(de); - up(&parent_inode->i_sem); + UNLOCK_INODE_MUTEX(parent_inode); } pop_ctxt(saved, &tgt->obd_lvfs_ctxt, NULL); diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 74fd6c5..6ee4f03 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -278,16 +278,14 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) ptlrpc_init_client(rq_portal, rp_portal, name, &obddev->obd_ldlm_client); - imp = class_new_import(); + imp = class_new_import(obddev); if (imp == NULL) GOTO(err_ldlm, rc = -ENOENT); imp->imp_client = &obddev->obd_ldlm_client; - imp->imp_obd = obddev; imp->imp_connect_op = connect_op; - 
imp->imp_generation = 0; imp->imp_initial_recov = 1; CFS_INIT_LIST_HEAD(&imp->imp_pinger_chain); - memcpy(imp->imp_target_uuid.uuid, lustre_cfg_buf(lcfg, 1), + memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1), LUSTRE_CFG_BUFLEN(lcfg, 1)); class_import_put(imp); @@ -307,7 +305,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) { CDEBUG(D_HA, "marking %s %s->%s as inactive\n", name, obddev->obd_name, - imp->imp_target_uuid.uuid); + cli->cl_target_uuid.uuid); imp->imp_invalid = 1; } } @@ -327,15 +325,7 @@ err: int client_obd_cleanup(struct obd_device *obddev) { - struct client_obd *cli = &obddev->u.cli; - ENTRY; - if (!cli->cl_import) - RETURN(-EINVAL); - class_destroy_import(cli->cl_import); - cli->cl_import = NULL; - client_obd_list_lock_done(&cli->cl_loi_list_lock); - ldlm_put_ref(obddev->obd_force); RETURN(0); @@ -454,11 +444,15 @@ int client_disconnect_export(struct obd_export *exp) } /* Yeah, obd_no_recov also (mainly) means "forced shutdown". 
*/ - if (obd->obd_no_recov) - ptlrpc_invalidate_import(imp); - else + if (!obd->obd_no_recov) rc = ptlrpc_disconnect_import(imp); + ptlrpc_invalidate_import(imp); + imp->imp_deactive = 1; + ptlrpc_free_rq_pool(imp->imp_rq_pool); + class_destroy_import(imp); + cli->cl_import = NULL; + EXIT; out_no_disconnect: err = class_disconnect(exp); @@ -757,11 +751,10 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) if (export->exp_imp_reverse != NULL) class_destroy_import(export->exp_imp_reverse); - revimp = export->exp_imp_reverse = class_new_import(); + revimp = export->exp_imp_reverse = class_new_import(target); revimp->imp_connection = ptlrpc_connection_addref(export->exp_connection); revimp->imp_client = &export->exp_obd->obd_ldlm_client; revimp->imp_remote_handle = conn; - revimp->imp_obd = target; revimp->imp_dlm_fake = 1; revimp->imp_state = LUSTRE_IMP_FULL; class_import_put(revimp); diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 42ccef5..293733e 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -111,11 +111,7 @@ static int expired_lock_main(void *arg) struct l_wait_info lwi = { 0 }; ENTRY; - lock_kernel(); cfs_daemonize("ldlm_elt"); - cfs_block_allsigs(); - - unlock_kernel(); expired_lock_thread.elt_state = ELT_READY; cfs_waitq_signal(&expired_lock_thread.elt_waitq); @@ -184,9 +180,6 @@ static void waiting_locks_callback(unsigned long unused) { struct ldlm_lock *lock, *last = NULL; - if (obd_dump_on_timeout) - libcfs_debug_dumplog(); - spin_lock_bh(&waiting_locks_spinlock); while (!list_empty(&waiting_locks_list)) { lock = list_entry(waiting_locks_list.next, struct ldlm_lock, @@ -212,7 +205,6 @@ static void waiting_locks_callback(unsigned long unused) CFS_INIT_LIST_HEAD(&waiting_locks_list); /* HACK */ expired_lock_thread.elt_dump = __LINE__; - spin_unlock_bh(&waiting_locks_spinlock); /* LBUG(); */ CEMERG("would be an LBUG, but isn't (bug 5653)\n"); @@ -226,6 +218,11 @@ static void 
waiting_locks_callback(unsigned long unused) list_del(&lock->l_pending_chain); list_add(&lock->l_pending_chain, &expired_lock_thread.elt_expired_locks); + } + + if (!list_empty(&expired_lock_thread.elt_expired_locks)) { + if (obd_dump_on_timeout) + expired_lock_thread.elt_dump = __LINE__; cfs_waitq_signal(&expired_lock_thread.elt_waitq); } @@ -518,7 +515,8 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) LASSERT(lock != NULL); do_gettimeofday(&granted_time); - total_enqueue_wait = cfs_timeval_sub(&granted_time,&lock->l_enqueued_time, NULL); + total_enqueue_wait = cfs_timeval_sub(&granted_time, + &lock->l_enqueued_time, NULL); if (total_enqueue_wait / 1000000 > obd_timeout) LDLM_ERROR(lock, "enqueue wait took %luus from %lu", @@ -1422,14 +1420,12 @@ static int ldlm_bl_thread_main(void *arg) struct ldlm_bl_pool *blp = bltd->bltd_blp; ENTRY; - /* XXX boiler-plate */ { char name[CFS_CURPROC_COMM_MAX]; snprintf(name, sizeof(name) - 1, "ldlm_bl_%02d", bltd->bltd_num); cfs_daemonize(name); } - cfs_block_allsigs(); atomic_inc(&blp->blp_num_threads); complete(&blp->blp_comp); diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index 6da730d..bf2f655 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -40,7 +40,7 @@ static void interrupted_completion_wait(void *data) struct lock_wait_data { struct ldlm_lock *lwd_lock; - int lwd_generation; + __u32 lwd_conn_cnt; }; int ldlm_expired_completion_wait(void *data) @@ -70,11 +70,10 @@ int ldlm_expired_completion_wait(void *data) obd = lock->l_conn_export->exp_obd; imp = obd->u.cli.cl_import; - ptlrpc_fail_import(imp, lwd->lwd_generation); + ptlrpc_fail_import(imp, lwd->lwd_conn_cnt); LDLM_ERROR(lock, "lock timed out (enqueued %lus ago), entering " "recovery for %s@%s", lock->l_enqueued_time.tv_sec, - imp->imp_target_uuid.uuid, - imp->imp_connection->c_remote_uuid.uuid); + obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid); RETURN(0); } @@ -117,8 +116,7 
@@ noreproc: lwd.lwd_lock = lock; if (unlikely(flags & LDLM_FL_NO_TIMEOUT)) { - LDLM_DEBUG(lock, "waiting indefinitely because CW lock was" - " met\n"); + LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT"); lwi = LWI_INTR(interrupted_completion_wait, &lwd); } else { lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout), @@ -128,7 +126,7 @@ noreproc: if (imp != NULL) { spin_lock_irqsave(&imp->imp_lock, irqflags); - lwd.lwd_generation = imp->imp_generation; + lwd.lwd_conn_cnt = imp->imp_conn_cnt; spin_unlock_irqrestore(&imp->imp_lock, irqflags); } @@ -452,9 +450,11 @@ int ldlm_cli_enqueue(struct obd_export *exp, /* lock enqueued on the server */ cleanup_phase = 1; + l_lock(&ns->ns_lock); lock->l_remote_handle = reply->lock_handle; *flags = reply->lock_flags; lock->l_flags |= reply->lock_flags & LDLM_INHERIT_FLAGS; + l_unlock(&ns->ns_lock); CDEBUG(D_INFO, "local: %p, remote cookie: "LPX64", flags: 0x%x\n", lock, reply->lock_handle.cookie, *flags); @@ -1101,7 +1101,9 @@ static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure) /* we use l_pending_chain here, because it's unused on clients. 
*/ LASSERTF(list_empty(&lock->l_pending_chain),"lock %p next %p prev %p\n", lock, &lock->l_pending_chain.next,&lock->l_pending_chain.prev); - list_add(&lock->l_pending_chain, list); + /* bug 9573: don't replay locks left after eviction */ + if (!(lock->l_flags & LDLM_FL_FAILED)) + list_add(&lock->l_pending_chain, list); return LDLM_ITER_CONTINUE; } diff --git a/lustre/liblustre/llite_lib.c b/lustre/liblustre/llite_lib.c index 72b1c44..054fa0d 100644 --- a/lustre/liblustre/llite_lib.c +++ b/lustre/liblustre/llite_lib.c @@ -130,9 +130,9 @@ int liblustre_process_log(struct config_llog_instance *cfg, ocd->ocd_version = LUSTRE_VERSION_CODE; /* Disable initial recovery on this import */ - rc = obd_set_info(obd->obd_self_export, - strlen("initial_recov"), "initial_recov", - sizeof(allow_recov), &allow_recov); + rc = obd_set_info_async(obd->obd_self_export, + strlen("initial_recov"), "initial_recov", + sizeof(allow_recov), &allow_recov, NULL); rc = obd_connect(&mdc_conn, obd, &mdc_uuid, ocd); if (rc) { @@ -242,17 +242,10 @@ int _sysio_lustre_init(void) { int err; char *timeout = NULL; - char *debug_mask = NULL; - char *debug_subsys = NULL; #ifndef INIT_SYSIO extern void __liblustre_cleanup_(void); #endif -#if 0 - libcfs_debug = -1; - libcfs_subsystem_debug = -1; -#endif - liblustre_init_random(); err = lllib_init(); @@ -267,16 +260,6 @@ int _sysio_lustre_init(void) obd_timeout); } - /* debug masks */ - debug_mask = getenv("LIBLUSTRE_DEBUG_MASK"); - if (debug_mask) - libcfs_debug = (unsigned int) strtol(debug_mask, NULL, 0); - - debug_subsys = getenv("LIBLUSTRE_DEBUG_SUBSYS"); - if (debug_subsys) - libcfs_subsystem_debug = - (unsigned int) strtol(debug_subsys, NULL, 0); - #ifndef INIT_SYSIO (void)atexit(__liblustre_cleanup_); #endif diff --git a/lustre/liblustre/namei.c b/lustre/liblustre/namei.c index 07f3934..8f47209 100644 --- a/lustre/liblustre/namei.c +++ b/lustre/liblustre/namei.c @@ -347,6 +347,9 @@ static int lookup_it_finish(struct ptlrpc_request *request, int 
offset, struct intnl_stat *st; ENTRY; + if (it_disposition(it, DISP_OPEN_CREATE)) + ptlrpc_req_finished(request); + rc = mdc_req2lustre_md(request, offset, sbi->ll_osc_exp, &md); if (rc) RETURN(rc); diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c index e01e2f2..d262d5c 100644 --- a/lustre/liblustre/super.c +++ b/lustre/liblustre/super.c @@ -820,11 +820,11 @@ static int llu_iop_setattr(struct pnode *pno, } if (mask & SETATTR_MTIME) { iattr.ia_mtime = stbuf->st_mtime; - iattr.ia_valid |= ATTR_MTIME; + iattr.ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; } if (mask & SETATTR_ATIME) { iattr.ia_atime = stbuf->st_atime; - iattr.ia_valid |= ATTR_ATIME; + iattr.ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; } if (mask & SETATTR_UID) { iattr.ia_uid = stbuf->st_uid; @@ -1764,8 +1764,8 @@ llu_fsswop_mount(const char *source, CERROR("MDC %s: not setup or attached\n", mdc); GOTO(out_free, err = -EINVAL); } - obd_set_info(obd->obd_self_export, strlen("async"), "async", - sizeof(async), &async); + obd_set_info_async(obd->obd_self_export, strlen("async"), "async", + sizeof(async), &async, NULL); ocd.ocd_connect_flags = OBD_CONNECT_IBITS|OBD_CONNECT_VERSION; ocd.ocd_ibits_known = MDS_INODELOCK_FULL; @@ -1793,8 +1793,8 @@ llu_fsswop_mount(const char *source, CERROR("OSC %s: not setup or attached\n", osc); GOTO(out_mdc, err = -EINVAL); } - obd_set_info(obd->obd_self_export, strlen("async"), "async", - sizeof(async), &async); + obd_set_info_async(obd->obd_self_export, strlen("async"), "async", + sizeof(async), &async, NULL); obd->obd_upcall.onu_owner = &sbi->ll_lco; obd->obd_upcall.onu_upcall = ll_ocd_update; diff --git a/lustre/liblustre/tests/sanity.c b/lustre/liblustre/tests/sanity.c index bd3c04f..ccab0c3 100644 --- a/lustre/liblustre/tests/sanity.c +++ b/lustre/liblustre/tests/sanity.c @@ -49,6 +49,7 @@ void *buf_alloc; int buf_size; int opt_verbose; +struct timeval start; extern char *lustre_path; @@ -64,17 +65,23 @@ extern char *lustre_path; buf[80] = 0; \ } \ printf("%s", 
buf); \ + gettimeofday(&start, NULL); \ } while (0) #define LEAVE() \ do { \ - char buf[100]; \ - int len; \ - sprintf(buf, "===== END TEST %s: successfully ", \ - __FUNCTION__); \ - len = strlen(buf); \ + struct timeval stop; \ + char buf[100] = { '\0' }; \ + int len = sizeof(buf) - 1; \ + long usec; \ + gettimeofday(&stop, NULL); \ + usec = (stop.tv_sec - start.tv_sec) * 1000000 + \ + (stop.tv_usec - start.tv_usec); \ + len = snprintf(buf, len, \ + "===== END TEST %s: successfully (%gs)", \ + __FUNCTION__, (double)usec / 1000000); \ if (len < 79) { \ - memset(buf+len, '=', 100-len); \ + memset(buf+len, '=', sizeof(buf) - len); \ buf[79] = '\n'; \ buf[80] = 0; \ } \ @@ -1078,15 +1085,90 @@ int t52(char *name) LEAVE(); } +#define NEW_TIME 10000 +int t53(char *name) +{ + char file[MAX_PATH_LENGTH] = ""; + struct utimbuf times; /* struct. buffer for utime() */ + struct stat stat_buf; /* struct buffer to hold file info. */ + time_t mtime, atime; + + ENTRY("mtime/atime should be updated by utime() call"); + snprintf(file, MAX_PATH_LENGTH, "%s/test_t53_file", lustre_path); + + t_echo_create(file, "check mtime/atime update by utime() call"); + + /* Initialize the modification and access time in the times arg */ + times.actime = NEW_TIME+10; + times.modtime = NEW_TIME; + + /* file modification/access time */ + utime(file, &times); + + if (stat(file, &stat_buf) < 0) { + printf("stat(2) of %s failed, error:%d %s\n", + file, errno, strerror(errno)); + } + mtime = stat_buf.st_mtime; + atime = stat_buf.st_atime; + + if ((mtime == NEW_TIME) && (atime == NEW_TIME + 10)) { + t_unlink(file); + LEAVE(); + } + + printf("mod time %ld, expected %ld\n", mtime, (long)NEW_TIME); + printf("acc time %ld, expected %ld\n", atime, (long)NEW_TIME + 10); + + t_unlink(file); + return (-1); +} + +int t54(char *name) +{ + char file[MAX_PATH_LENGTH] = ""; + struct flock lock; + int fd, err; + + ENTRY("fcntl should return 0 when succeed in getting flock"); + snprintf(file, MAX_PATH_LENGTH,
"%s/test_t54_file", lustre_path); + + t_echo_create(file, "fcntl should return 0 when succeed"); + + fd = open(file, O_RDWR); + if (fd < 0) { + printf("\nerror open file: %s\n", strerror(errno)); + return(-1); + } + lock.l_type = F_WRLCK; + lock.l_start = 0; + lock.l_whence = 0; + lock.l_len = 1; + if ((err = t_fcntl(fd, F_SETLKW, &lock)) != 0) { + fprintf(stderr, "fcntl returned: %d (%s)\n", + err, strerror(err)); + close(fd); + t_unlink(file); + return (-1); + } + + lock.l_type = F_UNLCK; + t_fcntl(fd, F_SETLKW, &lock); + close(fd); + t_unlink(file); + LEAVE(); +} + extern void __liblustre_setup_(void); extern void __liblustre_cleanup_(void); void usage(char *cmd) { - printf("\n"); - printf("Usage: \t%s --target mdsnid:/mdsname/profile\n", cmd); - printf(" \t%s --dumpfile dumpfile\n", cmd); + printf("\n" + "usage: %s [--only {test}] --target mdsnid:/mdsname/profile\n", + cmd); + printf(" %s --dumpfile dumpfile\n", cmd); exit(-1); } @@ -1121,6 +1203,8 @@ struct testlist { { t50, "50" }, { t50b, "50b" }, { t51, "51" }, + { t53, "53" }, + { t54, "54" }, { NULL, NULL } }; @@ -1189,12 +1273,21 @@ int main(int argc, char * const argv[]) run = 0; len = strlen(test->name); for (i = 0; i < numonly; i++) { - if (len < strlen(only[i])) + int olen = strlen(only[i]); + + if (len < olen) continue; - if (strncmp(only[i], test->name, - strlen(only[i])) == 0) { - run = 1; - break; + + if (strncmp(only[i], test->name, olen) == 0) { + switch(test->name[olen]) { + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + case '8': case '9': + break; + default: + run = 1; + break; + } } } } diff --git a/lustre/llite/Makefile.in b/lustre/llite/Makefile.in index 7a908d7..64bcd19 100644 --- a/lustre/llite/Makefile.in +++ b/lustre/llite/Makefile.in @@ -1,5 +1,5 @@ MODULES := llite -llite-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o rw.o lproc_llite.o namei.o special.o symlink.o llite_mmap.o xattr.o +llite-objs := dcache.o dir.o file.o 
llite_close.o llite_lib.o llite_nfs.o rw.o lproc_llite.o namei.o symlink.o llite_mmap.o xattr.o ifeq ($(PATCHLEVEL),4) llite-objs += rw24.o super.o diff --git a/lustre/llite/dcache.c b/lustre/llite/dcache.c index d35cef3..7b5fa8c 100644 --- a/lustre/llite/dcache.c +++ b/lustre/llite/dcache.c @@ -146,7 +146,6 @@ void ll_intent_release(struct lookup_intent *it) void ll_unhash_aliases(struct inode *inode) { struct list_head *tmp, *head; - struct ll_sb_info *sbi; ENTRY; if (inode == NULL) { @@ -157,7 +156,6 @@ void ll_unhash_aliases(struct inode *inode) CDEBUG(D_INODE, "marking dentries for ino %lu/%u(%p) invalid\n", inode->i_ino, inode->i_generation, inode); - sbi = ll_i2sbi(inode); head = &inode->i_dentry; restart: spin_lock(&dcache_lock); @@ -207,7 +205,7 @@ restart: #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) __d_drop(dentry); hlist_add_head(&dentry->d_hash, - &sbi->ll_orphan_dentry_list); + &ll_i2sbi(inode)->ll_orphan_dentry_list); #endif } unlock_dentry(dentry); @@ -220,7 +218,6 @@ static int revalidate_it_finish(struct ptlrpc_request *request, int offset, struct lookup_intent *it, struct dentry *de) { - struct ll_sb_info *sbi; int rc = 0; ENTRY; @@ -230,8 +227,8 @@ static int revalidate_it_finish(struct ptlrpc_request *request, int offset, if (it_disposition(it, DISP_LOOKUP_NEG)) RETURN(-ENOENT); - sbi = ll_i2sbi(de->d_inode); - rc = ll_prep_inode(sbi->ll_osc_exp, &de->d_inode, request, offset,NULL); + rc = ll_prep_inode(ll_i2sbi(de->d_inode)->ll_osc_exp, &de->d_inode, + request, offset,NULL); RETURN(rc); } @@ -331,6 +328,11 @@ int ll_revalidate_it(struct dentry *de, int lookup_flags, ll_intent_release(it); GOTO(out, rc = 0); } + if ((it->it_op & IT_OPEN) && de->d_inode && + !S_ISREG(de->d_inode->i_mode) && + !S_ISDIR(de->d_inode->i_mode)) { + ll_release_openhandle(de, it); + } rc = 1; /* unfortunately ll_intent_lock may cause a callback and revoke our diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c index fe8be86..9c1588b 100644 --- 
a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -113,9 +113,8 @@ static inline unsigned long dir_pages(struct inode *inode) } -static void ext2_check_page(struct page *page) +static void ext2_check_page(struct inode *dir, struct page *page) { - struct inode *dir = page->mapping->host; unsigned chunk_size = ext2_chunk_size(dir); char *kaddr = page_address(page); // u32 max_inumber = le32_to_cpu(sb->u.ext2_sb.s_es->s_inodes_count); @@ -164,10 +163,9 @@ out: /* Too bad, we had an error */ Ebadsize: - CERROR("ext2_check_page" - "size of directory #%lu is not a multiple of chunk size\n", - dir->i_ino - ); + CERROR("%s: directory %lu/%u size %llu is not a multiple of %u\n", + ll_i2mdcexp(dir)->exp_obd->obd_name, dir->i_ino, + dir->i_generation, dir->i_size, chunk_size); goto fail; Eshort: error = "rec_len is smaller than minimal"; @@ -184,10 +182,11 @@ Espan: //Einumber: // error = "inode out of bounds"; bad_entry: - CERROR("ext2_check_page: bad entry in directory #%lu: %s - " + CERROR("%s: bad entry in directory %lu/%u: %s - " "offset=%lu+%u, inode=%lu, rec_len=%d, name_len=%d", - dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT), offs, - (unsigned long) le32_to_cpu(p->inode), + ll_i2mdcexp(dir)->exp_obd->obd_name, dir->i_ino, + dir->i_generation, error, (page->index<<PAGE_CACHE_SHIFT), offs, + (unsigned long)le32_to_cpu(p->inode), rec_len, p->name_len); goto fail; Eend: @@ -239,16 +238,17 @@ static struct page *ll_get_dir_page(struct inode *dir, unsigned long n) page = read_cache_page(mapping, n, (filler_t*)mapping->a_ops->readpage, NULL); - if (!IS_ERR(page)) { - wait_on_page(page); - (void)kmap(page); - if (!PageUptodate(page)) - goto fail; - if (!PageChecked(page)) - ext2_check_page(page); - if (PageError(page)) - goto fail; - } + if (IS_ERR(page)) + GOTO(out_unlock, page); + + wait_on_page(page); + (void)kmap(page); + if (!PageUptodate(page)) + goto fail; + if (!PageChecked(page)) + ext2_check_page(dir, page); + if (PageError(page)) + goto fail; out_unlock: ldlm_lock_decref(&lockh, LCK_CR); @@ -290,7 +290,7 @@ static unsigned char ext2_filetype_table[EXT2_FT_MAX] = { }; -int
ll_readdir(struct file * filp, void * dirent, filldir_t filldir) +int ll_readdir(struct file *filp, void *dirent, filldir_t filldir) { struct inode *inode = filp->f_dentry->d_inode; loff_t pos = filp->f_pos; @@ -332,6 +332,7 @@ int ll_readdir(struct file * filp, void * dirent, filldir_t filldir) kaddr = page_address(page); if (need_revalidate) { + /* page already checked from ll_get_dir_page() */ offset = ext2_validate_entry(kaddr, offset, chunk_mask); need_revalidate = 0; } @@ -361,7 +362,8 @@ int ll_readdir(struct file * filp, void * dirent, filldir_t filldir) done: filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset; filp->f_version = inode->i_version; - update_atime(inode); + touch_atime(filp->f_vfsmnt, filp->f_dentry); + RETURN(rc); } @@ -823,9 +825,8 @@ out_free_memmd: /* XXX: dqb_valid is borrowed as a flag to mark that * only mds quota is wanted */ if (qctl->qc_dqblk.dqb_valid) - qctl->obd_uuid = - sbi->ll_mdc_exp->exp_obd->u.cli. - cl_import->imp_target_uuid; + qctl->obd_uuid = sbi->ll_mdc_exp->exp_obd-> + u.cli.cl_target_uuid; break; case Q_GETINFO: break; diff --git a/lustre/llite/file.c b/lustre/llite/file.c index cdd43e2..b8e10e8 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -47,35 +47,29 @@ static void ll_file_data_put(struct ll_file_data *fd) OBD_SLAB_FREE(fd, ll_file_data_slab, sizeof *fd); } -int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode, - struct file *file) +static int ll_close_inode_openhandle(struct inode *inode, + struct obd_client_handle *och) { - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); struct ptlrpc_request *req = NULL; - struct obd_client_handle *och = &fd->fd_mds_och; - struct obdo obdo; + struct obdo *oa; int rc; - ENTRY; - /* clear group lock, if present */ - if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) { - struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; - fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK); - rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, - &fd->fd_cwlockh); 
- } - - obdo.o_id = inode->i_ino; - obdo.o_valid = OBD_MD_FLID; - obdo_from_inode(&obdo, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE | - OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | - OBD_MD_FLATIME | OBD_MD_FLMTIME | - OBD_MD_FLCTIME); + oa = obdo_alloc(); + if (!oa) + RETURN(-ENOMEM); // XXX We leak openhandle and request here. + + oa->o_id = inode->i_ino; + oa->o_valid = OBD_MD_FLID; + obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE | + OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | + OBD_MD_FLATIME | OBD_MD_FLMTIME | + OBD_MD_FLCTIME); if (0 /* ll_is_inode_dirty(inode) */) { - obdo.o_flags = MDS_BFLAG_UNCOMMITTED_WRITES; - obdo.o_valid |= OBD_MD_FLFLAGS; + oa->o_flags = MDS_BFLAG_UNCOMMITTED_WRITES; + oa->o_valid |= OBD_MD_FLFLAGS; } - rc = mdc_close(mdc_exp, &obdo, och, &req); + + rc = mdc_close(ll_i2mdcexp(inode), oa, och, &req); if (rc == EAGAIN) { /* We are the last writer, so the MDS has instructed us to get * the file size and any write cookies, then close again. */ @@ -85,15 +79,39 @@ int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode, CERROR("inode %lu mdc close failed: rc = %d\n", inode->i_ino, rc); } + + obdo_free(oa); + if (rc == 0) { - rc = ll_objects_destroy(req, file->f_dentry->d_inode); + rc = ll_objects_destroy(req, inode); if (rc) CERROR("inode %lu ll_objects destroy: rc = %d\n", inode->i_ino, rc); } mdc_clear_open_replay_data(och); - ptlrpc_req_finished(req); + ptlrpc_req_finished(req); /* This is close request */ + + RETURN(rc); +} + +int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode, + struct file *file) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct obd_client_handle *och = &fd->fd_mds_och; + int rc; + ENTRY; + + /* clear group lock, if present */ + if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; + fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK); + rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, + &fd->fd_cwlockh); + } + + rc = 
ll_close_inode_openhandle(inode, och); och->och_fh.cookie = DEAD_HANDLE_MAGIC; LUSTRE_FPRIVATE(file) = NULL; ll_file_data_put(fd); @@ -155,36 +173,50 @@ static int ll_intent_file_open(struct file *file, void *lmm, rc = mdc_enqueue(sbi->ll_mdc_exp, LDLM_IBITS, itp, LCK_PW, &data, &lockh, lmm, lmmsize, ldlm_completion_ast, ll_mdc_blocking_ast, NULL, 0); - if (rc < 0) + if (rc < 0) { CERROR("lock enqueue: err: %d\n", rc); + GOTO(out, rc); + } + + rc = ll_prep_inode(sbi->ll_osc_exp, &file->f_dentry->d_inode, + (struct ptlrpc_request *)itp->d.lustre.it_data, 1, + NULL); +out: RETURN(rc); } -int ll_local_open(struct file *file, struct lookup_intent *it, - struct ll_file_data *fd) +static void ll_och_fill(struct ll_inode_info *lli, struct lookup_intent *it, + struct obd_client_handle *och) { struct ptlrpc_request *req = it->d.lustre.it_data; - struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode); struct mds_body *body; - ENTRY; - body = lustre_msg_buf (req->rq_repmsg, 1, sizeof (*body)); - LASSERT (body != NULL); /* reply already checked out */ - LASSERT_REPSWABBED (req, 1); /* and swabbed down */ + LASSERT(och); + + body = lustre_msg_buf(req->rq_repmsg, 1, sizeof(*body)); + LASSERT(body != NULL); /* reply already checked out */ + LASSERT_REPSWABBED(req, 1); /* and swabbed in mdc_enqueue */ + + memcpy(&och->och_fh, &body->handle, sizeof(body->handle)); + och->och_magic = OBD_CLIENT_HANDLE_MAGIC; + lli->lli_io_epoch = body->io_epoch; + + mdc_set_open_replay_data(och, it->d.lustre.it_data); +} + +int ll_local_open(struct file *file, struct lookup_intent *it, + struct ll_file_data *fd) +{ + ENTRY; LASSERT(!LUSTRE_FPRIVATE(file)); LASSERT(fd != NULL); - memcpy(&fd->fd_mds_och.och_fh, &body->handle, sizeof(body->handle)); - fd->fd_mds_och.och_magic = OBD_CLIENT_HANDLE_MAGIC; + ll_och_fill(ll_i2info(file->f_dentry->d_inode), it, &fd->fd_mds_och); LUSTRE_FPRIVATE(file) = fd; ll_readahead_init(file->f_dentry->d_inode, &fd->fd_ras); - lli->lli_io_epoch = body->io_epoch; 
- - mdc_set_open_replay_data(&fd->fd_mds_och, it->d.lustre.it_data); - RETURN(0); } @@ -228,6 +260,21 @@ int ll_file_open(struct inode *inode, struct file *file) RETURN(-ENOMEM); if (!it || !it->d.lustre.it_disposition) { + /* Convert f_flags into access mode. We cannot use file->f_mode, + * because everything but O_ACCMODE mask was stripped from + * there */ + if ((oit.it_flags + 1) & O_ACCMODE) + oit.it_flags++; + if (oit.it_flags & O_TRUNC) + oit.it_flags |= FMODE_WRITE; + + if (oit.it_flags & O_CREAT) + oit.it_flags |= MDS_OPEN_OWNEROVERRIDE; + + /* We do not want O_EXCL here, presumably we opened the file + * already? XXX - NFS implications? */ + oit.it_flags &= ~O_EXCL; + it = &oit; rc = ll_intent_file_open(file, NULL, 0, it); if (rc) { @@ -596,14 +643,14 @@ int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data) lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size; l_lock(&lock->l_resource->lr_namespace->ns_lock); - down(&inode->i_sem); + LOCK_INODE_MUTEX(inode); kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size); kms = ldlm_extent_shift_kms(NULL, kms); if (lsm->lsm_oinfo[stripe].loi_kms != kms) LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64, lsm->lsm_oinfo[stripe].loi_kms, kms); lsm->lsm_oinfo[stripe].loi_kms = kms; - up(&inode->i_sem); + UNLOCK_INODE_MUTEX(inode); l_unlock(&lock->l_resource->lr_namespace->ns_lock); } @@ -887,7 +934,7 @@ static ssize_t ll_file_read(struct file *file, char *buf, size_t count, /* A glimpse is necessary to determine whether we return a * short read (B) or some zeroes at the end of the buffer (C) */ ll_inode_size_unlock(inode, 1); - retval = ll_glimpse_size(inode, 0); + retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED); if (retval) goto out; } else { @@ -963,7 +1010,7 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count, if (rc != 0) RETURN(rc); - /* this is ok, g_f_w will overwrite this under i_sem if it races + /* this is ok, g_f_w will overwrite this under i_mutex if 
it races * with a local truncate, it just makes our maxbyte checking easier */ if (file->f_flags & O_APPEND) *ppos = inode->i_size; @@ -980,7 +1027,7 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count, CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n", inode->i_ino, count, *ppos); - /* generic_file_write handles O_APPEND after getting i_sem */ + /* generic_file_write handles O_APPEND after getting i_mutex */ retval = generic_file_write(file, buf, count, ppos); out: @@ -990,6 +1037,102 @@ out: RETURN(retval); } +/* + * Send file content (through pagecache) somewhere with helper + */ +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) +static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count, + read_actor_t actor, void *target) +{ + struct inode *inode = in_file->f_dentry->d_inode; + struct ll_inode_info *lli = ll_i2info(inode); + struct lov_stripe_md *lsm = lli->lli_smd; + struct ll_lock_tree tree; + struct ll_lock_tree_node *node; + struct ost_lvb lvb; + struct ll_ra_read bead; + int rc; + ssize_t retval; + __u64 kms; + ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n", + inode->i_ino, inode->i_generation, inode, count, *ppos); + + /* "If nbyte is 0, read() will return 0 and have no other results." 
+ * -- Single Unix Spec */ + if (count == 0) + RETURN(0); + + lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_READ_BYTES, + count); + + /* File with no objects, nothing to lock */ + if (!lsm) + RETURN(generic_file_sendfile(in_file, ppos, count, actor, target)); + + node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR); + tree.lt_fd = LUSTRE_FPRIVATE(in_file); + rc = ll_tree_lock(&tree, node, NULL, count, + in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0); + if (rc != 0) + RETURN(rc); + + ll_inode_size_lock(inode, 1); + /* + * Consistency guarantees: following possibilities exist for the + * relation between region being read and real file size at this + * moment: + * + * (A): the region is completely inside of the file; + * + * (B-x): x bytes of region are inside of the file, the rest is + * outside; + * + * (C): the region is completely outside of the file. + * + * This classification is stable under DLM lock acquired by + * ll_tree_lock() above, because to change class, other client has to + * take DLM lock conflicting with our lock. Also, any updates to + * ->i_size by other threads on this client are serialized by + * ll_inode_size_lock(). This guarantees that short reads are handled + * correctly in the face of concurrent writes and truncates. 
+ */ + inode_init_lvb(inode, &lvb); + obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1); + kms = lvb.lvb_size; + if (*ppos + count - 1 > kms) { + /* A glimpse is necessary to determine whether we return a + * short read (B) or some zeroes at the end of the buffer (C) */ + ll_inode_size_unlock(inode, 1); + retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED); + if (retval) + goto out; + } else { + /* region is within kms and, hence, within real file size (A) */ + inode->i_size = kms; + ll_inode_size_unlock(inode, 1); + } + + CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n", + inode->i_ino, count, *ppos, inode->i_size); + + /* turn off the kernel's read-ahead */ + in_file->f_ra.ra_pages = 0; + + bead.lrr_start = *ppos >> CFS_PAGE_SHIFT; + bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT; + ll_ra_read_in(in_file, &bead); + /* BUG: 5972 */ + file_accessed(in_file); + retval = generic_file_sendfile(in_file, ppos, count, actor, target); + ll_ra_read_ex(in_file, &bead); + + out: + ll_tree_unlock(&tree); + RETURN(retval); +} +#endif + static int ll_lov_recreate_obj(struct inode *inode, struct file *file, unsigned long arg) { @@ -1077,8 +1220,8 @@ static int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file, if (!f) GOTO(out, -ENOMEM); - f->f_dentry = file->f_dentry; - f->f_vfsmnt = file->f_vfsmnt; + f->f_dentry = dget(file->f_dentry); + f->f_vfsmnt = mntget(file->f_vfsmnt); rc = ll_intent_file_open(f, lum, lum_size, &oit); if (rc) @@ -1106,7 +1249,7 @@ static int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file, out: if (f) - put_filp(f); + fput(f); ll_file_data_put(fd); up(&lli->lli_open_sem); if (req != NULL) @@ -1297,8 +1440,8 @@ static int join_file(struct inode *head_inode, struct file *head_filp, if (f == NULL) GOTO(out, rc = -ENOMEM); - f->f_dentry = head_filp->f_dentry; - f->f_vfsmnt = head_filp->f_vfsmnt; + f->f_dentry = dget(head_filp->f_dentry); + f->f_vfsmnt = 
mntget(head_filp->f_vfsmnt); ll_prepare_mdc_op_data(op_data, head_inode, tail_parent, tail_dentry->d_name.name, @@ -1327,7 +1470,7 @@ out: if (op_data) OBD_FREE_PTR(op_data); if (f) - put_filp(f); + fput(f); ll_file_data_put(fd); ptlrpc_req_finished(req); RETURN(rc); @@ -1336,7 +1479,7 @@ out: static int ll_file_join(struct inode *head, struct file *filp, char *filename_tail) { - struct inode *tail = NULL, *first, *second; + struct inode *tail = NULL, *first = NULL, *second = NULL; struct dentry *tail_dentry; struct file *tail_filp, *first_filp, *second_filp; struct ll_lock_tree first_tree, second_tree; @@ -1426,6 +1569,38 @@ cleanup: RETURN(rc); } +int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it) +{ + struct inode *inode = dentry->d_inode; + struct obd_client_handle *och; + int rc; + ENTRY; + + LASSERT(inode); + + /* Root ? Do nothing. */ + if (dentry->d_inode->i_sb->s_root == dentry) + RETURN(0); + + /* No open handle to close? Move away */ + if (!it_disposition(it, DISP_OPEN_OPEN)) + RETURN(0); + + OBD_ALLOC(och, sizeof(*och)); + if (!och) + GOTO(out, rc = -ENOMEM); + + ll_och_fill(ll_i2info(inode), it, och); + + rc = ll_close_inode_openhandle(inode, och); + + OBD_FREE(och, sizeof(*och)); + out: + /* this one is in place of ll_file_open */ + ptlrpc_req_finished(it->d.lustre.it_data); + RETURN(rc); +} + int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { @@ -1454,10 +1629,18 @@ int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd, if (get_user(flags, (int *) arg)) RETURN(-EFAULT); - if (cmd == LL_IOC_SETFLAGS) + if (cmd == LL_IOC_SETFLAGS) { + if ((flags & LL_FILE_IGNORE_LOCK) && + !(file->f_flags & O_DIRECT)) { + CERROR("%s: unable to disable locking on " + "non-O_DIRECT file\n", current->comm); + RETURN(-EINVAL); + } + fd->fd_flags |= flags; - else + } else { fd->fd_flags &= ~flags; + } RETURN(0); case LL_IOC_LOV_SETSTRIPE: RETURN(ll_lov_setstripe(inode, file, arg)); @@ 
-1754,6 +1937,18 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it) } ll_inode2fid(&fid, inode); rc = mdc_getattr(sbi->ll_mdc_exp, &fid, valid, ealen, &req); + if (rc == -ENOENT) { /* Already unlinked. Just update nlink + * and return success */ + inode->i_nlink = 0; + /* This path cannot be hit for regular files unless in + * case of obscure races, so * no need to to validate + * size. */ + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode) && + !S_ISDIR(inode->i_mode)) + RETURN(0); + } + if (rc) { CERROR("failure %d inode %lu\n", rc, inode->i_ino); RETURN(-abs(rc)); @@ -1777,8 +1972,8 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it) } #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) -int ll_getattr(struct vfsmount *mnt, struct dentry *de, - struct lookup_intent *it, struct kstat *stat) +int ll_getattr_it(struct vfsmount *mnt, struct dentry *de, + struct lookup_intent *it, struct kstat *stat) { struct inode *inode = de->d_inode; int res = 0; @@ -1808,6 +2003,12 @@ int ll_getattr(struct vfsmount *mnt, struct dentry *de, return 0; } +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat) +{ + struct lookup_intent it = { .it_op = IT_GETATTR }; + + return ll_getattr_it(mnt, de, &it, stat); +} #endif static @@ -1901,7 +2102,7 @@ struct file_operations ll_file_operations = { .mmap = ll_file_mmap, .llseek = ll_file_seek, #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) - .sendfile = generic_file_sendfile, + .sendfile = ll_file_sendfile, #endif .fsync = ll_fsync, /* .lock = ll_file_flock */ @@ -1916,7 +2117,7 @@ struct file_operations ll_file_operations_flock = { .mmap = ll_file_mmap, .llseek = ll_file_seek, #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) - .sendfile = generic_file_sendfile, + .sendfile = ll_file_sendfile, #endif .fsync = ll_fsync, .lock = ll_file_flock @@ -1928,7 +2129,7 @@ struct inode_operations ll_file_inode_operations = { .setattr = ll_setattr, .truncate = 
ll_truncate, #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) - .getattr_it = ll_getattr, + .getattr_it = ll_getattr_it, #else .revalidate_it = ll_inode_revalidate_it, #endif diff --git a/lustre/llite/llite_close.c b/lustre/llite/llite_close.c index 29bce2e..1333abb 100644 --- a/lustre/llite/llite_close.c +++ b/lustre/llite/llite_close.c @@ -194,18 +194,12 @@ static int ll_close_thread(void *arg) struct ll_close_queue *lcq = arg; ENTRY; - /* XXX boiler-plate */ { - char name[sizeof(current->comm)]; - unsigned long flags; + char name[CFS_CURPROC_COMM_MAX]; snprintf(name, sizeof(name) - 1, "ll_close"); cfs_daemonize(name); - SIGNAL_MASK_LOCK(current, flags); - sigfillset(&current->blocked); - RECALC_SIGPENDING; - SIGNAL_MASK_UNLOCK(current, flags); } - + complete(&lcq->lcq_comp); while (1) { diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 9a47016..68c8658 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -78,11 +78,6 @@ struct ll_inode_info { /* for writepage() only to communicate to fsync */ int lli_async_rc; - struct file_operations *ll_save_ifop; - struct file_operations *ll_save_ffop; - struct file_operations *ll_save_wfop; - struct file_operations *ll_save_wrfop; - struct posix_acl *lli_posix_acl; struct list_head lli_dead_list; @@ -117,6 +112,10 @@ static inline struct ll_inode_info *ll_i2info(struct inode *inode) /* default to about 40meg of readahead on a given system. That much tied * up in 512k readahead requests serviced at 40ms each is about 1GB/s. 
*/ #define SBI_DEFAULT_READAHEAD_MAX (40UL << (20 - PAGE_CACHE_SHIFT)) + +/* default to read-ahead full files smaller than 2MB on the second read */ +#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_CACHE_SHIFT)) + enum ra_stat { RA_STAT_HIT = 0, RA_STAT_MISS, @@ -129,12 +128,14 @@ enum ra_stat { RA_STAT_ZERO_WINDOW, RA_STAT_EOF, RA_STAT_MAX_IN_FLIGHT, + RA_STAT_WRONG_GRAB_PAGE, _NR_RA_STAT, }; struct ll_ra_info { unsigned long ra_cur_pages; unsigned long ra_max_pages; + unsigned long ra_max_read_ahead_whole_pages; unsigned long ra_stats[_NR_RA_STAT]; }; @@ -211,7 +212,13 @@ struct ll_readahead_state { * case, it probably doesn't make sense to expand window to * PTLRPC_MAX_BRW_PAGES on the third access. */ - unsigned long ras_consecutive; + unsigned long ras_consecutive_pages; + /* + * number of read requests after the last read-ahead window reset + * As window is reset on each seek, this is effectively the number + * on consecutive read request and is used to trigger read-ahead. + */ + unsigned long ras_consecutive_requests; /* * Parameters of current read-ahead window. Handled by * ras_update(). On the initial access to the file or after a seek, @@ -229,6 +236,17 @@ struct ll_readahead_state { */ unsigned long ras_next_readahead; /* + * Total number of ll_file_read requests issued, reads originating + * due to mmap are not counted in this total. This value is used to + * trigger full file read-ahead after multiple reads to a small file. + */ + unsigned long ras_requests; + /* + * Page index with respect to the current request, these value + * will not be accurate when dealing with reads issued via mmap. + */ + unsigned long ras_request_index; + /* * list of struct ll_ra_read's one per read(2) call current in * progress against this file descriptor. Used by read-ahead code, * protected by ->ras_lock. 
@@ -377,11 +395,13 @@ int ll_lsm_getattr(struct obd_export *, struct lov_stripe_md *, struct obdo *); int ll_glimpse_size(struct inode *inode, int ast_flags); int ll_local_open(struct file *file, struct lookup_intent *it, struct ll_file_data *fd); +int ll_release_openhandle(struct dentry *, struct lookup_intent *); int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode, struct file *file); #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) -int ll_getattr(struct vfsmount *mnt, struct dentry *de, +int ll_getattr_it(struct vfsmount *mnt, struct dentry *de, struct lookup_intent *it, struct kstat *stat); +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat); #endif struct ll_file_data *ll_file_data_get(void); #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) @@ -433,6 +453,7 @@ int ll_obd_statfs(struct inode *inode, void *arg); int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize); /* llite/llite_nfs.c */ +extern struct export_operations lustre_export_operations; __u32 get_uuid2int(const char *name, int len); struct dentry *ll_fh_to_dentry(struct super_block *sb, __u32 *data, int len, int fhtype, int parent); @@ -493,7 +514,6 @@ int ll_tree_unlock(struct ll_lock_tree *tree); #define LL_MAX_BLKSIZE (4UL * 1024 * 1024) #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) -#define ll_s2sbi(sb) ((struct ll_sb_info *)((sb)->s_fs_info)) #define ll_s2sbi_nocast(sb) ((sb)->s_fs_info) void __d_rehash(struct dentry * entry, int lock); static inline __u64 ll_ts2u64(struct timespec *time) @@ -502,13 +522,13 @@ static inline __u64 ll_ts2u64(struct timespec *time) return t; } #else /* 2.4 here */ -#define ll_s2sbi(sb) ((struct ll_sb_info *)((sb)->u.generic_sbp)) #define ll_s2sbi_nocast(sb) ((sb)->u.generic_sbp) static inline __u64 ll_ts2u64(time_t *time) { return *time; } #endif +#define ll_s2sbi(sb) ((struct ll_sb_info *)ll_s2sbi_nocast(sb)) /* don't need an addref as the sb_info should be holding one */ static inline struct obd_export 
*ll_s2obdexp(struct super_block *sb) diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 3e79031..90dd73c 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -92,6 +92,8 @@ struct ll_sb_info *lustre_init_sbi(struct super_block *sb) sbi->ll_async_page_max = (num_physpages / 4) * 3; sbi->ll_ra_info.ra_max_pages = min(num_physpages / 8, SBI_DEFAULT_READAHEAD_MAX); + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = + SBI_DEFAULT_READAHEAD_WHOLE_MAX; INIT_LIST_HEAD(&sbi->ll_conn_chain); INIT_HLIST_HEAD(&sbi->ll_orphan_dentry_list); @@ -162,7 +164,7 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc) } /* indicate that inodebits locking is supported by this client */ - data->ocd_connect_flags |= OBD_CONNECT_IBITS; + data->ocd_connect_flags |= OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH; data->ocd_ibits_known = MDS_INODELOCK_FULL; if (sb->s_flags & MS_RDONLY) @@ -230,18 +232,18 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc) * on all clients. */ /* s_dev is also used in lt_compare() to compare two fs, but that is * only a node-local comparison. 
*/ - sb->s_dev = get_uuid2int(sbi2mdc(sbi)->cl_import->imp_target_uuid.uuid, - strlen(sbi2mdc(sbi)->cl_import->imp_target_uuid.uuid)); + sb->s_dev = get_uuid2int(sbi2mdc(sbi)->cl_target_uuid.uuid, + strlen(sbi2mdc(sbi)->cl_target_uuid.uuid)); #endif obd = class_name2obd(osc); if (!obd) { CERROR("OSC %s: not setup or attached\n", osc); - GOTO(out_mdc, err); + GOTO(out_mdc, err = -ENODEV); } data->ocd_connect_flags = - OBD_CONNECT_GRANT|OBD_CONNECT_VERSION|OBD_CONNECT_REQPORTAL; + OBD_CONNECT_GRANT | OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL; CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d " "ocd_grant: %d\n", data->ocd_connect_flags, @@ -288,6 +290,9 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc) sbi->ll_rootino = rootfid.id; sb->s_op = &lustre_super_operations; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) + sb->s_export_op = &lustre_export_operations; +#endif /* make root inode * XXX: move this to after cbd setup? */ @@ -731,9 +736,9 @@ static int lustre_process_log(struct lustre_mount_data *lmd, char * profile, } /* Try all connections, but only once. */ - rc = obd_set_info(obd->obd_self_export, - strlen("init_recov_bk"), "init_recov_bk", - sizeof(recov_bk), &recov_bk); + rc = obd_set_info_async(obd->obd_self_export, + strlen("init_recov_bk"), "init_recov_bk", + sizeof(recov_bk), &recov_bk, NULL); if (rc) GOTO(out_cleanup, rc); @@ -761,13 +766,17 @@ static int lustre_process_log(struct lustre_mount_data *lmd, char * profile, break; case -EINVAL: LCONSOLE_ERROR("%s: The configuration '%s' could not be read " - "from the MDS. Make sure this client and the " - "MDS are running compatible versions of " + "from the MDS '%s'. 
Make sure this client and " + "the MDS are running compatible versions of " "Lustre.\n", - obd->obd_name, profile); + obd->obd_name, profile, lmd->lmd_mds); /* fall through */ default: - CERROR("class_config_parse_llog failed: rc = %d\n", rc); + LCONSOLE_ERROR("%s: The configuration '%s' could not be read " + "from the MDS '%s'. This may be the result of " + "communication errors between the client and " + "the MDS, or if the MDS is not running.\n", + obd->obd_name, profile, lmd->lmd_mds); break; } @@ -1113,7 +1122,6 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr) LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime), CURRENT_SECONDS); - /* NB: ATTR_SIZE will only be set after this point if the size * resides on the MDS, ie, this file has no objects. */ if (lsm) @@ -1131,8 +1139,17 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr) if (rc) { ptlrpc_req_finished(request); - if (rc != -EPERM && rc != -EACCES) + if (rc == -ENOENT) { + inode->i_nlink = 0; + /* Unlinked special device node? Or just a race? + * Pretend we done everything. 
*/ + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode) && + !S_ISDIR(inode->i_mode)) + rc = inode_setattr(inode, attr); + } else if (rc != -EPERM && rc != -EACCES) { CERROR("mdc_setattr fails: rc = %d\n", rc); + } RETURN(rc); } @@ -1196,15 +1213,15 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr) if (attr->ia_size == 0) ast_flags = LDLM_AST_DISCARD_DATA; - up(&inode->i_sem); + UNLOCK_INODE_MUTEX(inode); UP_WRITE_I_ALLOC_SEM(inode); rc = ll_extent_lock(NULL, inode, lsm, LCK_PW, &policy, &lockh, ast_flags); #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) DOWN_WRITE_I_ALLOC_SEM(inode); - down(&inode->i_sem); + LOCK_INODE_MUTEX(inode); #else - down(&inode->i_sem); + LOCK_INODE_MUTEX(inode); DOWN_WRITE_I_ALLOC_SEM(inode); #endif if (rc != 0) @@ -1251,8 +1268,7 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr) int ll_setattr(struct dentry *de, struct iattr *attr) { - LBUG(); /* code is unused, but leave this in case of VFS changes */ - RETURN(-ENOSYS); + return ll_setattr_raw(de->d_inode, attr); } int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, @@ -1550,16 +1566,6 @@ void ll_read_inode2(struct inode *inode, void *opaque) #else init_special_inode(inode, inode->i_mode, inode->i_rdev); #endif - lli->ll_save_ifop = inode->i_fop; - - if (S_ISCHR(inode->i_mode)) - inode->i_fop = &ll_special_chr_inode_fops; - else if (S_ISBLK(inode->i_mode)) - inode->i_fop = &ll_special_blk_inode_fops; - else if (S_ISFIFO(inode->i_mode)) - inode->i_fop = &ll_special_fifo_inode_fops; - else if (S_ISSOCK(inode->i_mode)) - inode->i_fop = &ll_special_sock_inode_fops; EXIT; } } @@ -1599,7 +1605,7 @@ int ll_iocontrol(struct inode *inode, struct file *file, } case EXT3_IOC_SETFLAGS: { struct mdc_op_data op_data; - struct iattr attr; + struct ll_iattr_struct attr; struct obdo *oa; struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; @@ -1614,10 +1620,10 @@ int ll_iocontrol(struct inode *inode, struct file *file, memset(&attr, 0x0, 
sizeof(attr)); attr.ia_attr_flags = flags; - attr.ia_valid |= ATTR_ATTR_FLAG; + ((struct iattr *)&attr)->ia_valid |= ATTR_ATTR_FLAG; rc = mdc_setattr(sbi->ll_mdc_exp, &op_data, - &attr, NULL, 0, NULL, 0, &req); + (struct iattr *)&attr, NULL, 0, NULL, 0, &req); if (rc || lsm == NULL) { ptlrpc_req_finished(req); obdo_free(oa); @@ -1709,8 +1715,9 @@ int lustre_remount_fs(struct super_block *sb, int *flags, char *data) if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { read_only = *flags & MS_RDONLY; - err = obd_set_info(sbi->ll_mdc_exp, strlen("read-only"), - "read-only", sizeof(read_only), &read_only); + err = obd_set_info_async(sbi->ll_mdc_exp, strlen("read-only"), + "read-only", sizeof(read_only), + &read_only, NULL); if (err) { CERROR("Failed to change the read-only flag during " "remount: %d\n", err); @@ -1791,7 +1798,6 @@ int ll_obd_statfs(struct inode *inode, void *arg) struct ll_sb_info *sbi = NULL; struct obd_device *client_obd = NULL, *lov_obd = NULL; struct lov_obd *lov = NULL; - struct obd_import *client_imp = NULL; struct obd_statfs stat_buf = {0}; char *buf = NULL; struct obd_ioctl_data *data = NULL; @@ -1817,7 +1823,6 @@ int ll_obd_statfs(struct inode *inode, void *arg) if (index > 0) GOTO(out_statfs, rc = -ENODEV); client_obd = class_exp2obd(sbi->ll_mdc_exp); - client_imp = class_exp2cliimp(sbi->ll_mdc_exp); } else if (type == LL_STATFS_LOV) { lov_obd = class_exp2obd(sbi->ll_osc_exp); lov = &lov_obd->u.lov; @@ -1826,12 +1831,11 @@ int ll_obd_statfs(struct inode *inode, void *arg) GOTO(out_statfs, rc = -ENODEV); client_obd = class_exp2obd(lov->tgts[index].ltd_exp); - client_imp = class_exp2cliimp(lov->tgts[index].ltd_exp); if (!lov->tgts[index].active) GOTO(out_uuid, rc = -ENODATA); } - if (!client_obd || !client_imp) + if (!client_obd) GOTO(out_statfs, rc = -EINVAL); rc = obd_statfs(client_obd, &stat_buf, jiffies - 1); @@ -1842,7 +1846,7 @@ int ll_obd_statfs(struct inode *inode, void *arg) GOTO(out_statfs, rc = -EFAULT); out_uuid: - if 
(copy_to_user(data->ioc_pbuf2, &client_imp->imp_target_uuid, + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(client_obd), data->ioc_plen2)) rc = -EFAULT; diff --git a/lustre/llite/llite_mmap.c b/lustre/llite/llite_mmap.c index 8657ae3..06f23a1 100644 --- a/lustre/llite/llite_mmap.c +++ b/lustre/llite/llite_mmap.c @@ -406,7 +406,7 @@ struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, if (pgoff >= size) { lov_stripe_unlock(lsm); - ll_glimpse_size(inode, 0); + ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED); } else { /* XXX change inode size without ll_inode_size_lock() held! * there is a race condition with truncate path. (see @@ -493,6 +493,9 @@ static void ll_vm_close(struct vm_area_struct *vma) } #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +#ifndef HAVE_FILEMAP_POPULATE +static int (*filemap_populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock); +#endif static int ll_populate(struct vm_area_struct *area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock) @@ -599,6 +602,11 @@ int ll_file_mmap(struct file * file, struct vm_area_struct * vma) rc = generic_file_mmap(file, vma); if (rc == 0) { +#if !defined(HAVE_FILEMAP_POPULATE) && \ + (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) + if (!filemap_populate) + filemap_populate = vma->vm_ops->populate; +#endif vma->vm_ops = &ll_file_vm_ops; vma->vm_ops->open(vma); /* update the inode's size and mtime */ diff --git a/lustre/llite/llite_nfs.c b/lustre/llite/llite_nfs.c index b70ce8c..5fb7eef 100644 --- a/lustre/llite/llite_nfs.c +++ b/lustre/llite/llite_nfs.c @@ -101,7 +101,9 @@ static struct dentry *ll_iget_for_nfs(struct super_block *sb, unsigned long ino, { struct inode *inode; struct dentry *result; +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) struct list_head *lp; +#endif if (ino == 0) return ERR_PTR(-ESTALE); @@ -121,6 +123,13 @@ static struct dentry 
*ll_iget_for_nfs(struct super_block *sb, unsigned long ino, return ERR_PTR(-ESTALE); } +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) + result = d_alloc_anon(inode); + if (!result) { + iput(inode); + return ERR_PTR(-ENOMEM); + } +#else /* now to find a dentry. * If possible, get a well-connected one */ @@ -146,6 +155,7 @@ static struct dentry *ll_iget_for_nfs(struct super_block *sb, unsigned long ino, } result->d_flags |= DCACHE_DISCONNECTED; +#endif ll_set_dd(result); result->d_op = &ll_d_ops; return result; @@ -194,3 +204,57 @@ int ll_dentry_to_fh(struct dentry *dentry, __u32 *datap, int *lenp, *lenp = 3; return 1; } + +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) +struct dentry *ll_get_dentry(struct super_block *sb, void *data) +{ + __u32 *inump = (__u32*)data; + return ll_iget_for_nfs(sb, inump[0], inump[1], S_IFREG); +} + +struct dentry *ll_get_parent(struct dentry *dchild) +{ + struct ptlrpc_request *req = NULL; + struct inode *dir = dchild->d_inode; + struct ll_sb_info *sbi; + struct dentry *result = NULL; + struct ll_fid fid; + struct mds_body *body; + char dotdot[] = ".."; + int rc = 0; + ENTRY; + + LASSERT(dir && S_ISDIR(dir->i_mode)); + + sbi = ll_s2sbi(dir->i_sb); + + fid.id = (__u64)dir->i_ino; + fid.generation = dir->i_generation; + fid.f_type = S_IFDIR; + + rc = mdc_getattr_name(sbi->ll_mdc_exp, &fid, dotdot, strlen(dotdot) + 1, + 0, 0, &req); + if (rc) { + CERROR("failure %d inode %lu get parent\n", rc, dir->i_ino); + return ERR_PTR(rc); + } + body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body)); + + LASSERT((body->valid & OBD_MD_FLGENER) && (body->valid & OBD_MD_FLID)); + + result = ll_iget_for_nfs(dir->i_sb, body->ino, body->generation, S_IFDIR); + + if (IS_ERR(result)) + rc = PTR_ERR(result); + + ptlrpc_req_finished(req); + if (rc) + return ERR_PTR(rc); + RETURN(result); +} + +struct export_operations lustre_export_operations = { + .get_parent = ll_get_parent, + .get_dentry = ll_get_dentry, +}; +#endif diff --git 
a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c index 1e9f1fc..a2f90e4 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -201,7 +201,7 @@ static int ll_wr_max_readahead_mb(struct file *file, const char *buffer, return rc; if (val < 0 || val > (num_physpages >> (20 - PAGE_CACHE_SHIFT - 1))) { - CERROR("can't set readahead more than %lu MB\n", + CERROR("can't set file readahead more than %lu MB\n", num_physpages >> (20 - PAGE_CACHE_SHIFT - 1)); return -ERANGE; } @@ -213,6 +213,50 @@ static int ll_wr_max_readahead_mb(struct file *file, const char *buffer, return count; } +static int ll_rd_max_read_ahead_whole_mb(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + unsigned val; + + spin_lock(&sbi->ll_lock); + val = sbi->ll_ra_info.ra_max_read_ahead_whole_pages >> + (20 - PAGE_CACHE_SHIFT); + spin_unlock(&sbi->ll_lock); + + return snprintf(page, count, "%u\n", val); +} + +static int ll_wr_max_read_ahead_whole_mb(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + /* Cap this at the current max readahead window size, the readahead + * algorithm does this anyway so it's pointless to set it larger. 
*/ + if (val < 0 || + val > (sbi->ll_ra_info.ra_max_pages >> (20 - PAGE_CACHE_SHIFT))) { + CERROR("can't set max_read_ahead_whole_mb more than " + "max_read_ahead_mb: %lu\n", + sbi->ll_ra_info.ra_max_pages >> (20 - PAGE_CACHE_SHIFT)); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = + val << (20 - PAGE_CACHE_SHIFT); + spin_unlock(&sbi->ll_lock); + + return count; +} + static int ll_rd_max_cached_mb(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -280,8 +324,8 @@ static int ll_wr_checksum(struct file *file, const char *buffer, else sbi->ll_flags &= ~LL_SBI_CHECKSUM; - rc = obd_set_info(sbi->ll_osc_exp, strlen("checksum"), "checksum", - sizeof(val), &val); + rc = obd_set_info_async(sbi->ll_osc_exp, strlen("checksum"), "checksum", + sizeof(val), &val, NULL); if (rc) CWARN("Failed to set OSC checksum flags: %d\n", rc); @@ -301,6 +345,8 @@ static struct lprocfs_vars lprocfs_obd_vars[] = { //{ "filegroups", lprocfs_rd_filegroups, 0, 0 }, { "max_read_ahead_mb", ll_rd_max_readahead_mb, ll_wr_max_readahead_mb, 0 }, + { "max_read_ahead_whole_mb", ll_rd_max_read_ahead_whole_mb, + ll_wr_max_read_ahead_whole_mb, 0 }, { "max_cached_mb", ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 }, { "checksum_pages", ll_rd_checksum, ll_wr_checksum, 0 }, { 0 } @@ -692,6 +738,7 @@ static int ll_ra_stats_seq_show(struct seq_file *seq, void *v) [RA_STAT_ZERO_WINDOW] = "zero size window", [RA_STAT_EOF] = "read-ahead to EOF", [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue", + [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page", }; do_gettimeofday(&now); diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c index 19df9d9..eb57f8f 100644 --- a/lustre/llite/namei.c +++ b/lustre/llite/namei.c @@ -305,6 +305,12 @@ static void ll_d_add(struct dentry *de, struct inode *inode) __d_rehash(de, 0); } +/* 2.6.15 and prior versions have buggy d_instantiate_unique that leaks an inode + * if suitable alias is found. 
But we are not going to fix it by just freeing + * such inode, because if some vendor's kernel contains this bugfix already, + * we will break everything then. We will use our own reimplementation + * instead. */ +#if !defined(HAVE_D_ADD_UNIQUE) || (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)) /* Search "inode"'s alias list for a dentry that has the same name and parent as * de. If found, return it. If not found, return de. */ struct dentry *ll_find_alias(struct inode *inode, struct dentry *de) @@ -351,6 +357,21 @@ struct dentry *ll_find_alias(struct inode *inode, struct dentry *de) return de; } +#else +struct dentry *ll_find_alias(struct inode *inode, struct dentry *de) +{ + struct dentry *dentry; + + dentry = d_add_unique(de, inode); + if (dentry) { + lock_dentry(dentry); + dentry->d_flags &= ~DCACHE_LUSTRE_INVALID; + unlock_dentry(dentry); + } + + return dentry?dentry:de; +} +#endif static int lookup_it_finish(struct ptlrpc_request *request, int offset, struct lookup_intent *it, void *data) @@ -442,6 +463,11 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, GOTO(out, retval = ERR_PTR(rc)); } + if ((it->it_op & IT_OPEN) && dentry->d_inode && + !S_ISREG(dentry->d_inode->i_mode) && + !S_ISDIR(dentry->d_inode->i_mode)) { + ll_release_openhandle(dentry, it); + } ll_lookup_finish_locks(it, dentry); if (dentry == save) @@ -544,13 +570,6 @@ static int ll_create_it(struct inode *dir, struct dentry *dentry, int mode, RETURN(0); } -#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) -static int ll_create_nd(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) -{ - return ll_create_it(dir, dentry, mode, &nd->intent); -} -#endif - static void ll_update_times(struct ptlrpc_request *request, int offset, struct inode *inode) { @@ -569,17 +588,18 @@ static void ll_update_times(struct ptlrpc_request *request, int offset, LTIME_S(inode->i_ctime) = body->ctime; } -static int ll_mknod_raw(struct nameidata *nd, int mode, dev_t rdev) 
+static int ll_mknod_generic(struct inode *dir, struct qstr *name, int mode, + unsigned rdev, struct dentry *dchild) { struct ptlrpc_request *request = NULL; - struct inode *dir = nd->dentry->d_inode; + struct inode *inode = NULL; struct ll_sb_info *sbi = ll_i2sbi(dir); struct mdc_op_data op_data; int err; ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p) mode %o dev %x\n", - nd->last.len, nd->last.name, dir->i_ino, dir->i_generation, dir, + name->len, name->name, dir->i_ino, dir->i_generation, dir, mode, rdev); mode &= ~current->fs->umask; @@ -592,14 +612,23 @@ static int ll_mknod_raw(struct nameidata *nd, int mode, dev_t rdev) case S_IFBLK: case S_IFIFO: case S_IFSOCK: - ll_prepare_mdc_op_data(&op_data, dir, NULL, nd->last.name, - nd->last.len, 0); + ll_prepare_mdc_op_data(&op_data, dir, NULL, name->name, + name->len, 0); err = mdc_create(sbi->ll_mdc_exp, &op_data, NULL, 0, mode, current->fsuid, current->fsgid, current->cap_effective, rdev, &request); - if (err == 0) - ll_update_times(request, 0, dir); - ptlrpc_req_finished(request); + if (err) + break; + ll_update_times(request, 0, dir); + + if (dchild) { + err = ll_prep_inode(sbi->ll_osc_exp, &inode, request, 0, + dchild->d_sb); + if (err) + break; + + d_instantiate(dchild, inode); + } break; case S_IFDIR: err = -EPERM; @@ -607,64 +636,26 @@ static int ll_mknod_raw(struct nameidata *nd, int mode, dev_t rdev) default: err = -EINVAL; } + ptlrpc_req_finished(request); RETURN(err); } -static int ll_mknod(struct inode *dir, struct dentry *dchild, int mode, - ll_dev_t rdev) +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) +static int ll_create_nd(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) { - struct ptlrpc_request *request = NULL; - struct inode *inode = NULL; - struct ll_sb_info *sbi = ll_i2sbi(dir); - struct mdc_op_data op_data; - int err; - ENTRY; - - CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n", - dchild->d_name.len, dchild->d_name.name, - dir->i_ino, 
dir->i_generation, dir); - mode &= ~current->fs->umask; - - switch (mode & S_IFMT) { - case 0: - case S_IFREG: - mode |= S_IFREG; /* for mode = 0 case, fallthrough */ - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: - ll_prepare_mdc_op_data(&op_data, dir, NULL, dchild->d_name.name, - dchild->d_name.len, 0); - err = mdc_create(sbi->ll_mdc_exp, &op_data, NULL, 0, mode, - current->fsuid, current->fsgid, - current->cap_effective, rdev, &request); - if (err) - GOTO(out_err, err); - - ll_update_times(request, 0, dir); - - err = ll_prep_inode(sbi->ll_osc_exp, &inode, request, 0, - dchild->d_sb); - if (err) - GOTO(out_err, err); - break; - case S_IFDIR: - RETURN(-EPERM); - break; - default: - RETURN(-EINVAL); + if (!nd || !nd->intent.d.lustre.it_disposition) { + /* No saved request? Just mknod the file */ + return ll_mknod_generic(dir, &dentry->d_name, mode, 0, dentry); } - d_instantiate(dchild, inode); - out_err: - ptlrpc_req_finished(request); - RETURN(err); + return ll_create_it(dir, dentry, mode, &nd->intent); } +#endif -static int ll_symlink_raw(struct nameidata *nd, const char *tgt) +static int ll_symlink_generic(struct inode *dir, struct qstr *name, + const char *tgt) { - struct inode *dir = nd->dentry->d_inode; struct ptlrpc_request *request = NULL; struct ll_sb_info *sbi = ll_i2sbi(dir); struct mdc_op_data op_data; @@ -672,11 +663,11 @@ static int ll_symlink_raw(struct nameidata *nd, const char *tgt) ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),target=%s\n", - nd->last.len, nd->last.name, dir->i_ino, dir->i_generation, + name->len, name->name, dir->i_ino, dir->i_generation, dir, tgt); - ll_prepare_mdc_op_data(&op_data, dir, NULL, nd->last.name, - nd->last.len, 0); + ll_prepare_mdc_op_data(&op_data, dir, NULL, name->name, + name->len, 0); err = mdc_create(sbi->ll_mdc_exp, &op_data, tgt, strlen(tgt) + 1, S_IFLNK | S_IRWXUGO, current->fsuid, current->fsgid, current->cap_effective, @@ -688,10 +679,9 @@ static int ll_symlink_raw(struct 
nameidata *nd, const char *tgt) RETURN(err); } -static int ll_link_raw(struct nameidata *srcnd, struct nameidata *tgtnd) +static int ll_link_generic(struct inode *src, struct inode *dir, + struct qstr *name) { - struct inode *src = srcnd->dentry->d_inode; - struct inode *dir = tgtnd->dentry->d_inode; struct ptlrpc_request *request = NULL; struct mdc_op_data op_data; int err; @@ -701,10 +691,10 @@ static int ll_link_raw(struct nameidata *srcnd, struct nameidata *tgtnd) CDEBUG(D_VFSTRACE, "VFS Op: inode=%lu/%u(%p), dir=%lu/%u(%p), target=%.*s\n", src->i_ino, src->i_generation, src, dir->i_ino, - dir->i_generation, dir, tgtnd->last.len, tgtnd->last.name); + dir->i_generation, dir, name->len, name->name); - ll_prepare_mdc_op_data(&op_data, src, dir, tgtnd->last.name, - tgtnd->last.len, 0); + ll_prepare_mdc_op_data(&op_data, src, dir, name->name, + name->len, 0); err = mdc_link(sbi->ll_mdc_exp, &op_data, &request); if (err == 0) ll_update_times(request, 0, dir); @@ -714,54 +704,67 @@ static int ll_link_raw(struct nameidata *srcnd, struct nameidata *tgtnd) RETURN(err); } +static int ll_mkdir_generic(struct inode *dir, struct qstr *name, int mode, + struct dentry *dchild) -static int ll_mkdir_raw(struct nameidata *nd, int mode) { - struct inode *dir = nd->dentry->d_inode; struct ptlrpc_request *request = NULL; struct ll_sb_info *sbi = ll_i2sbi(dir); struct mdc_op_data op_data; + struct inode *inode = NULL; int err; ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n", - nd->last.len, nd->last.name, dir->i_ino, dir->i_generation, dir); + name->len, name->name, dir->i_ino, dir->i_generation, dir); mode = (mode & (S_IRWXUGO|S_ISVTX) & ~current->fs->umask) | S_IFDIR; - ll_prepare_mdc_op_data(&op_data, dir, NULL, nd->last.name, - nd->last.len, 0); + ll_prepare_mdc_op_data(&op_data, dir, NULL, name->name, + name->len, 0); err = mdc_create(sbi->ll_mdc_exp, &op_data, NULL, 0, mode, current->fsuid, current->fsgid, current->cap_effective, 0, &request); - if (err == 0) - 
ll_update_times(request, 0, dir); + if (err) + GOTO(out, err); + ll_update_times(request, 0, dir); + if (dchild) { + err = ll_prep_inode(sbi->ll_osc_exp, &inode, request, 0, + dchild->d_sb); + if (err) + GOTO(out, err); + d_instantiate(dchild, inode); + } + EXIT; +out: ptlrpc_req_finished(request); - RETURN(err); + return err; } -static int ll_rmdir_raw(struct nameidata *nd) +static int ll_rmdir_generic(struct inode *dir, struct dentry *dparent, + struct qstr *name) { - struct inode *dir = nd->dentry->d_inode; struct ptlrpc_request *request = NULL; struct mdc_op_data op_data; struct dentry *dentry; int rc; ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n", - nd->last.len, nd->last.name, dir->i_ino, dir->i_generation, dir); + name->len, name->name, dir->i_ino, dir->i_generation, dir); /* Check if we have something mounted at the dir we are going to delete * In such a case there would always be dentry present. */ - dentry = d_lookup(nd->dentry, &nd->last); - if (dentry) { - int mounted = d_mountpoint(dentry); - dput(dentry); - if (mounted) - RETURN(-EBUSY); + if (dparent) { + dentry = d_lookup(dparent, name); + if (dentry) { + int mounted = d_mountpoint(dentry); + dput(dentry); + if (mounted) + RETURN(-EBUSY); + } } - ll_prepare_mdc_op_data(&op_data, dir, NULL, nd->last.name, - nd->last.len, S_IFDIR); + ll_prepare_mdc_op_data(&op_data, dir, NULL, name->name, + name->len, S_IFDIR); rc = mdc_unlink(ll_i2sbi(dir)->ll_mdc_exp, &op_data, &request); if (rc == 0) ll_update_times(request, 0, dir); @@ -843,18 +846,17 @@ int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir) return rc; } -static int ll_unlink_raw(struct nameidata *nd) +static int ll_unlink_generic(struct inode * dir, struct qstr *name) { - struct inode *dir = nd->dentry->d_inode; struct ptlrpc_request *request = NULL; struct mdc_op_data op_data; int rc; ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n", - nd->last.len, nd->last.name, dir->i_ino, dir->i_generation, 
dir); + name->len, name->name, dir->i_ino, dir->i_generation, dir); - ll_prepare_mdc_op_data(&op_data, dir, NULL, nd->last.name, - nd->last.len, 0); + ll_prepare_mdc_op_data(&op_data, dir, NULL, name->name, + name->len, 0); rc = mdc_unlink(ll_i2sbi(dir)->ll_mdc_exp, &op_data, &request); if (rc) GOTO(out, rc); @@ -867,24 +869,23 @@ static int ll_unlink_raw(struct nameidata *nd) RETURN(rc); } -static int ll_rename_raw(struct nameidata *srcnd, struct nameidata *tgtnd) +static int ll_rename_generic(struct inode *src, struct qstr *src_name, + struct inode *tgt, struct qstr *tgt_name) { - struct inode *src = srcnd->dentry->d_inode; - struct inode *tgt = tgtnd->dentry->d_inode; struct ptlrpc_request *request = NULL; struct ll_sb_info *sbi = ll_i2sbi(src); struct mdc_op_data op_data; int err; ENTRY; CDEBUG(D_VFSTRACE,"VFS Op:oldname=%.*s,src_dir=%lu/%u(%p),newname=%.*s," - "tgt_dir=%lu/%u(%p)\n", srcnd->last.len, srcnd->last.name, - src->i_ino, src->i_generation, src, tgtnd->last.len, - tgtnd->last.name, tgt->i_ino, tgt->i_generation, tgt); + "tgt_dir=%lu/%u(%p)\n", src_name->len, src_name->name, + src->i_ino, src->i_generation, src, tgt_name->len, + tgt_name->name, tgt->i_ino, tgt->i_generation, tgt); ll_prepare_mdc_op_data(&op_data, src, tgt, NULL, 0, 0); err = mdc_rename(sbi->ll_mdc_exp, &op_data, - srcnd->last.name, srcnd->last.len, - tgtnd->last.name, tgtnd->last.len, &request); + src_name->name, src_name->len, + tgt_name->name, tgt_name->len, &request); if (!err) { ll_update_times(request, 0, src); ll_update_times(request, 0, tgt); @@ -896,6 +897,75 @@ static int ll_rename_raw(struct nameidata *srcnd, struct nameidata *tgtnd) RETURN(err); } +static int ll_mknod_raw(struct nameidata *nd, int mode, dev_t rdev) +{ + return ll_mknod_generic(nd->dentry->d_inode, &nd->last, mode,rdev,NULL); +} +static int ll_rename_raw(struct nameidata *srcnd, struct nameidata *tgtnd) +{ + return ll_rename_generic(srcnd->dentry->d_inode, &srcnd->last, + tgtnd->dentry->d_inode, 
&tgtnd->last); +} +static int ll_link_raw(struct nameidata *srcnd, struct nameidata *tgtnd) +{ + return ll_link_generic(srcnd->dentry->d_inode, tgtnd->dentry->d_inode, + &tgtnd->last); +} +static int ll_symlink_raw(struct nameidata *nd, const char *tgt) +{ + return ll_symlink_generic(nd->dentry->d_inode, &nd->last, tgt); +} +static int ll_rmdir_raw(struct nameidata *nd) +{ + return ll_rmdir_generic(nd->dentry->d_inode, nd->dentry, &nd->last); +} +static int ll_mkdir_raw(struct nameidata *nd, int mode) +{ + return ll_mkdir_generic(nd->dentry->d_inode, &nd->last, mode, NULL); +} +static int ll_unlink_raw(struct nameidata *nd) +{ + return ll_unlink_generic(nd->dentry->d_inode, &nd->last); +} + +static int ll_mknod(struct inode *dir, struct dentry *dchild, int mode, + ll_dev_t rdev) +{ + return ll_mknod_generic(dir, &dchild->d_name, mode, + old_encode_dev(rdev), dchild); +} + +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) +static int ll_unlink(struct inode * dir, struct dentry *dentry) +{ + return ll_unlink_generic(dir, &dentry->d_name); +} +static int ll_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + return ll_mkdir_generic(dir, &dentry->d_name, mode, dentry); +} +static int ll_rmdir(struct inode *dir, struct dentry *dentry) +{ + return ll_rmdir_generic(dir, NULL, &dentry->d_name); +} +static int ll_symlink(struct inode *dir, struct dentry *dentry, + const char *oldname) +{ + return ll_symlink_generic(dir, &dentry->d_name, oldname); +} +static int ll_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + return ll_link_generic(old_dentry->d_inode, dir, &new_dentry->d_name); +} +static int ll_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + return ll_rename_generic(old_dir, &old_dentry->d_name, new_dir, + &new_dentry->d_name); +} +#endif + struct inode_operations ll_dir_inode_operations = { .link_raw = ll_link_raw, .unlink_raw = ll_unlink_raw, @@ -914,7 
+984,16 @@ struct inode_operations ll_dir_inode_operations = { #else .lookup = ll_lookup_nd, .create = ll_create_nd, - .getattr_it = ll_getattr, + .getattr_it = ll_getattr_it, + /* We need all these non-raw things for NFSD, to not patch it. */ + .unlink = ll_unlink, + .mkdir = ll_mkdir, + .rmdir = ll_rmdir, + .symlink = ll_symlink, + .link = ll_link, + .rename = ll_rename, + .setattr = ll_setattr, + .getattr = ll_getattr, #endif .permission = ll_inode_permission, .setxattr = ll_setxattr, @@ -922,3 +1001,18 @@ struct inode_operations ll_dir_inode_operations = { .listxattr = ll_listxattr, .removexattr = ll_removexattr, }; + +struct inode_operations ll_special_inode_operations = { + .setattr_raw = ll_setattr_raw, + .setattr = ll_setattr, +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) + .getattr_it = ll_getattr_it, +#else + .revalidate_it = ll_inode_revalidate_it, +#endif + .permission = ll_inode_permission, + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .listxattr = ll_listxattr, + .removexattr = ll_removexattr, +}; diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 5f84234..0a54eca 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -102,7 +102,7 @@ static int ll_brw(int cmd, struct inode *inode, struct obdo *oa, /* this isn't where truncate starts. roughly: * sys_truncate->ll_setattr_raw->vmtruncate->ll_truncate. setattr_raw grabs - * DLM lock on [size, EOF], i_sem, ->lli_size_sem, and WRITE_I_ALLOC_SEM to + * DLM lock on [size, EOF], i_mutex, ->lli_size_sem, and WRITE_I_ALLOC_SEM to * avoid races. 
* * must be called under ->lli_size_sem */ @@ -390,11 +390,11 @@ static struct obd_async_page_ops ll_async_page_ops = { struct ll_async_page *llap_cast_private(struct page *page) { - struct ll_async_page *llap = (struct ll_async_page *)page->private; + struct ll_async_page *llap = (struct ll_async_page *)page_private(page); LASSERTF(llap == NULL || llap->llap_magic == LLAP_MAGIC, "page %p private %lu gave magic %d which != %d\n", - page, page->private, llap->llap_magic, LLAP_MAGIC); + page, page_private(page), llap->llap_magic, LLAP_MAGIC); return llap; } @@ -518,10 +518,22 @@ static struct ll_async_page *llap_from_page(struct page *page, unsigned origin) struct ll_async_page *llap; struct obd_export *exp; struct inode *inode = page->mapping->host; - struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_sb_info *sbi; int rc; ENTRY; + if (!inode) { + static int triggered; + + if (!triggered) { + LL_CDEBUG_PAGE(D_ERROR, page, "Bug 10047. Wrong anon " + "page received\n"); + libcfs_debug_dumpstack(NULL); + triggered = 1; + } + RETURN(ERR_PTR(-EINVAL)); + } + sbi = ll_i2sbi(inode); LASSERT(ll_async_page_slab); LASSERTF(origin < LLAP__ORIGIN_MAX, "%u\n", origin); @@ -846,7 +858,7 @@ void ll_removepage(struct page *page) /* sync pages or failed read pages can leave pages in the page * cache that don't have our data associated with them anymore */ - if (page->private == 0) { + if (page_private(page) == 0) { EXIT; return; } @@ -958,10 +970,12 @@ void ll_ra_accounting(struct ll_async_page *llap, struct address_space *mapping) } #define RAS_CDEBUG(ras) \ - CDEBUG(D_READA, "lrp %lu c %lu ws %lu wl %lu nra %lu\n", \ - ras->ras_last_readpage, ras->ras_consecutive, \ - ras->ras_window_start, ras->ras_window_len, \ - ras->ras_next_readahead); + CDEBUG(D_READA, \ + "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu\n", \ + ras->ras_last_readpage, ras->ras_consecutive_requests, \ + ras->ras_consecutive_pages, ras->ras_window_start, \ + ras->ras_window_len, 
ras->ras_next_readahead, \ + ras->ras_requests, ras->ras_request_index); static int index_in_window(unsigned long index, unsigned long point, unsigned long before, unsigned long after) @@ -989,9 +1003,13 @@ void ll_ra_read_in(struct file *f, struct ll_ra_read *rar) struct ll_readahead_state *ras; ras = ll_ras_get(f); - rar->lrr_reader = current; spin_lock(&ras->ras_lock); + ras->ras_requests++; + ras->ras_request_index = 0; + ras->ras_consecutive_requests++; + rar->lrr_reader = current; + list_add(&rar->lrr_linkage, &ras->ras_read_beads); spin_unlock(&ras->ras_lock); } @@ -1062,34 +1080,19 @@ static int ll_readahead(struct ll_readahead_state *ras, spin_lock(&ras->ras_lock); bead = ll_ra_read_get_locked(ras); - /* reserve a part of the read-ahead window that we'll be issuing */ + /* Enlarge the RA window to encompass the full read */ + if (bead != NULL && ras->ras_window_start + ras->ras_window_len < + bead->lrr_start + bead->lrr_count) { + ras->ras_window_len = bead->lrr_start + bead->lrr_count - + ras->ras_window_start; + } + /* Reserve a part of the read-ahead window that we'll be issuing */ if (ras->ras_window_len) { start = ras->ras_next_readahead; end = ras->ras_window_start + ras->ras_window_len - 1; } - if (bead != NULL) { - pgoff_t read_end; - - start = max(start, bead->lrr_start); - read_end = bead->lrr_start + bead->lrr_count - 1; - if (ras->ras_consecutive > start - bead->lrr_start + 1) - /* - * if current read(2) is a part of larger sequential - * read, make sure read-ahead is at least to the end - * of the read region. - * - * XXX nikita: This doesn't work when some pages in - * [lrr_start, start] were cached (and, as a result, - * weren't counted in ->ras_consecutive). - */ - end = max(end, read_end); - else - /* - * otherwise, clip read-ahead at the read boundary. 
- */ - end = read_end; - } if (end != 0) { + /* Truncate RA window to end of file */ end = min(end, (unsigned long)((kms - 1) >> PAGE_CACHE_SHIFT)); ras->ras_next_readahead = max(end, end + 1); RAS_CDEBUG(ras); @@ -1119,6 +1122,13 @@ static int ll_readahead(struct ll_readahead_state *ras, continue; } + /* Check if page was truncated or reclaimed */ + if (page->mapping != mapping) { + ll_ra_stats_inc(mapping, RA_STAT_WRONG_GRAB_PAGE); + CDEBUG(D_READA, "g_c_p_n returned invalid page\n"); + goto next_page; + } + /* we do this first so that we can see the page in the /proc * accounting */ llap = llap_from_page(page, LLAP_ORIGIN_READAHEAD); @@ -1189,10 +1199,11 @@ static void ras_set_start(struct ll_readahead_state *ras, unsigned long index) static void ras_reset(struct ll_readahead_state *ras, unsigned long index) { ras->ras_last_readpage = index; - ras->ras_consecutive = 1; + ras->ras_consecutive_requests = 0; + ras->ras_consecutive_pages = 0; ras->ras_window_len = 0; ras_set_start(ras, index); - ras->ras_next_readahead = ras->ras_window_start; + ras->ras_next_readahead = max(ras->ras_window_start, index); RAS_CDEBUG(ras); } @@ -1201,11 +1212,13 @@ void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras) { spin_lock_init(&ras->ras_lock); ras_reset(ras, 0); + ras->ras_requests = 0; INIT_LIST_HEAD(&ras->ras_read_beads); } -static void ras_update(struct ll_sb_info *sbi, struct ll_readahead_state *ras, - unsigned long index, unsigned hit) +static void ras_update(struct ll_sb_info *sbi, struct inode *inode, + struct ll_readahead_state *ras, unsigned long index, + unsigned hit) { struct ll_ra_info *ra = &sbi->ll_ra_info; int zero = 0; @@ -1232,36 +1245,62 @@ static void ras_update(struct ll_sb_info *sbi, struct ll_readahead_state *ras, ll_ra_stats_inc_unlocked(ra, RA_STAT_MISS_IN_WINDOW); } + /* On the second access to a file smaller than the tunable + * ra_max_read_ahead_whole_pages trigger RA on all pages in the + * file up to ra_max_pages. 
This is simply a best effort and + * only occurs once per open file. Normal RA behavior is reverted + * to for subsequent IO. The mmap case does not increment + * ras_requests and thus can never trigger this behavior. */ + if (ras->ras_requests == 2 && !ras->ras_request_index) { + __u64 kms_pages; + + kms_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + CDEBUG(D_READA, "kmsp %llu mwp %lu mp %lu\n", kms_pages, + ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages); + + if (kms_pages && + kms_pages <= ra->ra_max_read_ahead_whole_pages) { + ras->ras_window_start = 0; + ras->ras_last_readpage = 0; + ras->ras_next_readahead = 0; + ras->ras_window_len = min(ra->ra_max_pages, + ra->ra_max_read_ahead_whole_pages); + GOTO(out_unlock, 0); + } + } + if (zero) { ras_reset(ras, index); GOTO(out_unlock, 0); } ras->ras_last_readpage = index; - ras->ras_consecutive++; + ras->ras_consecutive_pages++; ras_set_start(ras, index); ras->ras_next_readahead = max(ras->ras_window_start, ras->ras_next_readahead); - /* wait for a few pages to arrive before issuing readahead to avoid - * the worst overutilization */ - if (ras->ras_consecutive == 3) { + /* Trigger RA in the mmap case where ras_consecutive_requests + * is not incremented and thus can't be used to trigger RA */ + if (!ras->ras_window_len && ras->ras_consecutive_pages == 3) { ras->ras_window_len = PTLRPC_MAX_BRW_PAGES; GOTO(out_unlock, 0); } - /* we need to increase the window sometimes. we'll arbitrarily - * do it half-way through the pages in an rpc */ - if ((index & (PTLRPC_MAX_BRW_PAGES - 1)) == - (PTLRPC_MAX_BRW_PAGES >> 1)) { - ras->ras_window_len += PTLRPC_MAX_BRW_PAGES; - ras->ras_window_len = min(ras->ras_window_len, + /* The initial ras_window_len is set to the request size. To avoid + * uselessly reading and discarding pages for random IO the window is + * only increased once per consecutive request received. 
*/ + if (ras->ras_consecutive_requests > 1 && !ras->ras_request_index) { + ras->ras_window_len = min(ras->ras_window_len + + PTLRPC_MAX_BRW_PAGES, ra->ra_max_pages); } EXIT; out_unlock: RAS_CDEBUG(ras); + ras->ras_request_index++; spin_unlock(&ras->ras_lock); spin_unlock(&sbi->ll_lock); return; @@ -1337,6 +1376,17 @@ int ll_readpage(struct file *filp, struct page *page) (((loff_t)page->index) << PAGE_SHIFT)); LASSERT(atomic_read(&filp->f_dentry->d_inode->i_count) > 0); + if (!ll_i2info(inode)->lli_smd) { + /* File with no objects - one big hole */ + /* We use this just for remove_from_page_cache that is not + * exported, we'd make page back up to date. */ + ll_truncate_complete_page(page); + clear_page(page); + SetPageUptodate(page); + unlock_page(page); + RETURN(0); + } + rc = oig_init(&oig); if (rc < 0) GOTO(out, rc); @@ -1350,7 +1400,7 @@ int ll_readpage(struct file *filp, struct page *page) GOTO(out, rc = PTR_ERR(llap)); if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages) - ras_update(ll_i2sbi(inode), &fd->fd_ras, page->index, + ras_update(ll_i2sbi(inode), inode, &fd->fd_ras, page->index, llap->llap_defer_uptodate); if (llap->llap_defer_uptodate) { @@ -1366,17 +1416,19 @@ int ll_readpage(struct file *filp, struct page *page) GOTO(out_oig, rc = 0); } - rc = ll_page_matches(page, fd->fd_flags); - if (rc < 0) { - LL_CDEBUG_PAGE(D_ERROR, page, "lock match failed: rc %d\n", rc); - GOTO(out, rc); - } + if (likely((fd->fd_flags & LL_FILE_IGNORE_LOCK) == 0)) { + rc = ll_page_matches(page, fd->fd_flags); + if (rc < 0) { + LL_CDEBUG_PAGE(D_ERROR, page, "lock match failed: rc %d\n", rc); + GOTO(out, rc); + } - if (rc == 0) { - CWARN("ino %lu page %lu (%llu) not covered by " - "a lock (mmap?). check debug logs.\n", - inode->i_ino, page->index, - (long long)page->index << PAGE_CACHE_SHIFT); + if (rc == 0) { + CWARN("ino %lu page %lu (%llu) not covered by " + "a lock (mmap?). 
check debug logs.\n", + inode->i_ino, page->index, + (long long)page->index << PAGE_CACHE_SHIFT); + } } rc = ll_issue_page_read(exp, llap, oig, 0); diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c index 222e779a..df1c812 100644 --- a/lustre/llite/rw26.c +++ b/lustre/llite/rw26.c @@ -67,7 +67,7 @@ static int ll_invalidatepage(struct page *page, unsigned long offset) return 1; } -static int ll_releasepage(struct page *page, int gfp_mask) +static int ll_releasepage(struct page *page, gfp_t gfp_mask) { if (PagePrivate(page)) ll_removepage(page); diff --git a/lustre/llite/special.c b/lustre/llite/special.c deleted file mode 100644 index 9410fb0..0000000 --- a/lustre/llite/special.c +++ /dev/null @@ -1,391 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Special file handling for Lustre. - * - * Copyright (c) 2002, 2003 Cluster File Systems, Inc. - * Author: Wang Di - * Author: Andreas Dilger - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#define DEBUG_SUBSYSTEM S_LLITE -#include -#include -#include -#include -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -#include -#endif -#include -#include "llite_internal.h" - -#define INODE_OPS 1 -#define FILE_OPS 2 - -static struct file_operations **get_save_fops(struct file* filp, int mode) -{ - struct inode *inode = filp->f_dentry->d_inode; - struct ll_inode_info *lli = ll_i2info(inode); - - if (mode == INODE_OPS) { - return &(lli->ll_save_ifop); - } else if (mode == FILE_OPS) { - if (S_ISFIFO(inode->i_mode)) { - switch (filp->f_mode) { - case 1: /*O_RDONLY*/ - return &(lli->ll_save_ffop); - case 2: /*O_WRONLY*/ - return &(lli->ll_save_wfop); - case 3: /* O_RDWR */ - return &(lli->ll_save_wrfop); - default: - return NULL; - } - } - return &(lli->ll_save_ffop); - } else { - CERROR("invalid special file ops %d\n", mode); - LBUG(); - return NULL; - } -} - -static void save_fops(struct file *filp, struct inode *inode, - struct file_operations *sfops) -{ - if (sfops != filp->f_op) { - struct file_operations **pfop = get_save_fops(filp, FILE_OPS); - - *pfop = filp->f_op; - if (S_ISCHR(inode->i_mode)) - filp->f_op = &ll_special_chr_file_fops; - else if (S_ISFIFO(inode->i_mode)) - filp->f_op = &ll_special_fifo_file_fops; - } -} - -static ssize_t ll_special_file_read(struct file *filp, char *buf, - size_t count, loff_t *ppos) -{ - struct file_operations **pfop = get_save_fops(filp, FILE_OPS); - int rc = -EINVAL; - - if (pfop && *pfop && (*pfop)->read) - rc = (*pfop)->read(filp, buf, count, ppos); - - RETURN(rc); -} - -static ssize_t ll_special_file_write(struct file *filp, const char *buf, - size_t count, loff_t *ppos) -{ - struct file_operations **pfop = get_save_fops(filp, FILE_OPS); - int rc = -EINVAL; - - if (pfop && *pfop && (*pfop)->write) - rc = (*pfop)->write(filp, buf, count, ppos); - - RETURN(rc); -} - -static int ll_special_file_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - struct file_operations **pfop 
= get_save_fops(filp, FILE_OPS); - int rc = -ENOTTY; - - if (pfop && *pfop && (*pfop)->ioctl) { - struct file_operations *sfops = filp->f_op; - - rc = (*pfop)->ioctl(inode, filp, cmd, arg); - save_fops(filp, inode, sfops); - } - RETURN(rc); -} - -static loff_t ll_special_file_seek(struct file *filp, loff_t offset, int origin) -{ - struct file_operations **pfop = get_save_fops(filp, FILE_OPS); - int rc = 0; - - if (pfop && *pfop && (*pfop)->llseek) - rc = (*pfop)->llseek(filp, offset, origin); - else - rc = default_llseek(filp, offset, origin); - - RETURN(rc); -} - - -#define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM) - -static unsigned int ll_special_file_poll(struct file *filp, - struct poll_table_struct *poll_table) -{ - struct file_operations **pfop = get_save_fops(filp, FILE_OPS); - int rc = DEFAULT_POLLMASK; - - if (pfop && *pfop && (*pfop)->poll) - rc = (*pfop)->poll(filp, poll_table); - - RETURN(rc); -} - -static int ll_special_file_open(struct inode *inode, struct file *filp) -{ - struct file_operations **pfop = get_save_fops(filp, FILE_OPS); - int rc = -EINVAL; - - if (pfop && *pfop && (*pfop)->open) - rc = (*pfop)->open(inode, filp); - - RETURN(rc); -} - -static ssize_t ll_special_read(struct file *filp, char *buf, size_t count, - loff_t *ppos) -{ - struct file_operations **pfop = get_save_fops(filp, INODE_OPS); - int rc = -EINVAL; - - if (pfop && *pfop && (*pfop)->read) - rc = (*pfop)->read(filp, buf, count, ppos); - - RETURN(rc); -} - -static ssize_t ll_special_write(struct file *filp, const char *buf, - size_t count, loff_t *ppos) -{ - struct file_operations **pfop = get_save_fops(filp, INODE_OPS); - int rc = -EINVAL; - - if (pfop && *pfop && (*pfop)->write) - rc = (*pfop)->write(filp, buf, count, ppos); - - RETURN(rc); -} - -static int ll_special_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - struct file_operations **pfop = get_save_fops(filp, INODE_OPS); - int rc = -ENOTTY; - - if (pfop && 
*pfop && (*pfop)->ioctl) { - struct file_operations *sfops = filp->f_op; - - rc = (*pfop)->ioctl(inode, filp, cmd, arg); - - /* sometimes, file_operations will be changed in ioctl */ - save_fops(filp, inode, sfops); - } - - RETURN(rc); -} - -static int ll_special_mmap(struct file * filp, struct vm_area_struct * vma) -{ - struct file_operations **pfop = get_save_fops(filp, INODE_OPS); - int rc = -ENODEV; - - if (pfop && *pfop && (*pfop)->mmap) - rc = (*pfop)->mmap(filp, vma); - - RETURN(rc); -} - -static loff_t ll_special_seek(struct file *filp, loff_t offset, int origin) -{ - struct file_operations** pfop = get_save_fops (filp, INODE_OPS); - int rc = 0; - - if (pfop && *pfop && (*pfop)->llseek) - rc = (*pfop)->llseek(filp, offset, origin); - else - rc = default_llseek(filp, offset, origin); - - RETURN(rc); -} - -static int ll_special_fsync(struct file *filp, struct dentry *dentry, int data) -{ - struct file_operations **pfop = get_save_fops(filp, INODE_OPS); - int rc = -EINVAL; - - if (pfop && *pfop && (*pfop)->fsync) - rc = (*pfop)->fsync(filp, dentry, data); - - RETURN(rc); -} - -static int ll_special_file_fasync(int fd, struct file *filp, int on) -{ - struct file_operations **pfop = get_save_fops(filp, FILE_OPS); - int rc = -EINVAL; - - if (pfop && *pfop && (*pfop)->fasync) - rc = (*pfop)->fasync(fd, filp, on); - - RETURN(rc); -} - -static int ll_special_release_internal(struct inode *inode, struct file *filp, - int mode) -{ - struct file_operations **pfop = get_save_fops(filp, mode); - struct ll_sb_info *sbi = ll_i2sbi(inode); - int rc = 0, err; - ENTRY; - - if (pfop && *pfop) { - if ((*pfop)->release) - rc = (*pfop)->release(inode, filp); - /* FIXME fops_put */ - } - - lprocfs_counter_incr(sbi->ll_stats, LPROC_LL_RELEASE); - - err = ll_mdc_close(sbi->ll_mdc_exp, inode, filp); - if (err && rc == 0) - rc = err; - - RETURN(rc); -} - -static int ll_special_open(struct inode *inode, struct file *filp) -{ - struct file_operations **pfop = get_save_fops(filp, 
INODE_OPS); - struct file_operations *sfops = filp->f_op; - struct ptlrpc_request *req; - struct lookup_intent *it; - struct ll_file_data *fd; - int rc = -EINVAL, err; - ENTRY; - - fd = ll_file_data_get(); - if (fd == NULL) - RETURN(-ENOMEM); - - if (pfop && *pfop) { - /* FIXME fops_get */ - if ((*pfop)->open) { - rc = (*pfop)->open(inode, filp); - - /* sometimes file_operations will be changed in open */ - save_fops(filp, inode, sfops); - } - } - - lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_OPEN); - - it = filp->f_it; - - err = ll_local_open(filp, it, fd); - if (rc != 0) { - CERROR("error opening special file: rc %d\n", rc); - ll_mdc_close(ll_i2sbi(inode)->ll_mdc_exp, inode, filp); - } else if (err) { - if (pfop && *pfop && (*pfop)->release) - (*pfop)->release(inode, filp); - /* FIXME fops_put */ - rc = err; - } - - req = it->d.lustre.it_data; - if (req) - ptlrpc_req_finished(req); - - RETURN(rc); -} - -static int ll_special_release(struct inode *inode, struct file *filp) -{ - return ll_special_release_internal(inode, filp, INODE_OPS); -} - -static int ll_special_file_release(struct inode *inode, struct file *filp) -{ - return ll_special_release_internal(inode, filp, FILE_OPS); -} - -struct inode_operations ll_special_inode_operations = { - .setattr_raw = ll_setattr_raw, - .setattr = ll_setattr, -#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) - .getattr_it = ll_getattr, -#else - .revalidate_it = ll_inode_revalidate_it, -#endif - .permission = ll_inode_permission, - .setxattr = ll_setxattr, - .getxattr = ll_getxattr, - .listxattr = ll_listxattr, - .removexattr = ll_removexattr, -}; - -struct file_operations ll_special_chr_inode_fops = { - .owner = THIS_MODULE, - .open = ll_special_open, -}; - -struct file_operations ll_special_blk_inode_fops = { - .owner = THIS_MODULE, - .read = ll_special_read, - .write = ll_special_write, - .ioctl = ll_special_ioctl, - .open = ll_special_open, - .release = ll_special_release, - .mmap = ll_special_mmap, - .llseek = 
ll_special_seek, - .fsync = ll_special_fsync, -}; - -struct file_operations ll_special_fifo_inode_fops = { - .owner = THIS_MODULE, - .open = ll_special_open, -}; - -struct file_operations ll_special_sock_inode_fops = { - .owner = THIS_MODULE, - .open = ll_special_open -}; - -struct file_operations ll_special_chr_file_fops = { - .owner = THIS_MODULE, - .llseek = ll_special_file_seek, - .read = ll_special_file_read, - .write = ll_special_file_write, - .poll = ll_special_file_poll, - .ioctl = ll_special_file_ioctl, - .open = ll_special_file_open, - .release = ll_special_file_release, - .fasync = ll_special_file_fasync, -}; - -struct file_operations ll_special_fifo_file_fops = { - .owner = THIS_MODULE, - .llseek = ll_special_file_seek, - .read = ll_special_file_read, - .write = ll_special_file_write, - .poll = ll_special_file_poll, - .ioctl = ll_special_file_ioctl, - .open = ll_special_file_open, - .release = ll_special_file_release, -}; - diff --git a/lustre/llite/symlink.c b/lustre/llite/symlink.c index 51050d6..c663371 100644 --- a/lustre/llite/symlink.c +++ b/lustre/llite/symlink.c @@ -159,7 +159,7 @@ struct inode_operations ll_fast_symlink_inode_operations = { #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) .revalidate_it = ll_inode_revalidate_it, #else - .getattr_it = ll_getattr, + .getattr_it = ll_getattr_it, #endif .permission = ll_inode_permission, .setxattr = ll_setxattr, diff --git a/lustre/llite/xattr.c b/lustre/llite/xattr.c index 39c1c33..44664a7 100644 --- a/lustre/llite/xattr.c +++ b/lustre/llite/xattr.c @@ -46,15 +46,18 @@ #define XATTR_USER_T (1) #define XATTR_TRUSTED_T (2) #define XATTR_SECURITY_T (3) -#define XATTR_ACL_T (4) -#define XATTR_OTHER_T (5) +#define XATTR_ACL_ACCESS_T (4) +#define XATTR_ACL_DEFAULT_T (5) +#define XATTR_OTHER_T (6) static int get_xattr_type(const char *name) { - if (!strcmp(name, XATTR_NAME_ACL_ACCESS) || - !strcmp(name, XATTR_NAME_ACL_DEFAULT)) - return XATTR_ACL_T; + if (!strcmp(name, XATTR_NAME_ACL_ACCESS)) + return 
XATTR_ACL_ACCESS_T; + + if (!strcmp(name, XATTR_NAME_ACL_DEFAULT)) + return XATTR_ACL_DEFAULT_T; if (!strncmp(name, XATTR_USER_PREFIX, sizeof(XATTR_USER_PREFIX) - 1)) @@ -74,8 +77,11 @@ int get_xattr_type(const char *name) static int xattr_type_filter(struct ll_sb_info *sbi, int xattr_type) { - if (xattr_type == XATTR_ACL_T && !(sbi->ll_flags & LL_SBI_ACL)) + if ((xattr_type == XATTR_ACL_ACCESS_T || + xattr_type == XATTR_ACL_DEFAULT_T) && + !(sbi->ll_flags & LL_SBI_ACL)) return -EOPNOTSUPP; + if (xattr_type == XATTR_USER_T && !(sbi->ll_flags & LL_SBI_USER_XATTR)) return -EOPNOTSUPP; if (xattr_type == XATTR_TRUSTED_T && !capable(CAP_SYS_ADMIN)) @@ -180,6 +186,26 @@ int ll_getxattr_common(struct inode *inode, const char *name, if (rc) RETURN(rc); + /* posix acl is under protection of LOOKUP lock. when calling to this, + * we just have path resolution to the target inode, so we have great + * chance that cached ACL is uptodate. + */ + if (xattr_type == XATTR_ACL_ACCESS_T) { + struct ll_inode_info *lli = ll_i2info(inode); + struct posix_acl *acl; + + spin_lock(&lli->lli_lock); + acl = posix_acl_dup(lli->lli_posix_acl); + spin_unlock(&lli->lli_lock); + + if (!acl) + RETURN(-ENODATA); + + rc = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + RETURN(rc); + } + do_getxattr: ll_inode2fid(&fid, inode); rc = mdc_getxattr(sbi->ll_mdc_exp, &fid, valid, name, NULL, 0, diff --git a/lustre/lov/lov_merge.c b/lustre/lov/lov_merge.c index 62e1956..ff20962 100644 --- a/lustre/lov/lov_merge.c +++ b/lustre/lov/lov_merge.c @@ -111,11 +111,11 @@ int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm, for (loi = lsm->lsm_oinfo; stripe < lsm->lsm_stripe_count; stripe++, loi++) { kms = lov_size_to_stripe(lsm, size, stripe); - loi->loi_kms = loi->loi_lvb.lvb_size = kms; CDEBUG(D_INODE, "stripe %d KMS %sing "LPU64"->"LPU64"\n", stripe, kms > loi->loi_kms ? 
"increas":"shrink", loi->loi_kms, kms); + loi->loi_kms = loi->loi_lvb.lvb_size = kms; } RETURN(0); } diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 9572772..4fcc9d1 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -396,7 +396,7 @@ static int lov_notify(struct obd_device *obd, struct obd_device *watched, watched->obd_name); RETURN(-EINVAL); } - uuid = &watched->u.cli.cl_import->imp_target_uuid; + uuid = &watched->u.cli.cl_target_uuid; if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) { /* Set OSC as active before notifying the observer, so the @@ -520,8 +520,8 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) if (rc) GOTO(out, rc); - rc = obd_set_info(obd->obd_observer->obd_self_export, - strlen("next_id"),"next_id", 2, params); + rc = obd_set_info_async(obd->obd_observer->obd_self_export, + strlen("next_id"),"next_id", 2, params, NULL); if (rc) GOTO(out, rc); @@ -703,7 +703,7 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf) RETURN(0); } -static int lov_precleanup(struct obd_device *obd, int stage) +static int lov_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) { int rc = 0; ENTRY; @@ -720,10 +720,15 @@ static int lov_precleanup(struct obd_device *obd, int stage) } break; } + case OBD_CLEANUP_EXPORTS: + break; case OBD_CLEANUP_SELF_EXP: rc = obd_llog_finish(obd, 0); if (rc != 0) CERROR("failed to cleanup llogging subsystems\n"); + break; + case OBD_CLEANUP_OBD: + break; } RETURN(rc); } @@ -2150,14 +2155,22 @@ out: RETURN(rc); } -static int lov_set_info(struct obd_export *exp, obd_count keylen, - void *key, obd_count vallen, void *val) +static int lov_set_info_async(struct obd_export *exp, obd_count keylen, + void *key, obd_count vallen, void *val, + struct ptlrpc_request_set *set) { struct obd_device *obddev = class_exp2obd(exp); struct lov_obd *lov = &obddev->u.lov; int i, rc = 0, err; + int no_set = !set; ENTRY; + if (no_set) { + set = 
ptlrpc_prep_set(); + if (!set) + RETURN(-ENOMEM); + } + if (KEY_IS("next_id")) { if (vallen != lov->desc.ld_tgt_count) RETURN(-EINVAL); @@ -2173,8 +2186,9 @@ static int lov_set_info(struct obd_export *exp, obd_count keylen, continue; /* hit all OSCs, even inactive ones */ - err = obd_set_info(lov->tgts[i].ltd_exp, keylen, key, - vallen, ((obd_id*)val) + i); + err = obd_set_info_async(lov->tgts[i].ltd_exp, keylen, + key, vallen, + ((obd_id*)val) + i, set); if (!rc) rc = err; } @@ -2187,8 +2201,8 @@ static int lov_set_info(struct obd_export *exp, obd_count keylen, if (!lov->tgts[i].ltd_exp || !lov->tgts[i].active) continue; - err = obd_set_info(lov->tgts[i].ltd_exp, keylen, key, - vallen, val); + err = obd_set_info_async(lov->tgts[i].ltd_exp, keylen, + key, vallen, val, set); if (!rc) rc = err; } @@ -2213,13 +2227,19 @@ static int lov_set_info(struct obd_export *exp, obd_count keylen, if (!val && !lov->tgts[i].active) continue; - err = obd_set_info(lov->tgts[i].ltd_exp, - keylen, key, vallen, val); + err = obd_set_info_async(lov->tgts[i].ltd_exp, + keylen, key, vallen, val, set); if (!rc) rc = err; } out: lov_putref(obddev); + if (no_set) { + err = ptlrpc_set_wait(set); + if (!rc) + rc = err; + ptlrpc_set_destroy(set); + } RETURN(rc); } @@ -2398,7 +2418,7 @@ struct obd_ops lov_obd_ops = { .o_join_lru = lov_join_lru, .o_iocontrol = lov_iocontrol, .o_get_info = lov_get_info, - .o_set_info = lov_set_info, + .o_set_info_async = lov_set_info_async, .o_llog_init = lov_llog_init, .o_llog_finish = lov_llog_finish, .o_notify = lov_notify, diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c index c144cb1..34da8d1 100644 --- a/lustre/lvfs/fsfilt_ext3.c +++ b/lustre/lvfs/fsfilt_ext3.c @@ -489,11 +489,7 @@ static int fsfilt_ext3_set_md(struct inode *inode, void *handle, { int rc; - LASSERT_SEM_LOCKED(&inode->i_sem); - - if (EXT3_I(inode)->i_file_acl /* || large inode EA flag */) - CWARN("setting EA on %lu/%u again... 
interesting\n", - inode->i_ino, inode->i_generation); + LASSERT(TRYLOCK_INODE_MUTEX(inode) == 0); lock_24kernel(); rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_TRUSTED, @@ -507,13 +503,13 @@ static int fsfilt_ext3_set_md(struct inode *inode, void *handle, return rc; } -/* Must be called with i_sem held */ +/* Must be called with i_mutex held */ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size, const char *name) { int rc; - LASSERT_SEM_LOCKED(&inode->i_sem); + LASSERT(TRYLOCK_INODE_MUTEX(inode) == 0); lock_24kernel(); rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, @@ -831,7 +827,7 @@ static int ext3_ext_new_extent_cb(struct ext3_extents_tree *tree, return EXT_CONTINUE; } - tgen = EXT_GENERATION(tree); + tgen = EXT_GENERATION(EXT_ROOT_HDR(tree)); count = ext3_ext_calc_credits_for_insert(tree, path); ext3_up_truncate_sem(inode); @@ -844,7 +840,7 @@ static int ext3_ext_new_extent_cb(struct ext3_extents_tree *tree, } ext3_down_truncate_sem(inode); - if (tgen != EXT_GENERATION(tree)) { + if (tgen != EXT_GENERATION(EXT_ROOT_HDR(tree))) { /* the tree has changed. so path can be invalid at moment */ lock_24kernel(); journal_stop(handle); diff --git a/lustre/lvfs/lvfs_linux.c b/lustre/lvfs/lvfs_linux.c index 1aec9f9..188f8be 100644 --- a/lustre/lvfs/lvfs_linux.c +++ b/lustre/lvfs/lvfs_linux.c @@ -505,6 +505,7 @@ static void __exit lvfs_linux_exit(void) CDEBUG(leaked ? 
D_ERROR : D_INFO, "obd mem max: %d leaked: %d\n", obd_memmax, leaked); + EXIT; return; } diff --git a/lustre/mdc/mdc_internal.h b/lustre/mdc/mdc_internal.h index 41162e6..6158722 100644 --- a/lustre/mdc/mdc_internal.h +++ b/lustre/mdc/mdc_internal.h @@ -71,7 +71,3 @@ static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck, up(&lck->rpcl_sem); } } - -/* Quota stuff */ -extern quota_interface_t *quota_interface; - diff --git a/lustre/mdc/mdc_lib.c b/lustre/mdc/mdc_lib.c index 29be9d6..23c79f0 100644 --- a/lustre/mdc/mdc_lib.c +++ b/lustre/mdc/mdc_lib.c @@ -107,7 +107,7 @@ void mdc_create_pack(struct ptlrpc_request *req, int offset, static __u32 mds_pack_open_flags(__u32 flags) { return - (flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC | + (flags & (FMODE_READ | FMODE_WRITE | MDS_OPEN_DELAY_CREATE | MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS | MDS_OPEN_OWNEROVERRIDE)) | ((flags & O_CREAT) ? MDS_OPEN_CREAT : 0) | @@ -117,6 +117,9 @@ static __u32 mds_pack_open_flags(__u32 flags) ((flags & O_SYNC) ? MDS_OPEN_SYNC : 0) | ((flags & O_DIRECTORY) ? MDS_OPEN_DIRECTORY : 0) | ((flags & O_JOIN_FILE) ? MDS_OPEN_JOIN_FILE : 0) | +#ifdef FMODE_EXEC + ((flags & FMODE_EXEC) ? 
MDS_FMODE_EXEC : 0) | +#endif 0; } @@ -189,7 +192,9 @@ void mdc_setattr_pack(struct ptlrpc_request *req, int offset, rec->sa_atime = LTIME_S(iattr->ia_atime); rec->sa_mtime = LTIME_S(iattr->ia_mtime); rec->sa_ctime = LTIME_S(iattr->ia_ctime); - rec->sa_attr_flags = iattr->ia_attr_flags; + rec->sa_attr_flags = + ((struct ll_iattr_struct *)iattr)->ia_attr_flags; + if ((iattr->ia_valid & ATTR_GID) && in_group_p(iattr->ia_gid)) rec->sa_suppgid = iattr->ia_gid; else diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c index 734f37e..84de47c 100644 --- a/lustre/mdc/mdc_locks.c +++ b/lustre/mdc/mdc_locks.c @@ -341,7 +341,8 @@ int mdc_enqueue(struct obd_export *exp, repsize[repbufcnt++] = obddev->u.cli.cl_max_mds_cookiesize; } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) { obd_valid valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE | - OBD_MD_FLACL | OBD_MD_FLMODEASIZE; + OBD_MD_FLACL | OBD_MD_FLMODEASIZE | + OBD_MD_FLDIREA; size[req_buffers++] = sizeof(struct mds_body); size[req_buffers++] = data->namelen + 1; diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index f338d67..580cebc 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -41,6 +41,8 @@ #include #include "mdc_internal.h" +static quota_interface_t *quota_interface; + #define REQUEST_MINOR 244 static int mdc_cleanup(struct obd_device *obd); @@ -681,6 +683,9 @@ int mdc_close(struct obd_export *exp, struct obdo *oa, EXIT; *request = req; out: + if (rc != 0 && req && req->rq_commit_cb) + req->rq_commit_cb(req); + return rc; } @@ -826,8 +831,9 @@ out: return rc; } -int mdc_set_info(struct obd_export *exp, obd_count keylen, - void *key, obd_count vallen, void *val) +int mdc_set_info_async(struct obd_export *exp, obd_count keylen, + void *key, obd_count vallen, void *val, + struct ptlrpc_request_set *set) { struct obd_import *imp = class_exp2cliimp(exp); int rc = -EINVAL; @@ -873,8 +879,14 @@ int mdc_set_info(struct obd_export *exp, obd_count keylen, RETURN(-ENOMEM); req->rq_replen 
= lustre_msg_size(0, NULL); - rc = ptlrpc_queue_wait(req); - ptlrpc_req_finished(req); + if (set) { + rc = 0; + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(set); + } else { + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + } RETURN(rc); } @@ -1170,7 +1182,7 @@ int mdc_init_ea_size(struct obd_export *mdc_exp, struct obd_export *lov_exp) RETURN(0); } -static int mdc_precleanup(struct obd_device *obd, int stage) +static int mdc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) { int rc = 0; ENTRY; @@ -1246,7 +1258,7 @@ struct obd_ops mdc_obd_ops = { .o_connect = client_connect_import, .o_disconnect = client_disconnect_export, .o_iocontrol = mdc_iocontrol, - .o_set_info = mdc_set_info, + .o_set_info_async = mdc_set_info_async, .o_get_info = mdc_get_info, .o_statfs = mdc_statfs, .o_pin = mdc_pin, @@ -1256,7 +1268,6 @@ struct obd_ops mdc_obd_ops = { .o_llog_finish = mdc_llog_finish, }; -static quota_interface_t *quota_interface; extern quota_interface_t mdc_quota_interface; int __init mdc_init(void) diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index ea6cb0a..71d31f8 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -61,6 +61,10 @@ #include "mds_internal.h" +int mds_num_threads; +CFS_MODULE_PARM(mds_num_threads, "i", int, 0444, + "number of MDS service threads to start"); + static int mds_intent_policy(struct ldlm_namespace *ns, struct ldlm_lock **lockp, void *req_cookie, ldlm_mode_t mode, int flags, void *data); @@ -222,9 +226,9 @@ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid, if (inode->i_generation == 0 || inode->i_nlink == 0) { LCONSOLE_WARN("Found inode with zero generation or link -- this" - " may indicate disk corruption (inode: %lu, link:" - " %lu, count: %d)\n", inode->i_ino, - (unsigned long)inode->i_nlink, + " may indicate disk corruption (inode: %lu/%u, " + "link %lu, count %d)\n", inode->i_ino, + inode->i_generation,(unsigned long)inode->i_nlink, atomic_read(&inode->i_count)); 
dput(result); RETURN(ERR_PTR(-ENOENT)); @@ -391,7 +395,7 @@ static int mds_destroy_export(struct obd_export *export) target_destroy_export(export); if (obd_uuid_equals(&export->exp_client_uuid, &obd->obd_uuid)) - GOTO(out, 0); + RETURN(0); push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); /* Close any open files (which may also cause orphan unlinking). */ @@ -424,7 +428,6 @@ static int mds_destroy_export(struct obd_export *export) } spin_unlock(&med->med_open_lock); pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); -out: mds_client_free(export); RETURN(rc); @@ -485,6 +488,10 @@ static int mds_getstatus(struct ptlrpc_request *req) RETURN(0); } +/* get the LOV EA from @inode and store it into @md. It can be at most + * @size bytes, and @size is updated with the actual EA size. + * The EA size is also returned on success, and -ve errno on failure. + * If there is no EA then 0 is returned. */ int mds_get_md(struct obd_device *obd, struct inode *inode, void *md, int *size, int lock) { @@ -492,7 +499,7 @@ int mds_get_md(struct obd_device *obd, struct inode *inode, void *md, int lmm_size; if (lock) - down(&inode->i_sem); + LOCK_INODE_MUTEX(inode); rc = fsfilt_get_md(obd, inode, md, *size, "lov"); if (rc < 0) { @@ -512,14 +519,14 @@ int mds_get_md(struct obd_device *obd, struct inode *inode, void *md, *size = 0; } if (lock) - up(&inode->i_sem); + UNLOCK_INODE_MUTEX(inode); RETURN (rc); } -/* Call with lock=1 if you want mds_pack_md to take the i_sem. - * Call with lock=0 if the caller has already taken the i_sem. */ +/* Call with lock=1 if you want mds_pack_md to take the i_mutex. + * Call with lock=0 if the caller has already taken the i_mutex. 
*/ int mds_pack_md(struct obd_device *obd, struct lustre_msg *msg, int offset, struct mds_body *body, struct inode *inode, int lock) { @@ -698,7 +705,7 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct inode *inode, { struct mds_obd *mds = mds_req2mds(req); struct mds_body *body; - int rc, size[2] = {sizeof(*body)}, bufcount = 1; + int rc, size[3] = {sizeof(*body)}, bufcount = 1; ENTRY; body = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*body)); @@ -707,10 +714,10 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct inode *inode, if ((S_ISREG(inode->i_mode) && (body->valid & OBD_MD_FLEASIZE)) || (S_ISDIR(inode->i_mode) && (body->valid & OBD_MD_FLDIREA))) { - down(&inode->i_sem); + LOCK_INODE_MUTEX(inode); rc = fsfilt_get_md(req->rq_export->exp_obd, inode, NULL, 0, "lov"); - up(&inode->i_sem); + UNLOCK_INODE_MUTEX(inode); CDEBUG(D_INODE, "got %d bytes MD data for inode %lu\n", rc, inode->i_ino); if (rc < 0) { @@ -1853,7 +1860,6 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) strncpy(mds->mds_profile, lustre_cfg_string(lcfg, 3), LUSTRE_CFG_BUFLEN(lcfg, 3)); - } ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL, @@ -1900,8 +1906,8 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) obd->obd_name, lustre_cfg_string(lcfg, 1), label ?: "", label ? "/" : "", str, obd->obd_recoverable_clients, - (obd->obd_recoverable_clients == 1) - ? "client" : "clients", + (obd->obd_recoverable_clients == 1) ? 
+ "client" : "clients", (int)(OBD_RECOVERY_TIMEOUT) / 60, (int)(OBD_RECOVERY_TIMEOUT) % 60, obd->obd_name); @@ -1913,7 +1919,6 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) } ldlm_timeout = 2; - ping_evictor_start(); RETURN(0); @@ -1952,6 +1957,9 @@ static int mds_lov_clean(struct obd_device *obd) /* There better be a lov */ if (!osc) RETURN(0); + + if (IS_ERR(osc)) + RETURN(PTR_ERR(osc)); obd_register_observer(osc, NULL); @@ -2078,12 +2086,14 @@ static int mds_lov_early_clean(struct obd_device *obd) return (obd_precleanup(osc, OBD_CLEANUP_EARLY)); } -static int mds_precleanup(struct obd_device *obd, int stage) +static int mds_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) { int rc = 0; ENTRY; switch (stage) { + case OBD_CLEANUP_EARLY: + break; case OBD_CLEANUP_EXPORTS: target_cleanup_recovery(obd); mds_lov_early_clean(obd); @@ -2094,6 +2104,9 @@ static int mds_precleanup(struct obd_device *obd, int stage) llog_cleanup(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT)); llog_cleanup(llog_get_context(obd, LLOG_LOVEA_ORIG_CTXT)); rc = obd_llog_finish(obd, 0); + break; + case OBD_CLEANUP_OBD: + break; } RETURN(rc); } @@ -2105,8 +2118,6 @@ static int mds_cleanup(struct obd_device *obd) int must_relock = 0; ENTRY; - ping_evictor_stop(); - if (obd->u.obt.obt_sb == NULL) RETURN(0); save_dev = lvfs_sbdev(obd->u.obt.obt_sb); @@ -2340,7 +2351,7 @@ static int mds_intent_policy(struct ldlm_namespace *ns, break; default: CERROR("Unhandled intent "LPD64"\n", it->opc); - LBUG(); + RETURN(-EFAULT); } /* By this point, whatever function we called above must have either @@ -2413,12 +2424,17 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf) sema_init(&mds->mds_health_sem, 1); + if (mds_num_threads < 2) + mds_num_threads = MDS_DEF_THREADS; + if (mds_num_threads > MDS_MAX_THREADS) + mds_num_threads = MDS_MAX_THREADS; + mds->mds_service = ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE, MDS_MAXREPSIZE, 
MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT, mds_handle, LUSTRE_MDS_NAME, - obd->obd_proc_entry, NULL, MDT_NUM_THREADS); + obd->obd_proc_entry, NULL, mds_num_threads); if (!mds->mds_service) { CERROR("failed to start service\n"); @@ -2434,7 +2450,7 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf) MDS_MAXREPSIZE, MDS_SETATTR_PORTAL, MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT, mds_handle, "mds_setattr", - obd->obd_proc_entry, NULL, MDT_NUM_THREADS); + obd->obd_proc_entry, NULL, mds_num_threads); if (!mds->mds_setattr_service) { CERROR("failed to start getattr service\n"); GOTO(err_thread, rc = -ENOMEM); @@ -2450,7 +2466,7 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf) MDS_MAXREPSIZE, MDS_READPAGE_PORTAL, MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT, mds_handle, "mds_readpage", - obd->obd_proc_entry, NULL, MDT_NUM_THREADS); + obd->obd_proc_entry, NULL, mds_num_threads); if (!mds->mds_readpage_service) { CERROR("failed to start readpage service\n"); GOTO(err_thread2, rc = -ENOMEM); @@ -2462,6 +2478,8 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf) if (rc) GOTO(err_thread3, rc); + ping_evictor_start(); + RETURN(0); err_thread3: @@ -2483,6 +2501,8 @@ static int mdt_cleanup(struct obd_device *obd) struct mds_obd *mds = &obd->u.mds; ENTRY; + ping_evictor_stop(); + down(&mds->mds_health_sem); ptlrpc_unregister_service(mds->mds_readpage_service); ptlrpc_unregister_service(mds->mds_setattr_service); diff --git a/lustre/mds/lproc_mds.c b/lustre/mds/lproc_mds.c index 389acb9..1140a61 100644 --- a/lustre/mds/lproc_mds.c +++ b/lustre/mds/lproc_mds.c @@ -52,6 +52,7 @@ static int lprocfs_mds_wr_evict_client(struct file *file, const char *buffer, struct obd_device *obd = data; struct mds_obd *mds = &obd->u.mds; char tmpbuf[sizeof(struct obd_uuid)]; + struct ptlrpc_request_set *set; int rc; sscanf(buffer, "%40s", tmpbuf); @@ -59,14 +60,25 @@ static int 
lprocfs_mds_wr_evict_client(struct file *file, const char *buffer, if (strncmp(tmpbuf, "nid:", 4) != 0) return lprocfs_wr_evict_client(file, buffer, count, data); - obd_export_evict_by_nid(obd, tmpbuf+4); + set = ptlrpc_prep_set(); + if (!set) + return -ENOMEM; - rc = obd_set_info(mds->mds_osc_exp, strlen("evict_by_nid"), - "evict_by_nid", strlen(tmpbuf + 4) + 1, tmpbuf + 4); + rc = obd_set_info_async(mds->mds_osc_exp, strlen("evict_by_nid"), + "evict_by_nid", strlen(tmpbuf + 4) + 1, + tmpbuf + 4, set); if (rc) CERROR("Failed to evict nid %s from OSTs: rc %d\n", tmpbuf + 4, rc); + ptlrpc_check_set(set); + + obd_export_evict_by_nid(obd, tmpbuf+4); + rc = ptlrpc_set_wait(set); + if (rc) + CERROR("Failed to evict nid %s from OSTs: rc %d\n", tmpbuf + 4, + rc); + ptlrpc_set_destroy(set); return count; } diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c index ba67c02..8cd2dc9 100644 --- a/lustre/mds/mds_fs.c +++ b/lustre/mds/mds_fs.c @@ -252,8 +252,10 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file) GOTO(err_msd, rc); } if (strcmp(msd->msd_uuid, obd->obd_uuid.uuid) != 0) { - CERROR("OBD UUID %s does not match last_rcvd UUID %s\n", - obd->obd_uuid.uuid, msd->msd_uuid); + LCONSOLE_ERROR("Trying to start OBD %s using the wrong" + " disk %s. 
Were the /dev/ assignments " + "rearranged?\n", + obd->obd_uuid.uuid, msd->msd_uuid); GOTO(err_msd, rc = -EINVAL); } mount_count = le64_to_cpu(msd->msd_mount_count); @@ -648,7 +650,7 @@ int mds_obd_create(struct obd_export *exp, struct obdo *oa, oa->o_generation = filp->f_dentry->d_inode->i_generation; namelen = ll_fid2str(fidname, oa->o_id, oa->o_generation); - down(&parent_inode->i_sem); + LOCK_INODE_MUTEX(parent_inode); new_child = lookup_one_len(fidname, mds->mds_objects_dir, namelen); if (IS_ERR(new_child)) { @@ -683,7 +685,7 @@ int mds_obd_create(struct obd_export *exp, struct obdo *oa, out_dput: dput(new_child); out_close: - up(&parent_inode->i_sem); + UNLOCK_INODE_MUTEX(parent_inode); err = filp_close(filp, 0); if (err) { CERROR("closing tmpfile %u: rc %d\n", tmpname, rc); @@ -715,7 +717,7 @@ int mds_obd_destroy(struct obd_export *exp, struct obdo *oa, namelen = ll_fid2str(fidname, oa->o_id, oa->o_generation); - down(&parent_inode->i_sem); + LOCK_INODE_MUTEX(parent_inode); de = lookup_one_len(fidname, mds->mds_objects_dir, namelen); if (IS_ERR(de)) { rc = IS_ERR(de); @@ -749,7 +751,7 @@ int mds_obd_destroy(struct obd_export *exp, struct obdo *oa, out_dput: if (de != NULL) l_dput(de); - up(&parent_inode->i_sem); + UNLOCK_INODE_MUTEX(parent_inode); pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &ucred); RETURN(rc); diff --git a/lustre/mds/mds_join.c b/lustre/mds/mds_join.c index 7a39720..184b965 100644 --- a/lustre/mds/mds_join.c +++ b/lustre/mds/mds_join.c @@ -380,7 +380,7 @@ int mds_join_file(struct mds_update_record *rec, struct ptlrpc_request *req, GOTO(cleanup, rc); } - down(&head_inode->i_sem); + LOCK_INODE_MUTEX(head_inode); cleanup_phase = 1; rc = mds_get_md(obd, head_inode, head_lmm, &size, 0); if (rc < 0) @@ -486,7 +486,7 @@ cleanup: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); case 1: - up(&head_inode->i_sem); + UNLOCK_INODE_MUTEX(head_inode); case 0: if (tail_lmm != NULL) OBD_FREE(tail_lmm, lmm_size); diff --git a/lustre/mds/mds_lov.c 
b/lustre/mds/mds_lov.c index 7291581..4135e9b 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -141,8 +141,9 @@ int mds_lov_set_nextid(struct obd_device *obd) LASSERT(mds->mds_lov_objids != NULL); - rc = obd_set_info(mds->mds_osc_exp, strlen("next_id"), "next_id", - mds->mds_lov_desc.ld_tgt_count, mds->mds_lov_objids); + rc = obd_set_info_async(mds->mds_osc_exp, strlen("next_id"), "next_id", + mds->mds_lov_desc.ld_tgt_count, + mds->mds_lov_objids, NULL); RETURN(rc); } @@ -202,7 +203,8 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) OBD_ALLOC(data, sizeof(*data)); if (data == NULL) RETURN(-ENOMEM); - data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX; + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX | + OBD_CONNECT_REQPORTAL; data->ocd_version = LUSTRE_VERSION_CODE; /* NB: lov_connect() needs to fill in .ocd_index for each OST */ rc = obd_connect(&conn, mds->mds_osc_obd, &obd->obd_uuid, data); @@ -459,8 +461,8 @@ int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len, rc = llog_ioctl(ctxt, cmd, data); pop_ctxt(&saved, &ctxt->loc_exp->exp_obd->obd_lvfs_ctxt, NULL); llog_cat_initialize(obd, mds->mds_lov_desc.ld_tgt_count); - rc2 = obd_set_info(mds->mds_osc_exp, strlen("mds_conn"), - "mds_conn", 0, NULL); + rc2 = obd_set_info_async(mds->mds_osc_exp, strlen("mds_conn"), + "mds_conn", 0, NULL, NULL); if (!rc) rc = rc2; RETURN(rc); @@ -510,8 +512,8 @@ static int __mds_lov_syncronize(void *data) LASSERT(obd != NULL); - rc = obd_set_info(obd->u.mds.mds_osc_exp, strlen("mds_conn"), - "mds_conn", 0, uuid); + rc = obd_set_info_async(obd->u.mds.mds_osc_exp, strlen("mds_conn"), + "mds_conn", 0, uuid, NULL); if (rc != 0) GOTO(out, rc); @@ -524,8 +526,8 @@ static int __mds_lov_syncronize(void *data) GOTO(out, rc); } - CWARN("MDS %s: %s now active, resetting orphans\n", - obd->obd_name, uuid ? 
(char *)uuid->uuid : "All OSC's"); + LCONSOLE_INFO("MDS %s: %s now active, resetting orphans\n", + obd->obd_name, uuid ? (char *)uuid->uuid : "All OSCs"); if (obd->obd_stopping) GOTO(out, rc = -ENODEV); @@ -545,16 +547,7 @@ out: int mds_lov_synchronize(void *data) { - unsigned long flags; - - lock_kernel(); - ptlrpc_daemonize(); - - SIGNAL_MASK_LOCK(current, flags); - sigfillset(&current->blocked); - RECALC_SIGPENDING; - SIGNAL_MASK_UNLOCK(current, flags); - unlock_kernel(); + ptlrpc_daemonize("mds_lov_sync"); return (__mds_lov_syncronize(data)); } @@ -582,7 +575,7 @@ int mds_lov_start_synchronize(struct obd_device *obd, struct obd_uuid *uuid, still disconnected. Taking an obd reference insures that we don't disconnect the LOV. This of course means a cleanup won't finish for as long as the sync is blocking. */ - atomic_inc(&obd->obd_refcount); + class_incref(obd); if (nonblock) { /* Syncronize in the background */ @@ -620,7 +613,7 @@ int mds_notify(struct obd_device *obd, struct obd_device *watched, RETURN(-EINVAL); } - uuid = &watched->u.cli.cl_import->imp_target_uuid; + uuid = &watched->u.cli.cl_target_uuid; if (obd->obd_recovering) { /* in the case OBD is in recovery we do not reinit desc and * easize, as that will be done in mds_lov_connect() after diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c index 632673c..f2c8d1b 100644 --- a/lustre/mds/mds_open.c +++ b/lustre/mds/mds_open.c @@ -271,7 +271,7 @@ static struct mds_file_data *mds_dentry_open(struct dentry *dentry, if (error) GOTO(cleanup_mfd, error); body->io_epoch = MDS_FILTERDATA(dentry->d_inode)->io_epoch; - } else if (flags & FMODE_EXEC) { + } else if (flags & MDS_FMODE_EXEC) { error = mds_deny_write_access(mds, dentry->d_inode); if (error) GOTO(cleanup_mfd, error); @@ -303,7 +303,7 @@ cleanup_dentry: return ERR_PTR(error); } -/* Must be called with i_sem held */ +/* Must be called with i_mutex held */ static int mds_create_objects(struct ptlrpc_request *req, int offset, struct mds_update_record 
*rec, struct mds_obd *mds, struct obd_device *obd, @@ -660,7 +660,7 @@ static int accmode(struct inode *inode, int flags) res = MAY_READ; if (flags & (FMODE_WRITE|MDS_OPEN_TRUNC)) res |= MAY_WRITE; - if (flags & FMODE_EXEC) + if (flags & MDS_FMODE_EXEC) res = MAY_EXEC; return res; } @@ -679,29 +679,29 @@ static int mds_finish_open(struct ptlrpc_request *req, struct dentry *dchild, ENTRY; /* atomically create objects if necessary */ - down(&dchild->d_inode->i_sem); + LOCK_INODE_MUTEX(dchild->d_inode); if (S_ISREG(dchild->d_inode->i_mode) && !(body->valid & OBD_MD_FLEASIZE)) { rc = mds_pack_md(obd, req->rq_repmsg, 2, body, dchild->d_inode, 0); if (rc) { - up(&dchild->d_inode->i_sem); + UNLOCK_INODE_MUTEX(dchild->d_inode); RETURN(rc); } } if (rec != NULL) { if ((body->valid & OBD_MD_FLEASIZE) && (rec->ur_flags & MDS_OPEN_HAS_EA)) { - up(&dchild->d_inode->i_sem); + UNLOCK_INODE_MUTEX(dchild->d_inode); RETURN(-EEXIST); } if (rec->ur_flags & MDS_OPEN_JOIN_FILE) { - up(&dchild->d_inode->i_sem); + UNLOCK_INODE_MUTEX(dchild->d_inode); rc = mds_join_file(rec, req, dchild, lockh); if (rc) RETURN(rc); - down(&dchild->d_inode->i_sem); + LOCK_INODE_MUTEX(dchild->d_inode); } if (!(body->valid & OBD_MD_FLEASIZE) && !(body->valid & OBD_MD_FLMODEASIZE)) { @@ -710,7 +710,7 @@ static int mds_finish_open(struct ptlrpc_request *req, struct dentry *dchild, dchild, handle, &ids); if (rc) { CERROR("mds_create_objects: rc = %d\n", rc); - up(&dchild->d_inode->i_sem); + UNLOCK_INODE_MUTEX(dchild->d_inode); RETURN(rc); } } @@ -721,7 +721,7 @@ static int mds_finish_open(struct ptlrpc_request *req, struct dentry *dchild, body->valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | OBD_MD_FLATIME | OBD_MD_FLMTIME); } - up(&dchild->d_inode->i_sem); + UNLOCK_INODE_MUTEX(dchild->d_inode); if (!(rec->ur_flags & MDS_OPEN_JOIN_FILE)) lustre_shrink_reply(req, 2, body->eadatasize, 0); @@ -1104,6 +1104,14 @@ found_child: GOTO(cleanup, rc = -EAGAIN); } + if (!S_ISREG(dchild->d_inode->i_mode) && + 
!S_ISDIR(dchild->d_inode->i_mode) && + (req->rq_export->exp_connect_flags & OBD_CONNECT_NODEVOH)) { + /* If client supports this, do not return open handle for + * special device nodes */ + GOTO(cleanup_no_trans, rc = 0); + } + /* Step 5: mds_open it */ rc = mds_finish_open(req, dchild, body, rec->ur_flags, &handle, rec, rep, &parent_lockh); @@ -1147,7 +1155,7 @@ found_child: } /* Close a "file descriptor" and possibly unlink an orphan from the - * PENDING directory. Caller must hold child->i_sem, this drops it. + * PENDING directory. Caller must hold child->i_mutex, this drops it. * * If we are being called from mds_disconnect() because the client has * disappeared, then req == NULL and we do not update last_rcvd because @@ -1190,7 +1198,7 @@ int mds_mfd_close(struct ptlrpc_request *req, int offset,struct obd_device *obd, if (mfd->mfd_mode & FMODE_WRITE) { rc = mds_put_write_access(mds, inode, request_body, last_orphan && unlink_orphan); - } else if (mfd->mfd_mode & FMODE_EXEC) { + } else if (mfd->mfd_mode & MDS_FMODE_EXEC) { mds_allow_write_access(inode); } @@ -1210,8 +1218,8 @@ int mds_mfd_close(struct ptlrpc_request *req, int offset,struct obd_device *obd, /* Sadly, there is no easy way to save pending_child from * mds_reint_unlink() into mfd, so we need to re-lookup, * but normally it will still be in the dcache. 
*/ - down(&pending_dir->i_sem); - cleanup_phase = 1; /* up(&pending_dir->i_sem) when finished */ + LOCK_INODE_MUTEX(pending_dir); + cleanup_phase = 1; /* UNLOCK_INODE_MUTEX(pending_dir) when finished */ pending_child = lookup_one_len(fidname, mds->mds_pending_dir, fidlen); if (IS_ERR(pending_child)) @@ -1331,7 +1339,7 @@ out: case 2: dput(pending_child); case 1: - up(&pending_dir->i_sem); + UNLOCK_INODE_MUTEX(pending_dir); } RETURN(rc); } diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c index e2f7286..d13d7ea 100644 --- a/lustre/mds/mds_reint.c +++ b/lustre/mds/mds_reint.c @@ -113,6 +113,11 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle, int log_pri = D_HA; ENTRY; + if (IS_ERR(handle)) { + LASSERT(rc != 0); + RETURN(rc); + } + /* if the export has already been failed, we have no last_rcvd slot */ if (req->rq_export->exp_failed) { CWARN("commit transaction for disconnected client %s: rc %d\n", @@ -124,9 +129,6 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle, RETURN(rc); } - if (IS_ERR(handle)) - RETURN(rc); - if (handle == NULL) { /* if we're starting our own xaction, use our own inode */ inode = mds->mds_rcvd_filp->f_dentry->d_inode; @@ -511,7 +513,7 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) && rec->ur_eadata != NULL) { - down(&inode->i_sem); + LOCK_INODE_MUTEX(inode); need_lock = 0; } @@ -529,6 +531,7 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, rc = mds_get_md(obd, inode, lmm, &lmm_size, need_lock); if (rc < 0) GOTO(cleanup, rc); + rc = 0; handle = fsfilt_start_log(obd, inode, FSFILT_OP_SETATTR, NULL, le32_to_cpu(lmm->lmm_stripe_count)); @@ -553,7 +556,7 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, rc = fsfilt_setattr(obd, de, handle, &rec->ur_iattr, 0); /* journal chown/chgrp in llog, just like unlink */ if (rc == 0 && lmm_size){ - 
cookie_size = mds_get_cookie_size(obd, lmm); + cookie_size = mds_get_cookie_size(obd, lmm); OBD_ALLOC(logcookies, cookie_size); if (logcookies == NULL) GOTO(cleanup, rc = -ENOMEM); @@ -652,7 +655,7 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, case 1: if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) && rec->ur_eadata != NULL) - up(&inode->i_sem); + UNLOCK_INODE_MUTEX(inode); l_dput(de); if (locked) { if (rc) { @@ -810,7 +813,7 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, int rdev = rec->ur_rdev; handle = fsfilt_start(obd, dir, FSFILT_OP_MKNOD, NULL); if (IS_ERR(handle)) - GOTO(cleanup, (handle = NULL, rc = PTR_ERR(handle))); + GOTO(cleanup, rc = PTR_ERR(handle)); rc = vfs_mknod(dir, dchild, rec->ur_mode, rdev); EXIT; break; @@ -870,10 +873,10 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, int lmm_size = sizeof(lmm); rc = mds_get_md(obd, dir, &lmm, &lmm_size, 1); if (rc > 0) { - down(&inode->i_sem); + LOCK_INODE_MUTEX(inode); rc = fsfilt_set_md(obd, inode, handle, &lmm, lmm_size, "lov"); - up(&inode->i_sem); + UNLOCK_INODE_MUTEX(inode); } if (rc) CERROR("error on copy stripe info: rc = %d\n", @@ -1039,6 +1042,22 @@ int enqueue_ordered_locks(struct obd_device *obd, struct ldlm_res_id *p1_res_id, RETURN(0); } +static inline int res_eq(struct ldlm_res_id *res1, struct ldlm_res_id *res2) +{ + return !memcmp(res1, res2, sizeof(*res1)); +} + +static inline void +try_to_aggregate_locks(struct ldlm_res_id *res1, ldlm_policy_data_t *p1, + struct ldlm_res_id *res2, ldlm_policy_data_t *p2) +{ + if (!res_eq(res1, res2)) + return; + /* XXX: any additional inodebits (to current LOOKUP and UPDATE) + * should be taken with great care here */ + p1->l_inodebits.bits |= p2->l_inodebits.bits; +} + int enqueue_4ordered_locks(struct obd_device *obd,struct ldlm_res_id *p1_res_id, struct lustre_handle *p1_lockh, int p1_lock_mode, ldlm_policy_data_t *p1_policy, @@ -1104,14 +1123,19 @@ int 
enqueue_4ordered_locks(struct obd_device *obd,struct ldlm_res_id *p1_res_id, flags = 0; if (res_id[i]->name[0] == 0) break; - if (i != 0 && - memcmp(res_id[i], res_id[i-1], sizeof(*res_id[i])) == 0 && - (policies[i]->l_inodebits.bits & - policies[i-1]->l_inodebits.bits)) { + if (i && res_eq(res_id[i], res_id[i-1])) { memcpy(dlm_handles[i], dlm_handles[i-1], sizeof(*(dlm_handles[i]))); ldlm_lock_addref(dlm_handles[i], lock_modes[i]); } else { + /* we need to enqueue locks with different inodebits + * at once, because otherwise concurrent thread can + * hit the windown between these two locks and we'll + * get to deadlock. see bug 10360. note also, that it + * is impossible to have >2 equal res. */ + if (i < 3) + try_to_aggregate_locks(res_id[i], policies[i], + res_id[i+1], policies[i+1]); rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, *res_id[i], LDLM_IBITS, policies[i], @@ -1192,8 +1216,11 @@ static int mds_verify_child(struct obd_device *obd, child_res_id->name[0] = dchild->d_inode->i_ino; child_res_id->name[1] = dchild->d_inode->i_generation; - if (res_gt(parent_res_id, child_res_id, NULL, NULL) || - res_gt(maxres, child_res_id, NULL, NULL)) { + /* Make sure that we don't try to re-enqueue a lock on the + * same resource if it happens that the source is renamed to + * the target by another thread (bug 9974, thanks racer :-) */ + if (!res_gt(child_res_id, parent_res_id, NULL, NULL) || + !res_gt(child_res_id, maxres, NULL, NULL)) { CDEBUG(D_DLMTRACE, "relock "LPU64"<("LPU64"|"LPU64")\n", child_res_id->name[0], parent_res_id->name[0], maxres->name[0]); @@ -1308,7 +1335,7 @@ retry_locks: if (rc > 0) goto retry_locks; if (rc < 0) { - cleanup_phase = 3; + cleanup_phase = 2; GOTO(cleanup, rc); } @@ -1342,8 +1369,8 @@ void mds_reconstruct_generic(struct ptlrpc_request *req) * part thereof, because we don't have the inode to check for link * count/open status until after it is locked. 
* - * For lock ordering, caller must get child->i_sem first, then pending->i_sem - * before starting journal transaction. + * For lock ordering, caller must get child->i_mutex first, then + * pending->i_mutex before starting journal transaction. * * returns 1 on success * returns 0 if we lost a race and didn't make a new link @@ -1363,9 +1390,9 @@ static int mds_orphan_add_link(struct mds_update_record *rec, LASSERT(inode != NULL); LASSERT(!mds_inode_is_orphan(inode)); #ifndef HAVE_I_ALLOC_SEM - LASSERT(down_trylock(&inode->i_sem) != 0); + LASSERT(TRYLOCK_INODE_MUTEX(inode) == 0); #endif - LASSERT(down_trylock(&pending_dir->i_sem) != 0); + LASSERT(TRYLOCK_INODE_MUTEX(pending_dir) == 0); fidlen = ll_fid2str(fidname, inode->i_ino, inode->i_generation); @@ -1541,8 +1568,8 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset, child_inode->i_nlink == 1) { if (mds_orphan_open_count(child_inode) > 0) { /* need to lock pending_dir before transaction */ - down(&mds->mds_pending_dir->d_inode->i_sem); - cleanup_phase = 5; /* up(&pending_dir->i_sem) */ + LOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode); + cleanup_phase = 5; /* UNLOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode); */ } else if (S_ISREG(child_inode->i_mode)) { mds_pack_inode2fid(&body->fid1, child_inode); mds_pack_inode2body(body, child_inode); @@ -1633,11 +1660,11 @@ cleanup: rc = mds_finish_transno(mds, dparent ? 
dparent->d_inode : NULL, handle, req, rc, 0); if (!rc) - (void)obd_set_info(mds->mds_osc_exp, strlen("unlinked"), - "unlinked", 0, NULL); + (void)obd_set_info_async(mds->mds_osc_exp, strlen("unlinked"), + "unlinked", 0, NULL, NULL); switch(cleanup_phase) { case 5: /* pending_dir semaphore */ - up(&mds->mds_pending_dir->d_inode->i_sem); + UNLOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode); case 4: /* child inode semaphore */ MDS_UP_READ_ORPHAN_SEM(child_inode); case 3: /* child ino-reuse lock */ @@ -1770,10 +1797,8 @@ static int mds_reint_link(struct mds_update_record *rec, int offset, GOTO(cleanup, rc = -EROFS); handle = fsfilt_start(obd, de_tgt_dir->d_inode, FSFILT_OP_LINK, NULL); - if (IS_ERR(handle)) { - rc = PTR_ERR(handle); - GOTO(cleanup, rc); - } + if (IS_ERR(handle)) + GOTO(cleanup, rc = PTR_ERR(handle)); rc = vfs_link(de_src, de_tgt_dir->d_inode, dchild); if (rc && rc != -EPERM && rc != -EACCES) @@ -2104,8 +2129,8 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset, new_inode->i_nlink == 1) { if (mds_orphan_open_count(new_inode) > 0) { /* need to lock pending_dir before transaction */ - down(&mds->mds_pending_dir->d_inode->i_sem); - cleanup_phase = 4; /* up(&pending_dir->i_sem) */ + LOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode); + cleanup_phase = 4; /* UNLOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode); */ } else if (S_ISREG(new_inode->i_mode)) { mds_pack_inode2fid(&body->fid1, new_inode); mds_pack_inode2body(body, new_inode); @@ -2168,7 +2193,7 @@ cleanup: switch (cleanup_phase) { case 4: - up(&mds->mds_pending_dir->d_inode->i_sem); + UNLOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode); case 3: MDS_UP_READ_ORPHAN_SEM(new_inode); case 2: diff --git a/lustre/mds/mds_unlink_open.c b/lustre/mds/mds_unlink_open.c index b877e69..ed4539b 100644 --- a/lustre/mds/mds_unlink_open.c +++ b/lustre/mds/mds_unlink_open.c @@ -222,10 +222,10 @@ int mds_cleanup_pending(struct obd_device *obd) ((namlen == 2) && !strcmp(d_name, "..")) || inum == 0) 
continue; - down(&pending_dir->i_sem); + LOCK_INODE_MUTEX(pending_dir); dchild = lookup_one_len(d_name, mds->mds_pending_dir, namlen); if (IS_ERR(dchild)) { - up(&pending_dir->i_sem); + UNLOCK_INODE_MUTEX(pending_dir); GOTO(err_out, rc = PTR_ERR(dchild)); } if (!dchild->d_inode) { @@ -264,7 +264,7 @@ int mds_cleanup_pending(struct obd_device *obd) } next: l_dput(dchild); - up(&pending_dir->i_sem); + UNLOCK_INODE_MUTEX(pending_dir); } rc = 0; err_out: diff --git a/lustre/mds/mds_xattr.c b/lustre/mds/mds_xattr.c index 5c8de13..45884c5 100644 --- a/lustre/mds/mds_xattr.c +++ b/lustre/mds/mds_xattr.c @@ -199,6 +199,10 @@ out_ucred: return rc; } +/* + * alwasy return 0, and set req->rq_status as error number in case + * of failures. + */ static int mds_setxattr_internal(struct ptlrpc_request *req, struct mds_body *body) { @@ -225,20 +229,11 @@ int mds_setxattr_internal(struct ptlrpc_request *req, struct mds_body *body) lockpart = MDS_INODELOCK_UPDATE; - de = mds_fid2locked_dentry(obd, &body->fid1, NULL, LCK_EX, - &lockh, NULL, 0, lockpart); - if (IS_ERR(de)) - GOTO(out, rc = PTR_ERR(de)); - - inode = de->d_inode; - LASSERT(inode); - - OBD_FAIL_WRITE(OBD_FAIL_MDS_SETXATTR_WRITE, inode->i_sb); - + /* various sanity check for xattr name */ xattr_name = lustre_msg_string(req->rq_reqmsg, 1, 0); if (!xattr_name) { CERROR("can't extract xattr name\n"); - GOTO(out_dput, rc = -EPROTO); + GOTO(out, rc = -EPROTO); } DEBUG_REQ(D_INODE, req, "%sxattr %s\n", @@ -247,14 +242,27 @@ int mds_setxattr_internal(struct ptlrpc_request *req, struct mds_body *body) if (strncmp(xattr_name, "trusted.", 8) == 0) { if (strcmp(xattr_name + 8, XATTR_LUSTRE_MDS_LOV_EA) == 0) - GOTO(out_dput, rc = -EACCES); + GOTO(out, rc = -EACCES); } if (!(req->rq_export->exp_connect_flags & OBD_CONNECT_XATTR) && (strncmp(xattr_name, "user.", 5) == 0)) { - GOTO(out_dput, rc = -EOPNOTSUPP); + GOTO(out, rc = -EOPNOTSUPP); } + if (!strcmp(xattr_name, XATTR_NAME_ACL_ACCESS)) + lockpart |= MDS_INODELOCK_LOOKUP; + + de = 
mds_fid2locked_dentry(obd, &body->fid1, NULL, LCK_EX, + &lockh, NULL, 0, lockpart); + if (IS_ERR(de)) + GOTO(out, rc = PTR_ERR(de)); + + inode = de->d_inode; + LASSERT(inode); + + OBD_FAIL_WRITE(OBD_FAIL_MDS_SETXATTR_WRITE, inode->i_sb); + /* filter_op simply use setattr one */ handle = fsfilt_start(obd, inode, FSFILT_OP_SETATTR, NULL); if (IS_ERR(handle)) @@ -272,20 +280,20 @@ int mds_setxattr_internal(struct ptlrpc_request *req, struct mds_body *body) xattr = lustre_msg_buf(req->rq_reqmsg, 2, xattrlen); - down(&inode->i_sem); + LOCK_INODE_MUTEX(inode); lock_24kernel(); rc = inode->i_op->setxattr(de, xattr_name, xattr, xattrlen, body->flags); unlock_24kernel(); - up(&inode->i_sem); + UNLOCK_INODE_MUTEX(inode); } } else if (body->valid & OBD_MD_FLXATTRRM) { if (inode->i_op && inode->i_op->removexattr) { - down(&inode->i_sem); + LOCK_INODE_MUTEX(inode); lock_24kernel(); rc = inode->i_op->removexattr(de, xattr_name); unlock_24kernel(); - up(&inode->i_sem); + UNLOCK_INODE_MUTEX(inode); } } else { CERROR("valid bits: "LPX64"\n", body->valid); diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index fc3f8fd..180ce86 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -66,7 +66,6 @@ unsigned int obd_timeout = 100; /* seconds */ unsigned int ldlm_timeout = 20; /* seconds */ unsigned int obd_health_check_timeout = 120; /* seconds */ char obd_lustre_upcall[128] = "DEFAULT"; /* or NONE or /full/path/to/upcall */ -unsigned int obd_sync_filter; /* = 0, don't sync by default */ cfs_waitq_t obd_race_waitq; @@ -381,9 +380,7 @@ EXPORT_SYMBOL(obd_timeout); EXPORT_SYMBOL(ldlm_timeout); EXPORT_SYMBOL(obd_health_check_timeout); EXPORT_SYMBOL(obd_lustre_upcall); -EXPORT_SYMBOL(obd_sync_filter); EXPORT_SYMBOL(ptlrpc_put_connection_superhack); -EXPORT_SYMBOL(ptlrpc_abort_inflight_superhack); EXPORT_SYMBOL(proc_lustre_root); @@ -414,6 +411,7 @@ EXPORT_SYMBOL(class_handle_unhash); EXPORT_SYMBOL(class_handle2object); /* config.c */ 
+EXPORT_SYMBOL(class_incref); EXPORT_SYMBOL(class_decref); EXPORT_SYMBOL(class_get_profile); EXPORT_SYMBOL(class_del_profile); @@ -560,11 +558,9 @@ int init_obdclass(void) /* liblustre doesn't call cleanup_obdclass, apparently. we carry on in this * ifdef to the end of the file to cover module and versioning goo.*/ #ifdef __KERNEL__ - static void cleanup_obdclass(void) { int i; - int leaked; ENTRY; cfs_psdev_deregister(&obd_psdev); @@ -584,11 +580,6 @@ static void cleanup_obdclass(void) class_handle_cleanup(); class_exit_uuidlist(); - - leaked = atomic_read(&obd_memory); - CDEBUG(leaked ? D_ERROR : D_INFO, - "obd mem max: %d leaked: %d\n", obd_memmax, leaked); - EXIT; } diff --git a/lustre/obdclass/darwin/darwin-sysctl.c b/lustre/obdclass/darwin/darwin-sysctl.c index 0a58cb5..59b7e45 100644 --- a/lustre/obdclass/darwin/darwin-sysctl.c +++ b/lustre/obdclass/darwin/darwin-sysctl.c @@ -56,9 +56,6 @@ SYSCTL_STRING(_lustre, OID_AUTO, upcall, SYSCTL_INT(_lustre, OID_AUTO, memused, CTLTYPE_INT | CTLFLAG_RW, (int *)&obd_memory.counter, 0, "lustre_memory_used"); -SYSCTL_INT(_lustre, OID_AUTO, filter_sync_on_commit, - CTLTYPE_INT | CTLFLAG_RW, &obd_sync_filter, - 0, "filter_sync_on_commit"); SYSCTL_INT(_lustre, OID_AUTO, ldlm_timeout, CTLTYPE_INT | CTLFLAG_RW, &ldlm_timeout, 0, "ldlm_timeout"); diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 0d7479f..361c2d4 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -41,7 +41,6 @@ EXPORT_SYMBOL(obdo_cachep); cfs_mem_cache_t *import_cachep = NULL; int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); -void (*ptlrpc_abort_inflight_superhack)(struct obd_import *imp); /* * support functions: we could use inter-module communication, but this @@ -196,8 +195,8 @@ struct obd_device *class_newdev(struct obd_type *type, char *name) obd->obd_minor = i; obd->obd_type = type; obd->obd_name = name; - CDEBUG(D_IOCTL, "Adding new device %s\n", - obd->obd_name); + CDEBUG(D_IOCTL, "Adding new 
device %s (%p)\n", + obd->obd_name, obd); result = obd; } } @@ -291,9 +290,8 @@ struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid, continue; if ((strncmp(obd->obd_type->typ_name, typ_name, strlen(typ_name)) == 0)) { - struct client_obd *cli = &obd->u.cli; - struct obd_import *imp = cli->cl_import; - if (obd_uuid_equals(tgt_uuid, &imp->imp_target_uuid) && + if (obd_uuid_equals(tgt_uuid, + &obd->u.cli.cl_target_uuid) && ((grp_uuid)? obd_uuid_equals(grp_uuid, &obd->obd_uuid) : 1)) { spin_unlock(&obd_dev_lock); @@ -519,7 +517,7 @@ struct obd_export *class_new_export(struct obd_device *obd, } } LASSERT(!obd->obd_stopping); /* shouldn't happen, but might race */ - atomic_inc(&obd->obd_refcount); + class_incref(obd); list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports); list_add_tail(&export->exp_obd_chain_timed, &export->exp_obd->obd_exports_timed); @@ -590,12 +588,13 @@ void class_import_put(struct obd_import *import) } LASSERT(list_empty(&import->imp_handle.h_link)); + class_decref(import->imp_obd); OBD_FREE(import, sizeof(*import)); EXIT; } EXPORT_SYMBOL(class_import_put); -struct obd_import *class_new_import(void) +struct obd_import *class_new_import(struct obd_device *obd) { struct obd_import *imp; @@ -607,10 +606,8 @@ struct obd_import *class_new_import(void) CFS_INIT_LIST_HEAD(&imp->imp_sending_list); CFS_INIT_LIST_HEAD(&imp->imp_delayed_list); spin_lock_init(&imp->imp_lock); - imp->imp_conn_cnt = 0; - imp->imp_max_transno = 0; - imp->imp_peer_committed_transno = 0; imp->imp_state = LUSTRE_IMP_NEW; + imp->imp_obd = class_incref(obd); cfs_waitq_init(&imp->imp_recovery_waitq); atomic_set(&imp->imp_refcount, 2); @@ -631,13 +628,7 @@ void class_destroy_import(struct obd_import *import) class_handle_unhash(&import->imp_handle); - /* Abort any inflight DLM requests and NULL out their (about to be - * freed) import. 
*/ - /* Invalidate all requests on import, would be better to call - ptlrpc_set_import_active(imp, 0); */ import->imp_generation++; - ptlrpc_abort_inflight_superhack(import); - class_import_put(import); } EXPORT_SYMBOL(class_destroy_import); @@ -711,7 +702,7 @@ static void class_disconnect_export_list(struct list_head *list, int flags) /* It's possible that an export may disconnect itself, but * nothing else will be added to this list. */ - while(!list_empty(list)) { + while (!list_empty(list)) { exp = list_entry(list->next, struct obd_export, exp_obd_chain); class_export_get(exp); exp->exp_flags = flags; @@ -981,234 +972,6 @@ char *obd_export_nid2str(struct obd_export *exp) } EXPORT_SYMBOL(obd_export_nid2str); -/* Ping evictor thread */ -#ifdef __KERNEL__ -#define PET_READY 1 -#define PET_TERMINATE 2 - -static int pet_refcount = 0; -static int pet_state; -static cfs_waitq_t pet_waitq; -static struct obd_export *pet_exp = NULL; -static spinlock_t pet_lock; - -static int ping_evictor_wake(struct obd_export *exp) -{ - spin_lock(&pet_lock); - if (pet_exp) { - /* eventually the new obd will call here again. */ - spin_unlock(&pet_lock); - return 1; - } - - /* We have to make sure the obd isn't destroyed between now and when - * the ping evictor runs. We'll take a reference here, and drop it - * when we finish in the evictor. We don't really care about this - * export in particular; we just need one to keep the obd alive. 
*/ - pet_exp = class_export_get(exp); - spin_unlock(&pet_lock); - - cfs_waitq_signal(&pet_waitq); - return 0; -} - -static int ping_evictor_main(void *arg) -{ - struct obd_device *obd; - struct obd_export *exp; - struct l_wait_info lwi = { 0 }; - time_t expire_time; - ENTRY; - - lock_kernel(); - - /* ptlrpc_daemonize() */ - exit_mm(current); - lustre_daemonize_helper(); - set_fs_pwd(current->fs, init_task.fs->pwdmnt, init_task.fs->pwd); - exit_files(current); - reparent_to_init(); - THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX-1, "ping_evictor"); - - cfs_block_allsigs(); - unlock_kernel(); - - CDEBUG(D_HA, "Starting Ping Evictor\n"); - pet_exp = NULL; - pet_state = PET_READY; - while (1) { - l_wait_event(pet_waitq, pet_exp || - (pet_state == PET_TERMINATE), &lwi); - if (pet_state == PET_TERMINATE) - break; - - /* we only get here if pet_exp != NULL, and the end of this - * loop is the only place which sets it NULL again, so lock - * is not strictly necessary. */ - spin_lock(&pet_lock); - obd = pet_exp->exp_obd; - spin_unlock(&pet_lock); - - expire_time = CURRENT_SECONDS - (3 * obd_timeout / 2); - - CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n", - obd->obd_name, expire_time); - - /* Exports can't be deleted out of the list while we hold - * the obd lock (class_unlink_export), which means we can't - * lose the last ref on the export. If they've already been - * removed from the list, we won't find them here. */ - spin_lock(&obd->obd_dev_lock); - while (!list_empty(&obd->obd_exports_timed)) { - exp = list_entry(obd->obd_exports_timed.next, - struct obd_export,exp_obd_chain_timed); - - if (expire_time > exp->exp_last_request_time) { - class_export_get(exp); - spin_unlock(&obd->obd_dev_lock); - LCONSOLE_WARN("%s: haven't heard from %s in %ld" - " seconds. Last request was at %ld. 
" - "I think it's dead, and I am evicting " - "it.\n", obd->obd_name, - obd_export_nid2str(exp), - (long)(CURRENT_SECONDS - - exp->exp_last_request_time), - exp->exp_last_request_time); - - - class_fail_export(exp); - class_export_put(exp); - - spin_lock(&obd->obd_dev_lock); - } else { - /* List is sorted, so everyone below is ok */ - break; - } - } - spin_unlock(&obd->obd_dev_lock); - - class_export_put(pet_exp); - - spin_lock(&pet_lock); - pet_exp = NULL; - spin_unlock(&pet_lock); - } - CDEBUG(D_HA, "Exiting Ping Evictor\n"); - - RETURN(0); -} - -void ping_evictor_start(void) -{ - int rc; - - if (++pet_refcount > 1) - return; - - spin_lock_init(&pet_lock); - cfs_waitq_init(&pet_waitq); - - rc = cfs_kernel_thread(ping_evictor_main, NULL, CLONE_VM | CLONE_FS); - if (rc < 0) { - pet_refcount--; - CERROR("Cannot start ping evictor thread: %d\n", rc); - } -} -EXPORT_SYMBOL(ping_evictor_start); - -void ping_evictor_stop(void) -{ - if (--pet_refcount > 0) - return; - - pet_state = PET_TERMINATE; - cfs_waitq_signal(&pet_waitq); -} -EXPORT_SYMBOL(ping_evictor_stop); -#else /* !__KERNEL__ */ -#define ping_evictor_wake(exp) 1 -#endif - -/* This function makes sure dead exports are evicted in a timely manner. - This function is only called when some export receives a message (i.e., - the network is up.) */ -void class_update_export_timer(struct obd_export *exp, time_t extra_delay) -{ - struct obd_export *oldest_exp; - time_t oldest_time; - - ENTRY; - - LASSERT(exp); - - /* Compensate for slow machines, etc, by faking our request time - into the future. Although this can break the strict time-ordering - of the list, we can be really lazy here - we don't have to evict - at the exact right moment. Eventually, all silent exports - will make it to the top of the list. 
*/ - exp->exp_last_request_time = max(exp->exp_last_request_time, - (time_t)CURRENT_SECONDS + extra_delay); - - CDEBUG(D_INFO, "updating export %s at %ld\n", - exp->exp_client_uuid.uuid, - exp->exp_last_request_time); - - /* exports may get disconnected from the chain even though the - export has references, so we must keep the spin lock while - manipulating the lists */ - spin_lock(&exp->exp_obd->obd_dev_lock); - - if (list_empty(&exp->exp_obd_chain_timed)) { - /* this one is not timed */ - spin_unlock(&exp->exp_obd->obd_dev_lock); - EXIT; - return; - } - - list_move_tail(&exp->exp_obd_chain_timed, - &exp->exp_obd->obd_exports_timed); - - oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next, - struct obd_export, exp_obd_chain_timed); - oldest_time = oldest_exp->exp_last_request_time; - spin_unlock(&exp->exp_obd->obd_dev_lock); - - if (exp->exp_obd->obd_recovering) { - /* be nice to everyone during recovery */ - EXIT; - return; - } - - /* Note - racing to start/reset the obd_eviction timer is safe */ - if (exp->exp_obd->obd_eviction_timer == 0) { - /* Check if the oldest entry is expired. */ - if (CURRENT_SECONDS > (oldest_time + - (3 * obd_timeout / 2) + extra_delay)) { - /* We need a second timer, in case the net was down and - * it just came back. Since the pinger may skip every - * other PING_INTERVAL (see note in ptlrpc_pinger_main), - * we better wait for 3. */ - exp->exp_obd->obd_eviction_timer = CURRENT_SECONDS + - 3 * PING_INTERVAL; - CDEBUG(D_HA, "%s: Think about evicting %s from %ld\n", - exp->exp_obd->obd_name, obd_export_nid2str(exp), - oldest_time); - } - } else { - if (CURRENT_SECONDS > (exp->exp_obd->obd_eviction_timer + - extra_delay)) { - /* The evictor won't evict anyone who we've heard from - * recently, so we don't have to check before we start - * it. 
*/ - if (!ping_evictor_wake(exp)) - exp->exp_obd->obd_eviction_timer = 0; - } - } - - EXIT; -} -EXPORT_SYMBOL(class_update_export_timer); - #define EVICT_BATCH 32 int obd_export_evict_by_nid(struct obd_device *obd, char *nid) { diff --git a/lustre/obdclass/linux/linux-module.c b/lustre/obdclass/linux/linux-module.c index 9b95f02..517035c 100644 --- a/lustre/obdclass/linux/linux-module.c +++ b/lustre/obdclass/linux/linux-module.c @@ -258,7 +258,7 @@ static int obd_proc_read_health(char *page, char **start, off_t off, if (obd->obd_type == NULL) continue; - atomic_inc(&obd->obd_refcount); + class_incref(obd); spin_unlock(&obd_dev_lock); if (obd_health_check(obd)) { diff --git a/lustre/obdclass/linux/linux-obdo.c b/lustre/obdclass/linux/linux-obdo.c index f4e8fea..b5db22d 100644 --- a/lustre/obdclass/linux/linux-obdo.c +++ b/lustre/obdclass/linux/linux-obdo.c @@ -124,11 +124,6 @@ void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid) attr->ia_gid = oa->o_gid; attr->ia_valid |= ATTR_GID; } - - if (valid & OBD_MD_FLFLAGS) { - attr->ia_attr_flags = oa->o_flags; - attr->ia_valid |= ATTR_ATTR_FLAG; - } } EXPORT_SYMBOL(iattr_from_obdo); @@ -247,8 +242,12 @@ void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid) LTIME_S(dst->i_ctime) = src->o_ctime; if (valid & OBD_MD_FLSIZE) dst->i_size = src->o_size; - if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + if (valid & OBD_MD_FLBLOCKS) { /* allocation of space */ dst->i_blocks = src->o_blocks; + if (dst->i_blocks < src->o_blocks) /* overflow */ + dst->i_blocks = -1; + + } if (valid & OBD_MD_FLBLKSZ) dst->i_blksize = src->o_blksize; if (valid & OBD_MD_FLTYPE) diff --git a/lustre/obdclass/linux/linux-sysctl.c b/lustre/obdclass/linux/linux-sysctl.c index 0f5a25d..169aecb 100644 --- a/lustre/obdclass/linux/linux-sysctl.c +++ b/lustre/obdclass/linux/linux-sysctl.c @@ -107,8 +107,6 @@ static ctl_table obd_table[] = { &proc_dostring, &sysctl_string }, {OBD_MEMUSED, "memused", (int 
*)&obd_memory.counter, sizeof(int), 0644, NULL, &proc_dointvec}, - {OBD_SYNCFILTER, "filter_sync_on_commit", &obd_sync_filter, sizeof(int), - 0644, NULL, &proc_dointvec}, {OBD_LDLM_TIMEOUT, "ldlm_timeout", &ldlm_timeout, sizeof(int), 0644, NULL, &proc_set_timeout}, { 0 } diff --git a/lustre/obdclass/llog_lvfs.c b/lustre/obdclass/llog_lvfs.c index 594a00f..6d56707 100644 --- a/lustre/obdclass/llog_lvfs.c +++ b/lustre/obdclass/llog_lvfs.c @@ -653,9 +653,9 @@ static int llog_lvfs_destroy(struct llog_handle *handle) rc = llog_lvfs_close(handle); if (rc == 0) { - down(&inode->i_sem); + LOCK_INODE_MUTEX(inode); rc = vfs_unlink(inode, fdentry); - up(&inode->i_sem); + UNLOCK_INODE_MUTEX(inode); } dput(fdentry); diff --git a/lustre/obdclass/llog_obd.c b/lustre/obdclass/llog_obd.c index c987642..4833c29 100644 --- a/lustre/obdclass/llog_obd.c +++ b/lustre/obdclass/llog_obd.c @@ -55,7 +55,7 @@ int llog_setup(struct obd_device *obd, int index, struct obd_device *disk_obd, obd->obd_llog_ctxt[index] = ctxt; ctxt->loc_obd = obd; - ctxt->loc_exp = disk_obd->obd_self_export; + ctxt->loc_exp = class_export_get(disk_obd->obd_self_export); ctxt->loc_idx = index; ctxt->loc_logops = op; sema_init(&ctxt->loc_sem, 1); @@ -81,6 +81,8 @@ int llog_cleanup(struct llog_ctxt *ctxt) rc = CTXTP(ctxt, cleanup)(ctxt); ctxt->loc_obd->obd_llog_ctxt[ctxt->loc_idx] = NULL; + if (ctxt->loc_exp) + class_export_put(ctxt->loc_exp); OBD_FREE(ctxt, sizeof(*ctxt)); RETURN(rc); diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index c7ea976..59979690 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -329,7 +329,7 @@ int lprocfs_rd_server_uuid(char *page, char **start, off_t off, int count, imp_state_name = ptlrpc_import_state_name(imp->imp_state); *eof = 1; return snprintf(page, count, "%s\t%s%s\n", - imp->imp_target_uuid.uuid, imp_state_name, + obd2cli_tgt(obd), imp_state_name, imp->imp_deactive ? 
"\tDEACTIVATED" : ""); } @@ -361,6 +361,8 @@ static const char *obd_connect_names[] = { "initial_transno", "inode_bit_locks", "join_file", + "", + "no_oh_for_devices", NULL }; @@ -652,7 +654,7 @@ int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats) LPROCFS_OBD_OP_INIT(num_private_stats, stats, iocontrol); LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_info); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info_async); LPROCFS_OBD_OP_INIT(num_private_stats, stats, attach); LPROCFS_OBD_OP_INIT(num_private_stats, stats, detach); LPROCFS_OBD_OP_INIT(num_private_stats, stats, setup); diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index 8a729c3..6befd70 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ -115,7 +115,6 @@ int class_attach(struct lustre_cfg *lcfg) CFS_INIT_LIST_HEAD(&obd->obd_exports); CFS_INIT_LIST_HEAD(&obd->obd_exports_timed); - obd->obd_num_exports = 0; spin_lock_init(&obd->obd_dev_lock); spin_lock_init(&obd->obd_osfs_lock); obd->obd_osfs_age = cfs_time_shift(-1000); @@ -151,8 +150,8 @@ int class_attach(struct lustre_cfg *lcfg) obd->obd_attached = 1; type->typ_refcnt++; - CDEBUG(D_IOCTL, "OBD: dev %d attached type %s\n", - obd->obd_minor, typename); + CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n", + obd->obd_minor, typename, atomic_read(&obd->obd_refcount)); RETURN(0); out: switch (cleanup_phase) { @@ -214,7 +213,7 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) obd->obd_set_up = 1; spin_lock(&obd->obd_dev_lock); /* cleanup drops this */ - atomic_inc(&obd->obd_refcount); + class_incref(obd); spin_unlock(&obd->obd_dev_lock); CDEBUG(D_IOCTL, "finished setup of obd %s (uuid %s)\n", @@ -389,6 +388,15 @@ out: RETURN(err); } +struct obd_device *class_incref(struct obd_device *obd) +{ + atomic_inc(&obd->obd_refcount); + CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, 
obd, + atomic_read(&obd->obd_refcount)); + + return obd; +} + void class_decref(struct obd_device *obd) { int err; @@ -399,7 +407,7 @@ void class_decref(struct obd_device *obd) refs = atomic_read(&obd->obd_refcount); spin_unlock(&obd->obd_dev_lock); - CDEBUG(D_INFO, "Decref %s now %d\n", obd->obd_name, refs); + CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs); if ((refs == 1) && obd->obd_stopping) { /* All exports (other than the self-export) have been diff --git a/lustre/obdecho/echo_client.c b/lustre/obdecho/echo_client.c index ccfa21a..22d43f7 100644 --- a/lustre/obdecho/echo_client.c +++ b/lustre/obdecho/echo_client.c @@ -281,11 +281,11 @@ echo_get_object (struct ec_object **ecop, struct obd_device *obd, spin_lock (&ec->ec_lock); eco = echo_find_object_locked (obd, oa->o_id); if (eco != NULL) { - if (eco->eco_deleted) { /* being deleted */ - spin_unlock(&ec->ec_lock); /* (see comment in cleanup) */ + if (eco->eco_deleted) { /* being deleted */ + spin_unlock(&ec->ec_lock);/* (see comment in cleanup) */ return (-EAGAIN); } - + eco->eco_refcount++; spin_unlock (&ec->ec_lock); *ecop = eco; @@ -1353,7 +1353,8 @@ echo_client_setup(struct obd_device *obddev, obd_count len, void *buf) lustre_cfg_string(lcfg, 1)); return -ENOMEM; } - + + ocd->ocd_connect_flags = OBD_CONNECT_VERSION; ocd->ocd_version = LUSTRE_VERSION_CODE; rc = obd_connect(&conn, tgt, &echo_uuid, ocd); diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index c0b5263..458c355 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -24,12 +24,12 @@ */ /* - * Invariant: Get O/R i_sem for lookup, if needed, before any journal ops + * Invariant: Get O/R i_mutex for lookup, if needed, before any journal ops * (which need to get journal_lock, may block if journal full). * * Invariant: Call filter_start_transno() before any journal ops to avoid the * same deadlock problem. 
We can (and want) to get rid of the - * transno sem in favour of the dir/inode i_sem to avoid single + * transno sem in favour of the dir/inode i_mutex to avoid single * threaded operation on the OST. */ @@ -412,8 +412,10 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) GOTO(err_fsd, rc); } if (strcmp(fsd->fsd_uuid, obd->obd_uuid.uuid) != 0) { - CERROR("OBD UUID %s does not match last_rcvd UUID %s\n", - obd->obd_uuid.uuid, fsd->fsd_uuid); + LCONSOLE_ERROR("Trying to start OBD %s using the wrong" + " disk %s. Were the /dev/ assignments " + "rearranged?\n", + obd->obd_uuid.uuid, fsd->fsd_uuid); GOTO(err_fsd, rc = -EINVAL); } mount_count = le64_to_cpu(fsd->fsd_mount_count); @@ -648,10 +650,10 @@ static int filter_prep_groups(struct obd_device *obd) GOTO(cleanup_O0, rc = -EEXIST); } - down(&O_dentry->d_inode->i_sem); + LOCK_INODE_MUTEX(O_dentry->d_inode); rc = vfs_rename(O_dentry->d_inode, dentry, O_dentry->d_inode, O0_dentry); - up(&O_dentry->d_inode->i_sem); + UNLOCK_INODE_MUTEX(O_dentry->d_inode); if (rc) { CERROR("error renaming O/R to O/0: rc %d\n", rc); @@ -913,7 +915,7 @@ __u64 filter_last_id(struct filter_obd *filter, struct obdo *oa) static int filter_lock_dentry(struct obd_device *obd, struct dentry *dparent) { - down(&dparent->d_inode->i_sem); + LOCK_INODE_MUTEX(dparent->d_inode); return 0; } @@ -948,7 +950,7 @@ struct dentry *filter_parent_lock(struct obd_device *obd, obd_gr group, /* We never dget the object parent, so DON'T dput it either */ static void filter_parent_unlock(struct dentry *dparent) { - up(&dparent->d_inode->i_sem); + UNLOCK_INODE_MUTEX(dparent->d_inode); } /* How to get files, dentries, inodes from object id's. 
@@ -1045,9 +1047,10 @@ int filter_vfs_unlink(struct inode *dir, struct dentry *dentry) ENTRY; /* don't need dir->i_zombie for 2.4, it is for rename/unlink of dir - * itself we already hold dir->i_sem for child create/unlink ops */ - LASSERT(down_trylock(&dir->i_sem) != 0); - LASSERT(down_trylock(&dentry->d_inode->i_sem) != 0); + * itself we already hold dir->i_mutex for child create/unlink ops */ + LASSERT(TRYLOCK_INODE_MUTEX(dir) == 0); + LASSERT(TRYLOCK_INODE_MUTEX(dentry->d_inode) == 0); + /* may_delete() */ if (!dentry->d_inode || dentry->d_parent->d_inode != dir) @@ -1065,7 +1068,7 @@ int filter_vfs_unlink(struct inode *dir, struct dentry *dentry) IS_APPEND(dentry->d_inode) || IS_IMMUTABLE(dentry->d_inode)) GOTO(out, rc = -EPERM); - /* NOTE: This might need to go outside i_sem, though it isn't clear if + /* NOTE: This might need to go outside i_mutex, though it isn't clear if * that was done because of journal_start (which is already done * here) or some other ordering issue. */ DQUOT_INIT(dir); @@ -1078,8 +1081,8 @@ int filter_vfs_unlink(struct inode *dir, struct dentry *dentry) rc = dir->i_op->unlink(dir, dentry); out: - /* need to drop i_sem before we lose inode reference */ - up(&dentry->d_inode->i_sem); + /* need to drop i_mutex before we lose inode reference */ + UNLOCK_INODE_MUTEX(dentry->d_inode); if (rc == 0) d_delete(dentry); @@ -1087,7 +1090,7 @@ out: } /* Caller must hold LCK_PW on parent and push us into kernel context. - * Caller must hold child i_sem, we drop it always. + * Caller must hold child i_mutex, we drop it always. * Caller is also required to ensure that dchild->d_inode exists. 
*/ static int filter_destroy_internal(struct obd_device *obd, obd_id objid, struct dentry *dparent, @@ -1422,14 +1425,12 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf, LASSERT(!lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb))); obd->obd_replayable = 1; - obd_sync_filter = 1; if (lcfg->lcfg_bufcount > 3 && LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) { str = lustre_cfg_string(lcfg, 3); if (strchr(str, 'n')) { CWARN("%s: recovery disabled\n", obd->obd_name); obd->obd_replayable = 0; - obd_sync_filter = 0; } } @@ -1573,8 +1574,6 @@ static int filter_setup(struct obd_device *obd, obd_count len, void *buf) lproc_filter_attach_seqstat(obd); } - ping_evictor_start(); - return rc; } @@ -1630,17 +1629,23 @@ static int filter_llog_finish(struct obd_device *obd, int count) RETURN(rc); } -static int filter_precleanup(struct obd_device *obd, int stage) +static int filter_precleanup(struct obd_device *obd, + enum obd_cleanup_stage stage) { int rc = 0; ENTRY; switch(stage) { + case OBD_CLEANUP_EARLY: + break; case OBD_CLEANUP_EXPORTS: target_cleanup_recovery(obd); break; case OBD_CLEANUP_SELF_EXP: rc = filter_llog_finish(obd, 0); + break; + case OBD_CLEANUP_OBD: + break; } RETURN(rc); } @@ -1665,8 +1670,6 @@ static int filter_cleanup(struct obd_device *obd) } } - ping_evictor_stop(); - lquota_cleanup(quota_interface, obd); ldlm_namespace_free(obd->obd_namespace, obd->obd_force); @@ -1966,6 +1969,9 @@ static int filter_destroy_export(struct obd_export *exp) target_destroy_export(exp); + if (obd_uuid_equals(&exp->exp_client_uuid, &exp->exp_obd->obd_uuid)) + RETURN(0); + if (exp->exp_obd->obd_replayable) filter_client_free(exp); else @@ -2134,7 +2140,7 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry, } if (ia_valid & ATTR_SIZE || ia_valid & (ATTR_UID | ATTR_GID)) { - down(&inode->i_sem); + LOCK_INODE_MUTEX(inode); locked = 1; } @@ -2195,7 +2201,7 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry, } if (locked) { - 
up(&inode->i_sem); + UNLOCK_INODE_MUTEX(inode); locked = 0; } @@ -2210,7 +2216,7 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry, EXIT; out_unlock: if (locked) - up(&inode->i_sem); + UNLOCK_INODE_MUTEX(inode); /* trigger quota release */ if (ia_valid & (ATTR_SIZE | ATTR_UID | ATTR_GID)) { @@ -2680,7 +2686,7 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa, unsigned int qcids[MAXQUOTAS] = {0, 0}; struct obd_device *obd; struct filter_obd *filter; - struct dentry *dchild = NULL, *dparent; + struct dentry *dchild = NULL, *dparent = NULL; struct lvfs_run_ctxt saved; void *handle = NULL; struct llog_cookie *fcc = NULL; @@ -2731,11 +2737,11 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa, * restart transaction * (see BUG 4180) -bzzz */ - down(&dchild->d_inode->i_sem); + LOCK_INODE_MUTEX(dchild->d_inode); handle = fsfilt_start_log(obd, dchild->d_inode, FSFILT_OP_SETATTR, NULL, 1); if (IS_ERR(handle)) { - up(&dchild->d_inode->i_sem); + UNLOCK_INODE_MUTEX(dchild->d_inode); GOTO(cleanup, rc = PTR_ERR(handle)); } @@ -2743,7 +2749,7 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa, iattr.ia_size = 0; rc = fsfilt_setattr(obd, dchild, handle, &iattr, 1); rc2 = fsfilt_commit(obd, dchild->d_inode, handle, 0); - up(&dchild->d_inode->i_sem); + UNLOCK_INODE_MUTEX(dchild->d_inode); if (rc) GOTO(cleanup, rc); if (rc2) @@ -2758,10 +2764,10 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa, GOTO(cleanup, rc = PTR_ERR(dparent)); cleanup_phase = 3; /* filter_parent_unlock */ - down(&dchild->d_inode->i_sem); + LOCK_INODE_MUTEX(dchild->d_inode); handle = fsfilt_start_log(obd, dparent->d_inode,FSFILT_OP_UNLINK,oti,1); if (IS_ERR(handle)) { - up(&dchild->d_inode->i_sem); + UNLOCK_INODE_MUTEX(dchild->d_inode); GOTO(cleanup, rc = PTR_ERR(handle)); } cleanup_phase = 4; /* fsfilt_commit */ @@ -2769,7 +2775,7 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa, /* Quota release need uid/gid of inode */ 
obdo_from_inode(oa, dchild->d_inode, OBD_MD_FLUID|OBD_MD_FLGID); - /* this drops dchild->d_inode->i_sem unconditionally */ + /* this drops dchild->d_inode->i_mutex unconditionally */ rc = filter_destroy_internal(obd, oa->o_id, dparent, dchild); EXIT; @@ -2859,7 +2865,8 @@ static int filter_sync(struct obd_export *exp, struct obdo *oa, push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL); - down(&dentry->d_inode->i_sem); + LOCK_INODE_MUTEX(dentry->d_inode); + rc = filemap_fdatawrite(dentry->d_inode->i_mapping); if (rc == 0) { /* just any file to grab fsync method - "file" arg unused */ @@ -2872,7 +2879,7 @@ static int filter_sync(struct obd_export *exp, struct obdo *oa, if (!rc) rc = rc2; } - up(&dentry->d_inode->i_sem); + UNLOCK_INODE_MUTEX(dentry->d_inode); oa->o_valid = OBD_MD_FLID; obdo_from_inode(oa, dentry->d_inode, FILTER_VALID_FLAGS); @@ -2921,8 +2928,9 @@ static int filter_get_info(struct obd_export *exp, __u32 keylen, RETURN(-EINVAL); } -static int filter_set_info(struct obd_export *exp, __u32 keylen, - void *key, __u32 vallen, void *val) +static int filter_set_info_async(struct obd_export *exp, __u32 keylen, + void *key, __u32 vallen, void *val, + struct ptlrpc_request_set *set) { struct obd_device *obd; struct llog_ctxt *ctxt; @@ -3051,7 +3059,7 @@ static struct lvfs_callback_ops filter_lvfs_ops = { static struct obd_ops filter_obd_ops = { .o_owner = THIS_MODULE, .o_get_info = filter_get_info, - .o_set_info = filter_set_info, + .o_set_info_async = filter_set_info_async, .o_setup = filter_setup, .o_precleanup = filter_precleanup, .o_cleanup = filter_cleanup, @@ -3080,7 +3088,7 @@ static struct obd_ops filter_obd_ops = { static struct obd_ops filter_sanobd_ops = { .o_owner = THIS_MODULE, .o_get_info = filter_get_info, - .o_set_info = filter_set_info, + .o_set_info_async = filter_set_info_async, .o_setup = filter_san_setup, .o_precleanup = filter_precleanup, .o_cleanup = filter_cleanup, diff --git a/lustre/obdfilter/filter_io_24.c 
b/lustre/obdfilter/filter_io_24.c index d369be3..ae83fb9 100644 --- a/lustre/obdfilter/filter_io_24.c +++ b/lustre/obdfilter/filter_io_24.c @@ -467,7 +467,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount, CERROR("Failure to commit OST transaction (%d)?\n", err); rc = err; } - if (obd_sync_filter && !err) + if (obd->obd_replayable && !err) LASSERTF(oti->oti_transno <= obd->obd_last_committed, "oti_transno "LPU64" last_committed "LPU64"\n", oti->oti_transno, obd->obd_last_committed); diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c index b96eebb..b9975fc 100644 --- a/lustre/obdfilter/filter_io_26.c +++ b/lustre/obdfilter/filter_io_26.c @@ -390,11 +390,15 @@ static int filter_clear_page_cache(struct inode *inode, rc = generic_osync_inode(inode, inode->i_mapping, OSYNC_DATA|OSYNC_METADATA); */ + down(&inode->i_sem); + current->flags |= PF_SYNCWRITE; rc = filemap_fdatawrite(inode->i_mapping); rc2 = sync_mapping_buffers(inode->i_mapping); if (rc == 0) rc = rc2; rc2 = filemap_fdatawait(inode->i_mapping); + current->flags &= ~PF_SYNCWRITE; + up(&inode->i_sem); if (rc == 0) rc = rc2; if (rc != 0) @@ -419,7 +423,7 @@ static int filter_clear_page_cache(struct inode *inode, return 0; } -/* Must be called with i_sem taken for writes; this will drop it */ +/* Must be called with i_mutex taken for writes; this will drop it */ int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *iobuf, struct obd_export *exp, struct iattr *attr, struct obd_trans_info *oti, void **wait_handle) @@ -479,7 +483,7 @@ remap: oti->oti_handle, attr, 0); } - up(&inode->i_sem); + UNLOCK_INODE_MUTEX(inode); rc2 = filter_finish_transno(exp, oti, 0); if (rc2 != 0) { @@ -593,12 +597,12 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); cleanup_phase = 2; - down(&inode->i_sem); - fsfilt_check_slow(now, obd_timeout, "i_sem"); + LOCK_INODE_MUTEX(inode); + 
fsfilt_check_slow(now, obd_timeout, "i_mutex"); oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, res, oti); if (IS_ERR(oti->oti_handle)) { - up(&inode->i_sem); + UNLOCK_INODE_MUTEX(inode); rc = PTR_ERR(oti->oti_handle); CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR, "error starting transaction: rc = %d\n", rc); @@ -637,7 +641,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, rc = filter_update_fidea(exp, inode, oti->oti_handle, oa); } - /* filter_direct_io drops i_sem */ + /* filter_direct_io drops i_mutex */ rc = filter_direct_io(OBD_BRW_WRITE, res->dentry, iobuf, exp, &iattr, oti, &wait_handle); if (rc == 0) @@ -654,7 +658,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, if (err) rc = err; - if (obd_sync_filter && !err) + if (obd->obd_replayable && !err) LASSERTF(oti->oti_transno <= obd->obd_last_committed, "oti_transno "LPU64" last_committed "LPU64"\n", oti->oti_transno, obd->obd_last_committed); diff --git a/lustre/obdfilter/filter_log.c b/lustre/obdfilter/filter_log.c index 4a797c9..c61be24 100644 --- a/lustre/obdfilter/filter_log.c +++ b/lustre/obdfilter/filter_log.c @@ -51,14 +51,14 @@ int filter_log_sz_change(struct llog_handle *cathandle, struct ost_filterdata *ofd; ENTRY; - down(&inode->i_sem); + LOCK_INODE_MUTEX(inode); ofd = inode->i_filterdata; if (ofd && ofd->ofd_epoch >= io_epoch) { if (ofd->ofd_epoch > io_epoch) CERROR("client sent old epoch %d for obj ino %ld\n", io_epoch, inode->i_ino); - up(&inode->i_sem); + UNLOCK_INODE_MUTEX(inode); RETURN(0); } @@ -73,7 +73,7 @@ int filter_log_sz_change(struct llog_handle *cathandle, ofd->ofd_epoch = io_epoch; } /* the decision to write a record is now made, unlock */ - up(&inode->i_sem); + UNLOCK_INODE_MUTEX(inode); OBD_ALLOC(lsc, sizeof(*lsc)); if (lsc == NULL) diff --git a/lustre/obdfilter/filter_lvb.c b/lustre/obdfilter/filter_lvb.c index c95d295..06946fe 100644 --- a/lustre/obdfilter/filter_lvb.c +++ b/lustre/obdfilter/filter_lvb.c @@ -49,7 +49,7 @@ 
static int filter_lvbo_init(struct ldlm_resource *res) ENTRY; LASSERT(res); - LASSERT(down_trylock(&res->lr_lvb_sem) != 0); + LASSERT_SEM_LOCKED(&res->lr_lvb_sem); /* we only want lvb's for object resources */ /* check for internal locks: these have name[1] != 0 */ diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c index 7a5df98..764c55c 100644 --- a/lustre/osc/lproc_osc.c +++ b/lustre/osc/lproc_osc.c @@ -86,6 +86,7 @@ static int osc_wr_max_rpcs_in_flight(struct file *file, const char *buffer, { struct obd_device *dev = data; struct client_obd *cli = &dev->u.cli; + struct ptlrpc_request_pool *pool = cli->cl_import->imp_rq_pool; int val, rc; rc = lprocfs_write_helper(buffer, count, &val); @@ -95,9 +96,8 @@ static int osc_wr_max_rpcs_in_flight(struct file *file, const char *buffer, if (val < 1 || val > OSC_MAX_RIF_MAX) return -ERANGE; - if (cli->cl_rq_pool && val > cli->cl_max_rpcs_in_flight) - cli->cl_rq_pool->prp_populate(cli->cl_rq_pool, - val - cli->cl_max_rpcs_in_flight); + if (pool && val > cli->cl_max_rpcs_in_flight) + pool->prp_populate(pool, val-cli->cl_max_rpcs_in_flight); client_obd_list_lock(&cli->cl_loi_list_lock); cli->cl_max_rpcs_in_flight = val; diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c index 491f2d9..d21c3e8 100644 --- a/lustre/osc/osc_create.c +++ b/lustre/osc/osc_create.c @@ -80,7 +80,7 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc) spin_unlock(&oscc->oscc_lock); DEBUG_REQ(D_ERROR, req, "unknown rc %d from async create: failing oscc", rc); - ptlrpc_fail_import(req->rq_import, req->rq_import_generation); + ptlrpc_fail_import(req->rq_import, req->rq_reqmsg->conn_cnt); } else { if (rc == 0) { oscc->oscc_flags &= ~OSCC_FLAG_LOW; @@ -347,8 +347,7 @@ int osc_create(struct obd_export *exp, struct obdo *oa, if (rc == 0) CDEBUG(D_HA, "%s: returning objid "LPU64"\n", - oscc->oscc_obd->u.cli.cl_import->imp_target_uuid.uuid, - lsm->lsm_object_id); + obd2cli_tgt(oscc->oscc_obd), 
lsm->lsm_object_id); else if (*ea == NULL) obd_free_memmd(exp, &lsm); RETURN(rc); diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 60cc4be..0f2321a 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -773,7 +773,7 @@ static int osc_brw_prep_request(int cmd, struct obd_import *imp,struct obdo *oa, ENTRY; opc = ((cmd & OBD_BRW_WRITE) != 0) ? OST_WRITE : OST_READ; - pool = ((cmd & OBD_BRW_WRITE) != 0) ? cli->cl_rq_pool : NULL; + pool = ((cmd & OBD_BRW_WRITE) != 0) ? imp->imp_rq_pool : NULL; for (niocount = i = 1; i < page_count; i++) if (!can_merge_pages(&pga[i - 1], &pga[i])) @@ -826,9 +826,9 @@ static int osc_brw_prep_request(int cmd, struct obd_import *imp,struct obdo *oa, "i %d p_c %u pg %p [pri %lu ind %lu] off "LPU64 " prev_pg %p [pri %lu ind %lu] off "LPU64"\n", i, page_count, - pg->pg, pg->pg->private, pg->pg->index, pg->off, - pg_prev->pg, pg_prev->pg->private, pg_prev->pg->index, - pg_prev->off); + pg->pg, page_private(pg->pg), pg->pg->index, pg->off, + pg_prev->pg, page_private(pg_prev->pg), + pg_prev->pg->index, pg_prev->off); #else LASSERTF(i == 0 || pg->off > pg_prev->off, "i %d p_c %u\n", i, page_count); @@ -1200,14 +1200,12 @@ static int osc_brw(int cmd, struct obd_export *exp, struct obdo *oa, *oa = *saved_oa; } else if (page_count > pages_per_brw) { /* save a copy of oa (brw will clobber it) */ - OBD_ALLOC(saved_oa, sizeof(*saved_oa)); - if (saved_oa == NULL) { - CERROR("Can't save oa (ENOMEM)\n"); + saved_oa = obdo_alloc(); + if (saved_oa == NULL) RETURN(-ENOMEM); - } *saved_oa = *oa; } - + rc = osc_brw_internal(cmd, exp, oa, md, pages_per_brw, pga); if (rc != 0) @@ -1218,7 +1216,7 @@ static int osc_brw(int cmd, struct obd_export *exp, struct obdo *oa, } if (saved_oa != NULL) - OBD_FREE(saved_oa, sizeof(*saved_oa)); + obdo_free(saved_oa); RETURN(rc); } @@ -1374,12 +1372,11 @@ static void osc_occ_interrupted(struct oig_callback_context *occ) GOTO(unlock, 0); } - /* we don't get interruption callbacks until 
osc_trigger_sync_io() + /* we don't get interruption callbacks until osc_trigger_group_io() * has been called and put the sync oaps in the pending/urgent lists.*/ if (!list_empty(&oap->oap_pending_item)) { list_del_init(&oap->oap_pending_item); - if (oap->oap_async_flags & ASYNC_URGENT) - list_del_init(&oap->oap_urgent_item); + list_del_init(&oap->oap_urgent_item); loi = oap->oap_loi; lop = (oap->oap_cmd & OBD_BRW_WRITE) ? @@ -2262,7 +2259,8 @@ static void osc_group_to_pending(struct client_obd *cli, struct lov_oinfo *loi, oap = list_entry(pos, struct osc_async_page, oap_pending_item); list_del(&oap->oap_pending_item); list_add_tail(&oap->oap_pending_item, &lop->lop_pending); - list_add(&oap->oap_urgent_item, &lop->lop_urgent); + if (oap->oap_async_flags & ASYNC_URGENT) + list_add(&oap->oap_urgent_item, &lop->lop_urgent); lop_update_pending(cli, lop, cmd, 1); } loi_list_maint(cli, loi); @@ -2479,7 +2477,6 @@ static int sanosc_brw_write(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md *lsm, obd_count page_count, struct brw_page *pga) { - struct client_obd *cli = &exp->exp_obd->u.cli; struct ptlrpc_request *request = NULL; struct ost_body *body; struct niobuf_remote *nioptr; @@ -2494,7 +2491,7 @@ static int sanosc_brw_write(struct obd_export *exp, struct obdo *oa, request = ptlrpc_prep_req_pool(class_exp2cliimp(exp), LUSTRE_OST_VERSION, OST_SAN_WRITE, - 3, size, NULL, cli->cl_rq_pool); + 3, size, NULL, imp->imp_rq_pool); if (!request) RETURN(-ENOMEM); @@ -2692,8 +2689,8 @@ static int osc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm, goto no_match; /* Next, search for already existing extent locks that will cover us */ - rc = ldlm_lock_match(obd->obd_namespace, 0, &res_id, type, policy, mode, - lockh); + rc = ldlm_lock_match(obd->obd_namespace, *flags, &res_id, type, policy, + mode, lockh); if (rc == 1) { osc_set_data_with_check(lockh, data, *flags); if (*flags & LDLM_FL_HAS_INTENT) { @@ -2718,7 +2715,7 @@ static int osc_enqueue(struct 
obd_export *exp, struct lov_stripe_md *lsm, * locks out from other users right now, too. */ if (mode == LCK_PR) { - rc = ldlm_lock_match(obd->obd_namespace, 0, &res_id, type, + rc = ldlm_lock_match(obd->obd_namespace, *flags, &res_id, type, policy, LCK_PW, lockh); if (rc == 1) { /* FIXME: This is not incredibly elegant, but it might @@ -2745,6 +2742,9 @@ static int osc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm, req->rq_replen = lustre_msg_size(2, size); } + /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */ + *flags &= ~LDLM_FL_BLOCK_GRANTED; + rc = ldlm_cli_enqueue(exp, req, obd->obd_namespace, res_id, type, policy, mode, flags, bl_cb, cp_cb, gl_cb, data, &lvb, sizeof(lvb), lustre_swab_ost_lvb, lockh); @@ -3065,14 +3065,40 @@ static int osc_get_info(struct obd_export *exp, obd_count keylen, RETURN(-EINVAL); } -static int osc_set_info(struct obd_export *exp, obd_count keylen, - void *key, obd_count vallen, void *val) +static int osc_setinfo_mds_conn_interpret(struct ptlrpc_request *req, + void *aa, int rc) +{ + struct llog_ctxt *ctxt; + struct obd_import *imp = req->rq_import; + ENTRY; + + if (rc != 0) + RETURN(rc); + + ctxt = llog_get_context(imp->imp_obd, LLOG_MDS_OST_ORIG_CTXT); + if (ctxt) { + if (rc == 0) + rc = llog_initiator_connect(ctxt); + else + CERROR("cannot establish connection for " + "ctxt %p: %d\n", ctxt, rc); + } + + imp->imp_server_timeout = 1; + CDEBUG(D_HA, "pinging OST %s\n", obd2cli_tgt(imp->imp_obd)); + imp->imp_pingable = 1; + + RETURN(rc); +} + +static int osc_set_info_async(struct obd_export *exp, obd_count keylen, + void *key, obd_count vallen, void *val, + struct ptlrpc_request_set *set) { struct ptlrpc_request *req; struct obd_device *obd = exp->exp_obd; struct obd_import *imp = class_exp2cliimp(exp); - struct llog_ctxt *ctxt; - int rc, size[2] = {keylen, vallen}; + int size[2] = {keylen, vallen}; char *bufs[2] = {key, val}; ENTRY; @@ -3088,7 +3114,7 @@ static int osc_set_info(struct obd_export *exp, 
obd_count keylen, RETURN(0); } - + if (KEY_IS("unlinked")) { struct osc_creator *oscc = &obd->u.cli.cl_oscc; spin_lock(&oscc->oscc_lock); @@ -3098,7 +3124,6 @@ static int osc_set_info(struct obd_export *exp, obd_count keylen, } if (KEY_IS("initial_recov")) { - struct obd_import *imp = exp->exp_obd->u.cli.cl_import; if (vallen != sizeof(int)) RETURN(-EINVAL); imp->imp_initial_recov = *(int *)val; @@ -3115,9 +3140,15 @@ static int osc_set_info(struct obd_export *exp, obd_count keylen, RETURN(0); } - if (!KEY_IS("mds_conn") && !KEY_IS("evict_by_nid")) + if (!set) RETURN(-EINVAL); + /* We pass all other commands directly to OST. Since nobody calls osc + methods directly and everybody is supposed to go through LOV, we + assume lov checked invalid values for us. + The only recognised values so far are evict_by_nid and mds_conn. + Even if something bad goes through, we'd get a -EINVAL from OST + anyway. */ req = ptlrpc_prep_req(imp, LUSTRE_OST_VERSION, OST_SET_INFO, 2, size, bufs); @@ -3125,23 +3156,13 @@ static int osc_set_info(struct obd_export *exp, obd_count keylen, RETURN(-ENOMEM); req->rq_replen = lustre_msg_size(0, NULL); - rc = ptlrpc_queue_wait(req); - ptlrpc_req_finished(req); - ctxt = llog_get_context(exp->exp_obd, LLOG_MDS_OST_ORIG_CTXT); - if (ctxt) { - if (rc == 0) - rc = llog_initiator_connect(ctxt); - else - CERROR("cannot establish connection for ctxt %p: %d\n", - ctxt, rc); - } - - imp->imp_server_timeout = 1; - CDEBUG(D_HA, "pinging OST %s\n", imp->imp_target_uuid.uuid); - imp->imp_pingable = 1; + if (KEY_IS("mds_conn")) + req->rq_interpret_reply = osc_setinfo_mds_conn_interpret; + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(set); - RETURN(rc); + RETURN(0); } @@ -3310,6 +3331,7 @@ static int osc_import_event(struct obd_device *obd, int osc_setup(struct obd_device *obd, obd_count len, void *buf) { int rc; + ENTRY; ENTRY; rc = ptlrpcd_addref(); @@ -3335,15 +3357,16 @@ int osc_setup(struct obd_device *obd, obd_count len, void *buf) previous ones. 
Ideally we want to have 2x max_rpcs_in_flight reserved, but I afraid that might be too much wasted RAM in fact, so 2 is just my guess and still should work. */ - cli->cl_rq_pool = ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2, - OST_MAXREQSIZE, - ptlrpc_add_rqs_to_pool); + cli->cl_import->imp_rq_pool = + ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2, + OST_MAXREQSIZE, + ptlrpc_add_rqs_to_pool); } RETURN(rc); } -static int osc_precleanup(struct obd_device *obd, int stage) +static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) { int rc = 0; ENTRY; @@ -3357,10 +3380,15 @@ static int osc_precleanup(struct obd_device *obd, int stage) ptlrpc_deactivate_import(imp); break; } + case OBD_CLEANUP_EXPORTS: + break; case OBD_CLEANUP_SELF_EXP: rc = obd_llog_finish(obd, 0); if (rc != 0) CERROR("failed to cleanup llogging subsystems\n"); + break; + case OBD_CLEANUP_OBD: + break; } RETURN(rc); } @@ -3368,7 +3396,6 @@ static int osc_precleanup(struct obd_device *obd, int stage) int osc_cleanup(struct obd_device *obd) { struct osc_creator *oscc = &obd->u.cli.cl_oscc; - struct client_obd *cli = &obd->u.cli; int rc; ENTRY; @@ -3385,8 +3412,6 @@ int osc_cleanup(struct obd_device *obd) rc = client_obd_cleanup(obd); - ptlrpc_free_rq_pool(cli->cl_rq_pool); - ptlrpcd_decref(); RETURN(rc); } @@ -3429,7 +3454,7 @@ struct obd_ops osc_obd_ops = { .o_join_lru = osc_join_lru, .o_iocontrol = osc_iocontrol, .o_get_info = osc_get_info, - .o_set_info = osc_set_info, + .o_set_info_async = osc_set_info_async, .o_import_event = osc_import_event, .o_llog_init = osc_llog_init, .o_llog_finish = osc_llog_finish, @@ -3438,7 +3463,9 @@ struct obd_ops osc_obd_ops = { #if defined(__KERNEL__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) struct obd_ops sanosc_obd_ops = { .o_owner = THIS_MODULE, - .o_cleanup = client_obd_cleanup, + .o_setup = client_sanobd_setup, + .o_precleanup = osc_precleanup, + .o_cleanup = osc_cleanup, .o_add_conn = client_import_add_conn, .o_del_conn = 
client_import_del_conn, .o_connect = client_connect_import, @@ -3452,7 +3479,6 @@ struct obd_ops sanosc_obd_ops = { .o_getattr = osc_getattr, .o_getattr_async = osc_getattr_async, .o_setattr = osc_setattr, - .o_setup = client_sanobd_setup, .o_brw = sanosc_brw, .o_punch = osc_punch, .o_sync = osc_sync, @@ -3469,7 +3495,6 @@ struct obd_ops sanosc_obd_ops = { }; #endif -static quota_interface_t *quota_interface; extern quota_interface_t osc_quota_interface; int __init osc_init(void) diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 562eb09..2cc87af 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -1146,7 +1146,7 @@ static int ost_set_info(struct obd_export *exp, struct ptlrpc_request *req) GOTO(out, rc = 0); } - rc = obd_set_info(exp, keylen, key, vallen, val); + rc = obd_set_info_async(exp, keylen, key, vallen, val, NULL); out: req->rq_repmsg->status = 0; RETURN(rc); @@ -1677,6 +1677,8 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf) if (rc) GOTO(out_io, rc = -EINVAL); + ping_evictor_start(); + RETURN(0); out_io: @@ -1699,6 +1701,8 @@ static int ost_cleanup(struct obd_device *obd) int err = 0; ENTRY; + ping_evictor_stop(); + spin_lock_bh(&obd->obd_processing_task_lock); if (obd->obd_recovering) { target_cancel_recovery_timer(obd); diff --git a/lustre/ptlrpc/autoMakefile.am b/lustre/ptlrpc/autoMakefile.am index 9fd2781..3540330 100644 --- a/lustre/ptlrpc/autoMakefile.am +++ b/lustre/ptlrpc/autoMakefile.am @@ -24,7 +24,7 @@ if LIBLUSTRE noinst_LIBRARIES = libptlrpc.a libptlrpc_a_SOURCES = $(COMMON_SOURCES) -libptlrpc_a_CPPFLAGS = $(LLCPPFLGS) +libptlrpc_a_CPPFLAGS = $(LLCPPFLAGS) libptlrpc_a_CFLAGS = $(LLCFLAGS) endif @@ -71,6 +71,5 @@ endif # DARWIN endif # MODULES install-data-hook: $(install_data_hook) - -MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ ldlm_*.c l_lock.c DIST_SOURCES = $(ptlrpc_objs:.o=.c) ptlrpc_internal.h +MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ ldlm_*.c l_lock.c diff --git 
a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index feb77f2..2732e53 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -464,6 +464,7 @@ void ptlrpc_set_add_req(struct ptlrpc_request_set *set, list_add_tail(&req->rq_set_chain, &set->set_requests); req->rq_set = set; set->set_remaining++; + atomic_inc(&req->rq_import->imp_inflight); } @@ -652,9 +653,6 @@ static int after_reply(struct ptlrpc_request *req) spin_lock_irqsave(&imp->imp_lock, flags); } - if (req->rq_transno > imp->imp_max_transno) - imp->imp_max_transno = req->rq_transno; - /* Replay-enabled imports return commit-status information. */ if (req->rq_repmsg->last_committed) imp->imp_peer_committed_transno = @@ -987,7 +985,7 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req) RETURN(1); } - ptlrpc_fail_import(imp, req->rq_import_generation); + ptlrpc_fail_import(imp, req->rq_reqmsg->conn_cnt); RETURN(0); } @@ -1093,7 +1091,9 @@ int ptlrpc_set_wait(struct ptlrpc_request_set *set) int rc, timeout; ENTRY; - LASSERT(!list_empty(&set->set_requests)); + if (list_empty(&set->set_requests)) + RETURN(0); + list_for_each(tmp, &set->set_requests) { req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); if (req->rq_phase == RQ_PHASE_NEW) @@ -1309,8 +1309,19 @@ void ptlrpc_free_committed(struct obd_import *imp) LASSERT_SPIN_LOCKED(&imp->imp_lock); - CDEBUG(D_HA, "%s: committing for last_committed "LPU64"\n", - imp->imp_obd->obd_name, imp->imp_peer_committed_transno); + + if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked && + imp->imp_generation == imp->imp_last_generation_checked) { + CDEBUG(D_HA, "%s: skip recheck for last_committed "LPU64"\n", + imp->imp_obd->obd_name, imp->imp_peer_committed_transno); + return; + } + + CDEBUG(D_HA, "%s: committing for last_committed "LPU64" gen %d\n", + imp->imp_obd->obd_name, imp->imp_peer_committed_transno, + imp->imp_generation); + imp->imp_last_transno_checked = imp->imp_peer_committed_transno; + 
imp->imp_last_generation_checked = imp->imp_generation; list_for_each_safe(tmp, saved, &imp->imp_replay_list) { req = list_entry(tmp, struct ptlrpc_request, rq_replay_list); diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index e12523e..cb657df 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -59,12 +59,14 @@ void request_out_callback(lnet_event_t *ev) spin_lock_irqsave(&req->rq_lock, flags); req->rq_net_err = 1; spin_unlock_irqrestore(&req->rq_lock, flags); - + ptlrpc_wake_client_req(req); } - /* this balances the atomic_inc in ptl_send_rpc() */ + /* these balance the references in ptl_send_rpc() */ + atomic_dec(&req->rq_import->imp_inflight); ptlrpc_req_finished(req); + EXIT; } diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 5ddfbbc..715f65b 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -49,7 +49,7 @@ struct ptlrpc_connect_async_args { do { \ if (imp->imp_state != LUSTRE_IMP_CLOSED) { \ CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", \ - imp, imp->imp_target_uuid.uuid, \ + imp, obd2cli_tgt(imp->imp_obd), \ ptlrpc_import_state_name(imp->imp_state), \ ptlrpc_import_state_name(state)); \ imp->imp_state = state; \ @@ -107,19 +107,27 @@ static void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uu /* Returns true if import was FULL, false if import was already not * connected. + * @imp - import to be disconnected + * @conn_cnt - connection count (epoch) of the request that timed out + * and caused the disconnection. In some cases, multiple + * inflight requests can fail to a single target (e.g. OST + * bulk requests) and if one has already caused a reconnection + * (increasing the import->conn_cnt) the older failure should + * not also cause a reconnection. If zero it forces a reconnect. 
*/ -int ptlrpc_set_import_discon(struct obd_import *imp) +int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt) { unsigned long flags; int rc = 0; spin_lock_irqsave(&imp->imp_lock, flags); - if (imp->imp_state == LUSTRE_IMP_FULL) { + if (imp->imp_state == LUSTRE_IMP_FULL && + (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) { char *target_start; int target_len; - deuuidify(imp->imp_target_uuid.uuid, NULL, + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, &target_start, &target_len); LCONSOLE_ERROR("%s: Connection to service %.*s via nid %s was " @@ -130,18 +138,22 @@ int ptlrpc_set_import_discon(struct obd_import *imp) imp->imp_replayable ? "wait for recovery to complete" : "fail"); + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); + spin_unlock_irqrestore(&imp->imp_lock, flags); + if (obd_dump_on_timeout) libcfs_debug_dumplog(); - IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); - spin_unlock_irqrestore(&imp->imp_lock, flags); obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); rc = 1; } else { spin_unlock_irqrestore(&imp->imp_lock, flags); - CDEBUG(D_HA, "%p %s: import already not connected: %s\n", - imp,imp->imp_client->cli_name, - ptlrpc_import_state_name(imp->imp_state)); + CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n", + imp->imp_client->cli_name, imp, + (imp->imp_state == LUSTRE_IMP_FULL && + imp->imp_conn_cnt > conn_cnt) ? 
+ "reconnected" : "not connected", imp->imp_conn_cnt, + conn_cnt, ptlrpc_import_state_name(imp->imp_state)); } return rc; @@ -157,7 +169,7 @@ void ptlrpc_deactivate_import(struct obd_import *imp) ENTRY; spin_lock_irqsave(&imp->imp_lock, flags); - CDEBUG(D_HA, "setting import %s INVALID\n", imp->imp_target_uuid.uuid); + CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd)); imp->imp_invalid = 1; imp->imp_generation++; spin_unlock_irqrestore(&imp->imp_lock, flags); @@ -191,7 +203,7 @@ void ptlrpc_invalidate_import(struct obd_import *imp) if (rc) CERROR("%s: rc = %d waiting for callback (%d != 0)\n", - imp->imp_target_uuid.uuid, rc, + obd2cli_tgt(imp->imp_obd), rc, atomic_read(&imp->imp_inflight)); obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE); @@ -209,26 +221,26 @@ void ptlrpc_activate_import(struct obd_import *imp) obd_import_event(obd, imp, IMP_EVENT_ACTIVE); } -void ptlrpc_fail_import(struct obd_import *imp, int generation) +void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt) { ENTRY; - LASSERT (!imp->imp_dlm_fake); + LASSERT(!imp->imp_dlm_fake); - if (ptlrpc_set_import_discon(imp)) { + if (ptlrpc_set_import_discon(imp, conn_cnt)) { unsigned long flags; if (!imp->imp_replayable) { CDEBUG(D_HA, "import %s@%s for %s not replayable, " "auto-deactivating\n", - imp->imp_target_uuid.uuid, + obd2cli_tgt(imp->imp_obd), imp->imp_connection->c_remote_uuid.uuid, imp->imp_obd->obd_name); ptlrpc_deactivate_import(imp); } CDEBUG(D_HA, "%s: waking up pinger\n", - imp->imp_target_uuid.uuid); + obd2cli_tgt(imp->imp_obd)); spin_lock_irqsave(&imp->imp_lock, flags); imp->imp_force_verify = 1; @@ -294,11 +306,11 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid) int rc; __u64 committed_before_reconnect = 0; struct ptlrpc_request *request; - int size[] = {sizeof(imp->imp_target_uuid), + int size[] = {sizeof(imp->imp_obd->u.cli.cl_target_uuid), sizeof(obd->obd_uuid), sizeof(imp->imp_dlm_handle), sizeof(imp->imp_connect_data)}; 
- char *tmp[] = {imp->imp_target_uuid.uuid, + char *tmp[] = {obd2cli_tgt(imp->imp_obd), obd->obd_uuid.uuid, (char *)&imp->imp_dlm_handle, (char *)&imp->imp_connect_data}; @@ -350,12 +362,12 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid) /* last in list */ (imp->imp_conn_current->oic_item.next == &imp->imp_conn_list)) { CDEBUG(D_HA, "Last connection attempt (%d) for %s\n", - imp->imp_conn_cnt, imp->imp_target_uuid.uuid); + imp->imp_conn_cnt, obd2cli_tgt(imp->imp_obd)); /* Don't retry if connect fails */ rc = 0; - obd_set_info(obd->obd_self_export, - strlen("initial_recov"), "initial_recov", - sizeof(rc), &rc); + obd_set_info_async(obd->obd_self_export, + strlen("initial_recov"), "initial_recov", + sizeof(rc), &rc, NULL); } rc = obd_reconnect(imp->imp_obd->obd_self_export, obd, @@ -463,7 +475,7 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, if (aa->pcaa_initial_connect) { if (msg_flags & MSG_CONNECT_REPLAYABLE) { CDEBUG(D_HA, "connected to replayable target: %s\n", - imp->imp_target_uuid.uuid); + obd2cli_tgt(imp->imp_obd)); imp->imp_replayable = 1; } else { imp->imp_replayable = 0; @@ -480,7 +492,7 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, if (!memcmp(&old_hdl, &request->rq_repmsg->handle, sizeof (old_hdl))) { CERROR("%s@%s didn't like our handle "LPX64 - ", failed\n", imp->imp_target_uuid.uuid, + ", failed\n", obd2cli_tgt(imp->imp_obd), imp->imp_connection->c_remote_uuid.uuid, imp->imp_dlm_handle.cookie); GOTO(out, rc = -ENOTCONN); @@ -490,14 +502,14 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, sizeof(imp->imp_remote_handle))) { CERROR("%s@%s changed handle from "LPX64" to "LPX64 "; copying, but this may foreshadow disaster\n", - imp->imp_target_uuid.uuid, + obd2cli_tgt(imp->imp_obd), imp->imp_connection->c_remote_uuid.uuid, imp->imp_remote_handle.cookie, request->rq_repmsg->handle.cookie); imp->imp_remote_handle = request->rq_repmsg->handle; } else { CDEBUG(D_HA, 
"reconnected to %s@%s after partition\n", - imp->imp_target_uuid.uuid, + obd2cli_tgt(imp->imp_obd), imp->imp_connection->c_remote_uuid.uuid); } @@ -506,7 +518,7 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, } else if (MSG_CONNECT_RECOVERING & msg_flags) { CDEBUG(D_HA, "%s: reconnected to %s during replay\n", imp->imp_obd->obd_name, - imp->imp_target_uuid.uuid); + obd2cli_tgt(imp->imp_obd)); imp->imp_resend_replay = 1; IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); } else { @@ -533,7 +545,7 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, " was previously committed, server now claims "LPD64 ")! See https://bugzilla.clusterfs.com/" "long_list.cgi?buglist=9646\n", - imp->imp_target_uuid.uuid, aa->pcaa_peer_committed, + obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed, request->rq_repmsg->last_committed); } @@ -543,7 +555,7 @@ finish: if (rc == -ENOTCONN) { CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery;" "invalidating and reconnecting\n", - imp->imp_target_uuid.uuid, + obd2cli_tgt(imp->imp_obd), imp->imp_connection->c_remote_uuid.uuid); ptlrpc_connect_import(imp, NULL); RETURN(0); @@ -594,7 +606,7 @@ finish: CWARN("Server %s version (%d.%d.%d.%d) is much newer. " "Consider %s (%s).\n", - imp->imp_target_uuid.uuid, + obd2cli_tgt(imp->imp_obd), OBD_OCD_VERSION_MAJOR(ocd->ocd_version), OBD_OCD_VERSION_MINOR(ocd->ocd_version), OBD_OCD_VERSION_PATCH(ocd->ocd_version), @@ -639,7 +651,7 @@ finish: "refused connection from this client " "as too old version (%s). 
Client must " "be recompiled\n", - imp->imp_target_uuid.uuid, + obd2cli_tgt(imp->imp_obd), OBD_OCD_VERSION_MAJOR(ocd->ocd_version), OBD_OCD_VERSION_MINOR(ocd->ocd_version), OBD_OCD_VERSION_PATCH(ocd->ocd_version), @@ -654,7 +666,7 @@ finish: ptlrpc_maybe_ping_import_soon(imp); CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n", - imp->imp_target_uuid.uuid, + obd2cli_tgt(imp->imp_obd), (char *)imp->imp_connection->c_remote_uuid.uuid, rc); } @@ -711,15 +723,10 @@ static int ptlrpc_invalidate_import_thread(void *data) ENTRY; - lock_kernel(); - ptlrpc_daemonize(); - - cfs_block_allsigs(); - THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "ll_imp_inval"); - unlock_kernel(); - + ptlrpc_daemonize("ll_imp_inval"); + CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n", - imp->imp_obd->obd_name, imp->imp_target_uuid.uuid, + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), imp->imp_connection->c_remote_uuid.uuid); ptlrpc_invalidate_import(imp); @@ -740,13 +747,13 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) ENTRY; if (imp->imp_state == LUSTRE_IMP_EVICTED) { - deuuidify(imp->imp_target_uuid.uuid, NULL, + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, &target_start, &target_len); LCONSOLE_ERROR("This client was evicted by %.*s; in progress " "operations using this service will fail.\n", target_len, target_start); CDEBUG(D_HA, "evicted from %s@%s; invalidating\n", - imp->imp_target_uuid.uuid, + obd2cli_tgt(imp->imp_obd), imp->imp_connection->c_remote_uuid.uuid); #ifdef __KERNEL__ @@ -766,7 +773,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) if (imp->imp_state == LUSTRE_IMP_REPLAY) { CDEBUG(D_HA, "replay requested by %s\n", - imp->imp_target_uuid.uuid); + obd2cli_tgt(imp->imp_obd)); rc = ptlrpc_replay_next(imp, &inflight); if (inflight == 0 && atomic_read(&imp->imp_replay_inflight) == 0) { @@ -796,7 +803,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) if (imp->imp_state == LUSTRE_IMP_RECOVER) { 
CDEBUG(D_HA, "reconnected to %s@%s\n", - imp->imp_target_uuid.uuid, + obd2cli_tgt(imp->imp_obd), imp->imp_connection->c_remote_uuid.uuid); rc = ptlrpc_resend(imp); @@ -805,7 +812,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); ptlrpc_activate_import(imp); - deuuidify(imp->imp_target_uuid.uuid, NULL, + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, &target_start, &target_len); LCONSOLE_INFO("%s: Connection restored to service %.*s " "using nid %s.\n", imp->imp_obd->obd_name, @@ -840,7 +847,7 @@ int ptlrpc_disconnect_import(struct obd_import *imp) case MDS_CONNECT: rq_opc = MDS_DISCONNECT; break; default: CERROR("don't know how to disconnect from %s (connect_op %d)\n", - imp->imp_target_uuid.uuid, imp->imp_connect_op); + obd2cli_tgt(imp->imp_obd), imp->imp_connect_op); RETURN(-EINVAL); } diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 3e9d76c..b05c5a3 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -479,13 +479,17 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) request->rq_reply_portal); } - ptlrpc_request_addref(request); /* +1 ref for the SENT callback */ + /* add references on request and import for request_out_callback */ + ptlrpc_request_addref(request); + atomic_inc(&request->rq_import->imp_inflight); + + OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5); request->rq_sent = CURRENT_SECONDS; ptlrpc_pinger_sending_on_import(request->rq_import); - rc = ptl_send_buf(&request->rq_req_md_h, + rc = ptl_send_buf(&request->rq_req_md_h, request->rq_reqmsg, request->rq_reqlen, - LNET_NOACK_REQ, &request->rq_req_cbid, + LNET_NOACK_REQ, &request->rq_req_cbid, connection, request->rq_request_portal, request->rq_xid); @@ -494,7 +498,9 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) RETURN(rc); } - ptlrpc_req_finished (request); /* drop callback ref */ + /* drop request_out_callback refs, we couldn't start the send */ + 
atomic_dec(&request->rq_import->imp_inflight); + ptlrpc_req_finished (request); if (noreply) RETURN(rc); diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index 820fb0b..ba80326 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -1783,8 +1783,8 @@ void lustre_assert_wire_constants(void) (long long)FMODE_READ); LASSERTF(FMODE_WRITE == 2, " found %lld\n", (long long)FMODE_WRITE); - LASSERTF(FMODE_EXEC == 4, " found %lld\n", - (long long)FMODE_EXEC); + LASSERTF(MDS_FMODE_EXEC == 4, " found %lld\n", + (long long)MDS_FMODE_EXEC); CLASSERT(MDS_OPEN_CREAT == 00000100); CLASSERT(MDS_OPEN_EXCL == 00000200); CLASSERT(MDS_OPEN_TRUNC == 00001000); diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c index 33a18ce..db5eb7c 100644 --- a/lustre/ptlrpc/pinger.c +++ b/lustre/ptlrpc/pinger.c @@ -49,14 +49,14 @@ int ptlrpc_ping(struct obd_import *imp) if (req) { DEBUG_REQ(D_INFO, req, "pinging %s->%s", imp->imp_obd->obd_uuid.uuid, - imp->imp_target_uuid.uuid); + obd2cli_tgt(imp->imp_obd)); req->rq_no_resend = req->rq_no_delay = 1; req->rq_replen = lustre_msg_size(0, NULL); ptlrpcd_add_req(req); } else { CERROR("OOM trying to ping %s->%s\n", imp->imp_obd->obd_uuid.uuid, - imp->imp_target_uuid.uuid); + obd2cli_tgt(imp->imp_obd)); rc = -ENOMEM; } @@ -82,16 +82,7 @@ static int ptlrpc_pinger_main(void *arg) struct ptlrpc_thread *thread = data->thread; ENTRY; - lock_kernel(); - ptlrpc_daemonize(); - - cfs_block_allsigs(); - - LASSERTF(strlen(data->name) < CFS_CURPROC_COMM_MAX, - "name %d > len %d\n", - (int)strlen(data->name), CFS_CURPROC_COMM_MAX); - THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "%s", data->name); - unlock_kernel(); + cfs_daemonize(data->name); /* Record that the thread is running */ thread->t_flags = SVC_RUNNING; @@ -140,7 +131,7 @@ static int ptlrpc_pinger_main(void *arg) CDEBUG(D_HA, "not pinging %s " "(in recovery: %s or recovery " "disabled: %u/%u)\n", - imp->imp_target_uuid.uuid, + 
obd2cli_tgt(imp->imp_obd), ptlrpc_import_state_name(level), imp->imp_deactive, imp->imp_obd->obd_no_recov); @@ -153,7 +144,7 @@ static int ptlrpc_pinger_main(void *arg) CDEBUG(D_INFO, "don't need to ping %s ("CFS_TIME_T " > "CFS_TIME_T")\n", - imp->imp_target_uuid.uuid, + obd2cli_tgt(imp->imp_obd), imp->imp_next_ping, this_ping); } @@ -169,7 +160,7 @@ static int ptlrpc_pinger_main(void *arg) time_to_next_ping = cfs_time_sub(cfs_time_add(this_ping, cfs_time_seconds(PING_INTERVAL)), cfs_time_current()); - + /* The ping sent by ptlrpc_send_rpc may get sent out say .01 second after this. ptlrpc_pinger_eending_on_import will then set the @@ -278,7 +269,7 @@ int ptlrpc_pinger_add_import(struct obd_import *imp) mutex_down(&pinger_sem); CDEBUG(D_HA, "adding pingable import %s->%s\n", - imp->imp_obd->obd_uuid.uuid, imp->imp_target_uuid.uuid); + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); ptlrpc_update_next_ping(imp); /* XXX sort, blah blah */ list_add_tail(&imp->imp_pinger_chain, &pinger_imports); @@ -299,7 +290,7 @@ int ptlrpc_pinger_del_import(struct obd_import *imp) mutex_down(&pinger_sem); list_del_init(&imp->imp_pinger_chain); CDEBUG(D_HA, "removing pingable import %s->%s\n", - imp->imp_obd->obd_uuid.uuid, imp->imp_target_uuid.uuid); + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); class_import_put(imp); mutex_up(&pinger_sem); RETURN(0); @@ -313,6 +304,137 @@ void ptlrpc_pinger_wake_up() #endif } +/* Ping evictor thread */ +#define PET_READY 1 +#define PET_TERMINATE 2 + +static int pet_refcount = 0; +static int pet_state; +static wait_queue_head_t pet_waitq; +static struct obd_export *pet_exp = NULL; +static spinlock_t pet_lock = SPIN_LOCK_UNLOCKED; + +int ping_evictor_wake(struct obd_export *exp) +{ + spin_lock(&pet_lock); + if (pet_exp) { + /* eventually the new obd will call here again. */ + spin_unlock(&pet_lock); + return 1; + } + + /* We have to make sure the obd isn't destroyed between now and when + * the ping evictor runs. 
We'll take a reference here, and drop it + * when we finish in the evictor. We don't really care about this + * export in particular; we just need one to keep the obd alive. */ + pet_exp = class_export_get(exp); + spin_unlock(&pet_lock); + + wake_up(&pet_waitq); + return 0; +} + +static int ping_evictor_main(void *arg) +{ + struct obd_device *obd; + struct obd_export *exp; + struct l_wait_info lwi = { 0 }; + time_t expire_time; + ENTRY; + + ptlrpc_daemonize("ping_evictor"); + + CDEBUG(D_HA, "Starting Ping Evictor\n"); + pet_exp = NULL; + pet_state = PET_READY; + while (1) { + l_wait_event(pet_waitq, pet_exp || + (pet_state == PET_TERMINATE), &lwi); + if (pet_state == PET_TERMINATE) + break; + + /* we only get here if pet_exp != NULL, and the end of this + * loop is the only place which sets it NULL again, so lock + * is not strictly necessary. */ + spin_lock(&pet_lock); + obd = pet_exp->exp_obd; + spin_unlock(&pet_lock); + + expire_time = CURRENT_SECONDS - (3 * obd_timeout / 2); + + CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n", + obd->obd_name, expire_time); + + /* Exports can't be deleted out of the list while we hold + * the obd lock (class_unlink_export), which means we can't + * lose the last ref on the export. If they've already been + * removed from the list, we won't find them here. */ + spin_lock(&obd->obd_dev_lock); + while (!list_empty(&obd->obd_exports_timed)) { + exp = list_entry(obd->obd_exports_timed.next, + struct obd_export,exp_obd_chain_timed); + + if (expire_time > exp->exp_last_request_time) { + class_export_get(exp); + spin_unlock(&obd->obd_dev_lock); + LCONSOLE_WARN("%s: haven't heard from %s in %ld" + " seconds. Last request was at %ld. 
" + "I think it's dead, and I am evicting " + "it.\n", obd->obd_name, + obd_export_nid2str(exp), + (long)(CURRENT_SECONDS - + exp->exp_last_request_time), + exp->exp_last_request_time); + + + class_fail_export(exp); + class_export_put(exp); + + spin_lock(&obd->obd_dev_lock); + } else { + /* List is sorted, so everyone below is ok */ + break; + } + } + spin_unlock(&obd->obd_dev_lock); + + class_export_put(pet_exp); + + spin_lock(&pet_lock); + pet_exp = NULL; + spin_unlock(&pet_lock); + } + CDEBUG(D_HA, "Exiting Ping Evictor\n"); + + RETURN(0); +} + +void ping_evictor_start(void) +{ + int rc; + + if (++pet_refcount > 1) + return; + + init_waitqueue_head(&pet_waitq); + + rc = kernel_thread(ping_evictor_main, NULL, CLONE_VM | CLONE_FS); + if (rc < 0) { + pet_refcount--; + CERROR("Cannot start ping evictor thread: %d\n", rc); + } +} +EXPORT_SYMBOL(ping_evictor_start); + +void ping_evictor_stop(void) +{ + if (--pet_refcount > 0) + return; + + pet_state = PET_TERMINATE; + wake_up(&pet_waitq); +} +EXPORT_SYMBOL(ping_evictor_stop); #else /* !__KERNEL__ */ /* XXX @@ -382,7 +504,7 @@ static int pinger_check_rpcs(void *arg) if (level != LUSTRE_IMP_FULL) { CDEBUG(D_HA, "not pinging %s (in recovery)\n", - imp->imp_target_uuid.uuid); + obd2cli_tgt(imp->imp_obd)); continue; } @@ -400,7 +522,7 @@ static int pinger_check_rpcs(void *arg) ptlrpc_set_add_req(set, req); } else { CDEBUG(D_HA, "don't need to ping %s ("CFS_TIME_T" > " - CFS_TIME_T")\n", imp->imp_target_uuid.uuid, + CFS_TIME_T")\n", obd2cli_tgt(imp->imp_obd), imp->imp_next_ping, pd->pd_this_ping); } } @@ -417,7 +539,7 @@ static int pinger_check_rpcs(void *arg) rq_set_chain); DEBUG_REQ(D_HA, req, "pinging %s->%s", req->rq_import->imp_obd->obd_uuid.uuid, - req->rq_import->imp_target_uuid.uuid); + obd2cli_tgt(req->rq_import->imp_obd)); (void)ptl_send_rpc(req, 0); } @@ -512,7 +634,7 @@ int ptlrpc_pinger_add_import(struct obd_import *imp) RETURN(-EALREADY); CDEBUG(D_HA, "adding pingable import %s->%s\n", - 
imp->imp_obd->obd_uuid.uuid, imp->imp_target_uuid.uuid); + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); ptlrpc_pinger_sending_on_import(imp); mutex_down(&pinger_sem); @@ -532,7 +654,7 @@ int ptlrpc_pinger_del_import(struct obd_import *imp) mutex_down(&pinger_sem); list_del_init(&imp->imp_pinger_chain); CDEBUG(D_HA, "removing pingable import %s->%s\n", - imp->imp_obd->obd_uuid.uuid, imp->imp_target_uuid.uuid); + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); class_import_put(imp); mutex_up(&pinger_sem); RETURN(0); diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h index 2cd63ab..e45106d 100644 --- a/lustre/ptlrpc/ptlrpc_internal.h +++ b/lustre/ptlrpc/ptlrpc_internal.h @@ -38,7 +38,7 @@ struct ptlrpc_request_set; void ptlrpc_request_handle_notconn(struct ptlrpc_request *); void lustre_assert_wire_constants(void); int ptlrpc_import_in_recovery(struct obd_import *imp); -int ptlrpc_set_import_discon(struct obd_import *imp); +int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt); void ptlrpc_handle_failed_import(struct obd_import *imp); int ptlrpc_replay_next(struct obd_import *imp, int *inflight); void ptlrpc_initiate_recovery(struct obd_import *imp); @@ -56,7 +56,7 @@ void ptlrpc_lprocfs_do_request_stat (struct ptlrpc_request *req, #define ptlrpc_lprocfs_unregister_service(params...) do{}while(0) #define ptlrpc_lprocfs_rpc_sent(params...) do{}while(0) #define ptlrpc_lprocfs_do_request_stat(params...) 
do{}while(0) -#endif /* __KERNEL__ */ +#endif /* LPROCFS */ /* recovd_thread.c */ int llog_init_commit_master(void); @@ -114,5 +114,10 @@ int ptlrpc_stop_pinger(void); void ptlrpc_pinger_sending_on_import(struct obd_import *imp); void ptlrpc_pinger_wake_up(void); void ptlrpc_ping_import_soon(struct obd_import *imp); +#ifdef __KERNEL__ +int ping_evictor_wake(struct obd_export *exp); +#else +#define ping_evictor_wake(exp) 1 +#endif #endif /* PTLRPC_INTERNAL_H */ diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c index ae50b2f..a3df637 100644 --- a/lustre/ptlrpc/ptlrpc_module.c +++ b/lustre/ptlrpc/ptlrpc_module.c @@ -70,7 +70,6 @@ __init int ptlrpc_init(void) cleanup_phase = 2; ptlrpc_put_connection_superhack = ptlrpc_put_connection; - ptlrpc_abort_inflight_superhack = ptlrpc_abort_inflight; rc = ptlrpc_start_pinger(); if (rc) diff --git a/lustre/ptlrpc/ptlrpcd.c b/lustre/ptlrpc/ptlrpcd.c index 603cd6b..fa315eb 100644 --- a/lustre/ptlrpc/ptlrpcd.c +++ b/lustre/ptlrpc/ptlrpcd.c @@ -145,7 +145,6 @@ static int ptlrpcd(void *arg) ENTRY; cfs_daemonize(pc->pc_name); - cfs_block_allsigs(); complete(&pc->pc_starting); diff --git a/lustre/ptlrpc/recov_thread.c b/lustre/ptlrpc/recov_thread.c index 2a1164c..8ad20c6 100644 --- a/lustre/ptlrpc/recov_thread.c +++ b/lustre/ptlrpc/recov_thread.c @@ -227,17 +227,13 @@ static int log_commit_thread(void *arg) if (lcd == NULL) RETURN(-ENOMEM); - lock_kernel(); - ptlrpc_daemonize(); /* thread never needs to do IO */ - - cfs_block_allsigs(); - spin_lock(&lcm->lcm_thread_lock); THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "ll_log_comt_%02d", atomic_read(&lcm->lcm_thread_total)); atomic_inc(&lcm->lcm_thread_total); spin_unlock(&lcm->lcm_thread_lock); - unlock_kernel(); + + ptlrpc_daemonize(cfs_curproc_comm()); /* thread never needs to do IO */ CFS_INIT_LIST_HEAD(&lcd->lcd_lcm_list); CFS_INIT_LIST_HEAD(&lcd->lcd_llcd_list); @@ -344,7 +340,8 @@ static int log_commit_thread(void *arg) } 
mutex_up(&llcd->llcd_ctxt->loc_sem); - if (!import || (import == LP_POISON)) { + if (!import || (import == LP_POISON) || + (import->imp_client == LP_POISON)) { CERROR("No import %p (llcd=%p, ctxt=%p)\n", import, llcd, llcd->llcd_ctxt); llcd_put(llcd); @@ -501,12 +498,7 @@ static int log_process_thread(void *args) ENTRY; mutex_up(&data->llpa_sem); - lock_kernel(); - ptlrpc_daemonize(); /* thread does IO to log files */ - THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "llog_process"); - - cfs_block_allsigs(); - unlock_kernel(); + ptlrpc_daemonize("llog_process"); /* thread does IO to log files */ rc = llog_create(ctxt, &llh, &logid, NULL); if (rc) { diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c index 5f6edfa..4d41dc0 100644 --- a/lustre/ptlrpc/recover.c +++ b/lustre/ptlrpc/recover.c @@ -94,7 +94,7 @@ void ptlrpc_run_failed_import_upcall(struct obd_import* imp) argv[0] = obd_lustre_upcall; argv[1] = "FAILED_IMPORT"; - argv[2] = imp->imp_target_uuid.uuid; + argv[2] = obd2cli_tgt(imp->imp_obd); argv[3] = imp->imp_obd->obd_name; argv[4] = imp->imp_connection->c_remote_uuid.uuid; argv[5] = imp->imp_obd->obd_uuid.uuid; @@ -132,14 +132,14 @@ void ptlrpc_initiate_recovery(struct obd_import *imp) if (strcmp(obd_lustre_upcall, "DEFAULT") == 0) { CDEBUG(D_HA, "%s: starting recovery without upcall\n", - imp->imp_target_uuid.uuid); + obd2cli_tgt(imp->imp_obd)); ptlrpc_connect_import(imp, NULL); } else if (strcmp(obd_lustre_upcall, "NONE") == 0) { CDEBUG(D_HA, "%s: recovery disabled\n", - imp->imp_target_uuid.uuid); + obd2cli_tgt(imp->imp_obd)); } else { CDEBUG(D_HA, "%s: calling upcall to start recovery\n", - imp->imp_target_uuid.uuid); + obd2cli_tgt(imp->imp_obd)); ptlrpc_run_failed_import_upcall(imp); } @@ -161,13 +161,14 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight) * get rid of them now. 
*/ spin_lock_irqsave(&imp->imp_lock, flags); + imp->imp_last_transno_checked = 0; ptlrpc_free_committed(imp); last_transno = imp->imp_last_replay_transno; spin_unlock_irqrestore(&imp->imp_lock, flags); CDEBUG(D_HA, "import %p from %s committed "LPU64" last "LPU64"\n", - imp, imp->imp_target_uuid.uuid, imp->imp_peer_committed_transno, - last_transno); + imp, obd2cli_tgt(imp->imp_obd), + imp->imp_peer_committed_transno, last_transno); /* Do I need to hold a lock across this iteration? We shouldn't be * racing with any additions to the list, because we're in recovery @@ -273,15 +274,14 @@ void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) ENTRY; CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n", - imp->imp_obd->obd_name, - imp->imp_target_uuid.uuid, + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), imp->imp_connection->c_remote_uuid.uuid); - if (ptlrpc_set_import_discon(imp)) { + if (ptlrpc_set_import_discon(imp, failed_req->rq_reqmsg->conn_cnt)) { if (!imp->imp_replayable) { CDEBUG(D_HA, "import %s@%s for %s not replayable, " "auto-deactivating\n", - imp->imp_target_uuid.uuid, + obd2cli_tgt(imp->imp_obd), imp->imp_connection->c_remote_uuid.uuid, imp->imp_obd->obd_name); ptlrpc_deactivate_import(imp); @@ -317,7 +317,7 @@ int ptlrpc_set_import_active(struct obd_import *imp, int active) * requests. */ if (!active) { CWARN("setting import %s INACTIVE by administrator request\n", - imp->imp_target_uuid.uuid); + obd2cli_tgt(imp->imp_obd)); ptlrpc_invalidate_import(imp); imp->imp_deactive = 1; } @@ -326,7 +326,7 @@ int ptlrpc_set_import_active(struct obd_import *imp, int active) if (active) { imp->imp_deactive = 0; CDEBUG(D_HA, "setting import %s VALID\n", - imp->imp_target_uuid.uuid); + obd2cli_tgt(imp->imp_obd)); rc = ptlrpc_recover_import(imp, NULL); } @@ -339,7 +339,7 @@ int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid) ENTRY; /* force import to be disconnected. 
*/ - ptlrpc_set_import_discon(imp); + ptlrpc_set_import_discon(imp, 0); imp->imp_deactive = 0; rc = ptlrpc_recover_import_no_retry(imp, new_uuid); @@ -383,14 +383,14 @@ static int ptlrpc_recover_import_no_retry(struct obd_import *imp, RETURN(rc); CDEBUG(D_HA, "%s: recovery started, waiting\n", - imp->imp_target_uuid.uuid); + obd2cli_tgt(imp->imp_obd)); lwi = LWI_TIMEOUT(cfs_timeout_cap(cfs_time_seconds(obd_timeout)), NULL, NULL); rc = l_wait_event(imp->imp_recovery_waitq, !ptlrpc_import_in_recovery(imp), &lwi); CDEBUG(D_HA, "%s: recovery finished\n", - imp->imp_target_uuid.uuid); + obd2cli_tgt(imp->imp_obd)); RETURN(rc); } diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index e258b20..631f096 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -203,13 +203,6 @@ ptlrpc_commit_replies (struct obd_device *obd) spin_unlock_irqrestore (&obd->obd_uncommitted_replies_lock, flags); } -static long -timeval_sub(struct timeval *large, struct timeval *small) -{ - return (large->tv_sec - small->tv_sec) * 1000000 + - (large->tv_usec - small->tv_usec); -} - static int ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc) { @@ -430,6 +423,85 @@ ptlrpc_server_free_request(struct ptlrpc_request *req) } +/* This function makes sure dead exports are evicted in a timely manner. + This function is only called when some export receives a message (i.e., + the network is up.) */ +static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) +{ + struct obd_export *oldest_exp; + time_t oldest_time; + + ENTRY; + + LASSERT(exp); + + /* Compensate for slow machines, etc, by faking our request time + into the future. Although this can break the strict time-ordering + of the list, we can be really lazy here - we don't have to evict + at the exact right moment. Eventually, all silent exports + will make it to the top of the list. 
*/ + exp->exp_last_request_time = max(exp->exp_last_request_time, + (time_t)CURRENT_SECONDS + extra_delay); + + CDEBUG(D_INFO, "updating export %s at %ld\n", + exp->exp_client_uuid.uuid, + exp->exp_last_request_time); + + /* exports may get disconnected from the chain even though the + export has references, so we must keep the spin lock while + manipulating the lists */ + spin_lock(&exp->exp_obd->obd_dev_lock); + + if (list_empty(&exp->exp_obd_chain_timed)) { + /* this one is not timed */ + spin_unlock(&exp->exp_obd->obd_dev_lock); + EXIT; + return; + } + + list_move_tail(&exp->exp_obd_chain_timed, + &exp->exp_obd->obd_exports_timed); + + oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next, + struct obd_export, exp_obd_chain_timed); + oldest_time = oldest_exp->exp_last_request_time; + spin_unlock(&exp->exp_obd->obd_dev_lock); + + if (exp->exp_obd->obd_recovering) { + /* be nice to everyone during recovery */ + EXIT; + return; + } + + /* Note - racing to start/reset the obd_eviction timer is safe */ + if (exp->exp_obd->obd_eviction_timer == 0) { + /* Check if the oldest entry is expired. */ + if (CURRENT_SECONDS > (oldest_time + + (3 * obd_timeout / 2) + extra_delay)) { + /* We need a second timer, in case the net was down and + * it just came back. Since the pinger may skip every + * other PING_INTERVAL (see note in ptlrpc_pinger_main), + * we better wait for 3. */ + exp->exp_obd->obd_eviction_timer = CURRENT_SECONDS + + 3 * PING_INTERVAL; + CDEBUG(D_HA, "%s: Think about evicting %s from %ld\n", + exp->exp_obd->obd_name, obd_export_nid2str(exp), + oldest_time); + } + } else { + if (CURRENT_SECONDS > (exp->exp_obd->obd_eviction_timer + + extra_delay)) { + /* The evictor won't evict anyone who we've heard from + * recently, so we don't have to check before we start + * it. 
*/ + if (!ping_evictor_wake(exp)) + exp->exp_obd->obd_eviction_timer = 0; + } + } + + EXIT; +} + static int ptlrpc_server_handle_request(struct ptlrpc_service *svc, struct ptlrpc_thread *thread) @@ -464,7 +536,7 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, spin_unlock_irqrestore (&svc->srv_lock, flags); do_gettimeofday(&work_start); - timediff = timeval_sub(&work_start, &request->rq_arrival_time); + timediff = cfs_timeval_sub(&work_start, &request->rq_arrival_time,NULL); if (svc->srv_stats != NULL) { lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR, timediff); @@ -519,8 +591,7 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, goto put_conn; } - class_update_export_timer(request->rq_export, - (time_t)(timediff / 500000)); + ptlrpc_update_export_timer(request->rq_export, timediff/500000); } /* Discard requests queued for longer than my timeout. If the @@ -567,15 +638,15 @@ put_conn: out: do_gettimeofday(&work_end); - timediff = timeval_sub(&work_end, &work_start); + timediff = cfs_timeval_sub(&work_end, &work_start, NULL); if (timediff / 1000000 > (long)obd_timeout) CERROR("request "LPU64" opc %u from %s processed in %lds " "trans "LPU64" rc %d/%d\n", request->rq_xid, request->rq_reqmsg->opc, libcfs_id2str(request->rq_peer), - timeval_sub(&work_end, - &request->rq_arrival_time) / 1000000, + cfs_timeval_sub(&work_end, &request->rq_arrival_time, + NULL) / 1000000, request->rq_repmsg ? request->rq_repmsg->transno : request->rq_transno, request->rq_status, request->rq_repmsg ? request->rq_repmsg->status : -999); @@ -584,7 +655,8 @@ put_conn: "%ldus (%ldus total) trans "LPU64" rc %d/%d\n", request->rq_xid, request->rq_reqmsg->opc, libcfs_id2str(request->rq_peer), timediff, - timeval_sub(&work_end, &request->rq_arrival_time), + cfs_timeval_sub(&work_end, &request->rq_arrival_time, + NULL), request->rq_transno, request->rq_status, request->rq_repmsg ? 
request->rq_repmsg->status : -999); @@ -742,16 +814,15 @@ liblustre_check_services (void *arg) #else /* __KERNEL__ */ /* Don't use daemonize, it removes fs struct from new thread (bug 418) */ -void ptlrpc_daemonize(void) +void ptlrpc_daemonize(char *name) { - exit_mm(cfs_current()); - lustre_daemonize_helper(); -#if LINUX_ - /* XXX Liang: */ + struct fs_struct *fs = current->fs; + + atomic_inc(&fs->count); + cfs_daemonize(name); + exit_fs(cfs_current()); + current->fs = fs; set_fs_pwd(current->fs, init_task.fs->pwdmnt, init_task.fs->pwd); -#endif - exit_files(cfs_current()); - reparent_to_init(); } static void @@ -796,16 +867,7 @@ static int ptlrpc_main(void *arg) int rc = 0; ENTRY; - lock_kernel(); - ptlrpc_daemonize(); - - cfs_block_allsigs(); - - LASSERTF(strlen(data->name) < CFS_CURPROC_COMM_MAX, - "name %d > len %d\n", - (int)strlen(data->name), CFS_CURPROC_COMM_MAX); - THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "%s", data->name); - unlock_kernel(); + ptlrpc_daemonize(data->name); #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) && CONFIG_NUMA /* we need to do this before any per-thread allocation is done so that @@ -1189,7 +1251,7 @@ int ptlrpc_service_health_check(struct ptlrpc_service *svc) struct ptlrpc_request, rq_list); do_gettimeofday(&right_now); - timediff = timeval_sub(&right_now, &request->rq_arrival_time); + timediff = cfs_timeval_sub(&right_now, &request->rq_arrival_time, NULL); cutoff = obd_health_check_timeout; diff --git a/lustre/quota/quota_check.c b/lustre/quota/quota_check.c index e47bd39..ea4f574 100644 --- a/lustre/quota/quota_check.c +++ b/lustre/quota/quota_check.c @@ -70,7 +70,6 @@ static int target_quotacheck_callback(struct obd_export *exp, static int target_quotacheck_thread(void *data) { - unsigned long flags; struct quotacheck_thread_args *qta = data; struct obd_export *exp; struct obd_device *obd; @@ -78,17 +77,7 @@ static int target_quotacheck_thread(void *data) struct lvfs_run_ctxt saved; int rc; - lock_kernel(); - 
ptlrpc_daemonize(); - - SIGNAL_MASK_LOCK(current, flags); - sigfillset(&current->blocked); - RECALC_SIGPENDING; - SIGNAL_MASK_UNLOCK(current, flags); - - THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "%s", - "quotacheck"); - unlock_kernel(); + ptlrpc_daemonize("quotacheck"); exp = qta->qta_exp; obd = exp->exp_obd; @@ -211,7 +200,7 @@ int client_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk) if (rc == CL_NOT_QUOTACHECKED) rc = -EINTR; - qchk->obd_uuid = cli->cl_import->imp_target_uuid; + qchk->obd_uuid = cli->cl_target_uuid; if (strncmp(exp->exp_obd->obd_type->typ_name, LUSTRE_OSC_NAME, strlen(LUSTRE_OSC_NAME))) memcpy(qchk->obd_type, LUSTRE_FILTER_NAME, diff --git a/lustre/quota/quota_context.c b/lustre/quota/quota_context.c index 8d8a4c5..013eead1 100644 --- a/lustre/quota/quota_context.c +++ b/lustre/quota/quota_context.c @@ -685,20 +685,11 @@ static int qslave_recovery_main(void *arg) struct qslave_recov_thread_data *data = arg; struct obd_device *obd = data->obd; struct lustre_quota_ctxt *qctxt = data->qctxt; - unsigned long flags; unsigned int type; int rc = 0; ENTRY; - lock_kernel(); - ptlrpc_daemonize(); - - SIGNAL_MASK_LOCK(current, flags); - sigfillset(&current->blocked); - RECALC_SIGPENDING; - SIGNAL_MASK_UNLOCK(current, flags); - THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "%s", "qslave_recovd"); - unlock_kernel(); + ptlrpc_daemonize("qslave_recovd"); complete(&data->comp); diff --git a/lustre/quota/quota_ctl.c b/lustre/quota/quota_ctl.c index 9a4b5d0..a8c4317 100644 --- a/lustre/quota/quota_ctl.c +++ b/lustre/quota/quota_ctl.c @@ -91,6 +91,7 @@ int mds_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) int filter_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) { struct obd_device *obd = exp->exp_obd; + struct obd_device_target *obt = &obd->u.obt; struct lvfs_run_ctxt saved; int rc = 0; ENTRY; @@ -98,6 +99,12 @@ int filter_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) switch 
(oqctl->qc_cmd) { case Q_QUOTAON: case Q_QUOTAOFF: + if (!atomic_dec_and_test(&obt->obt_quotachecking)) { + CDEBUG(D_INFO, "other people are doing quotacheck\n"); + atomic_inc(&obt->obt_quotachecking); + rc = -EBUSY; + break; + } case Q_GETOINFO: case Q_GETOQUOTA: case Q_GETQUOTA: @@ -113,6 +120,9 @@ int filter_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl); pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + + if (oqctl->qc_cmd == Q_QUOTAON || oqctl->qc_cmd == Q_QUOTAOFF) + atomic_inc(&obt->obt_quotachecking); break; case Q_INITQUOTA: { diff --git a/lustre/quota/quota_interface.c b/lustre/quota/quota_interface.c index 92f39b3..3531a57 100644 --- a/lustre/quota/quota_interface.c +++ b/lustre/quota/quota_interface.c @@ -603,6 +603,8 @@ int osc_quota_exit(void) rc = cfs_mem_cache_destroy(qinfo_cachep); LASSERTF(rc == 0, "couldn't destory qinfo_cachep slab\n"); + qinfo_cachep = NULL; + RETURN(0); } diff --git a/lustre/quota/quota_master.c b/lustre/quota/quota_master.c index 4d50dc3..7332669 100644 --- a/lustre/quota/quota_master.c +++ b/lustre/quota/quota_master.c @@ -396,10 +396,10 @@ int init_admin_quotafiles(struct obd_device *obd, struct obd_quotactl *oqctl) /* lookup quota file */ rc = 0; - down(&iparent->i_sem); + LOCK_INODE_MUTEX(iparent); de = lookup_one_len(quotafiles[i], dparent, strlen(quotafiles[i])); - up(&iparent->i_sem); + UNLOCK_INODE_MUTEX(iparent); if (IS_ERR(de) || de->d_inode == NULL || !S_ISREG(de->d_inode->i_mode)) rc = IS_ERR(de) ? 
PTR_ERR(de) : -ENOENT; @@ -528,10 +528,17 @@ static int mds_admin_quota_off(struct obd_device *obd, int mds_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl) { struct mds_obd *mds = &obd->u.mds; + struct obd_device_target *obt = &obd->u.obt; struct lvfs_run_ctxt saved; int rc; ENTRY; + if (!atomic_dec_and_test(&obt->obt_quotachecking)) { + CDEBUG(D_INFO, "other people are doing quotacheck\n"); + atomic_inc(&obt->obt_quotachecking); + RETURN(-EBUSY); + } + down(&mds->mds_qonoff_sem); push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); rc = mds_admin_quota_on(obd, oqctl); @@ -546,16 +553,24 @@ int mds_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl) out: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); up(&mds->mds_qonoff_sem); + atomic_inc(&obt->obt_quotachecking); RETURN(rc); } int mds_quota_off(struct obd_device *obd, struct obd_quotactl *oqctl) { struct mds_obd *mds = &obd->u.mds; + struct obd_device_target *obt = &obd->u.obt; struct lvfs_run_ctxt saved; int rc, rc2; ENTRY; + if (!atomic_dec_and_test(&obt->obt_quotachecking)) { + CDEBUG(D_INFO, "other people are doing quotacheck\n"); + atomic_inc(&obt->obt_quotachecking); + RETURN(-EBUSY); + } + down(&mds->mds_qonoff_sem); /* close admin quota files */ push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); @@ -566,6 +581,8 @@ int mds_quota_off(struct obd_device *obd, struct obd_quotactl *oqctl) pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); up(&mds->mds_qonoff_sem); + atomic_inc(&obt->obt_quotachecking); + RETURN(rc ?: rc2); } @@ -1018,21 +1035,11 @@ static int qmaster_recovery_main(void *arg) { struct qmaster_recov_thread_data *data = arg; struct obd_device *obd = data->obd; - unsigned long flags; int rc = 0; unsigned short type; ENTRY; - lock_kernel(); - ptlrpc_daemonize(); - - SIGNAL_MASK_LOCK(current, flags); - sigfillset(&current->blocked); - RECALC_SIGPENDING; - SIGNAL_MASK_UNLOCK(current, flags); - THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "%s", - "qmaster_recovd"); - unlock_kernel(); +
ptlrpc_daemonize("qmaster_recovd"); complete(&data->comp); diff --git a/lustre/scripts/lustre b/lustre/scripts/lustre index 89edc5b..3b6b640a 100755 --- a/lustre/scripts/lustre +++ b/lustre/scripts/lustre @@ -46,25 +46,28 @@ LOCK=/var/lock/subsys/$SERVICE # Source function library. if [ -f /etc/init.d/functions ] ; then - . /etc/init.d/functions + . /etc/init.d/functions fi # Source networking configuration. if [ -f /etc/sysconfig/network ] ; then - . /etc/sysconfig/network + . /etc/sysconfig/network fi check_start_stop() { - # Check that networking is up. - [ "${NETWORKING}" = "no" ] && exit 0 + # Exit codes now LSB compliant + # Check that networking is up. - exit 'not running' + [ "${NETWORKING}" = "no" ] && exit 7 - [ -x ${LCONF} -a -x ${LCTL} ] || exit 0 + # exit 'not installed' + [ -x ${LCONF} -a -x ${LCTL} ] || exit 5 if [ ${LUSTRE_CONFIG_XML:0:1} = "/" ] ; then - if [ ! -f ${LUSTRE_CONFIG_XML} ] ; then - echo "${0##*/}: Configuration file ${LUSTRE_CONFIG_XML} not found; skipping." - exit 0 - fi + if [ ! -f ${LUSTRE_CONFIG_XML} ] ; then + echo "${0##*/}: Configuration file ${LUSTRE_CONFIG_XML} not found; skipping." + # exit 'not configured' + exit 6 + fi fi # Create /var/lustre directory @@ -77,7 +80,7 @@ check_start_stop() { start() { if [ -x "/usr/sbin/clustat" -a "${SERVICE}" = "lustre" ] ; then - if [ ! -f "/etc/lustre/start-despite-clumanager" ] ; then + if [ ! 
-f "/etc/lustre/start-despite-clumanager" ] ; then cat >&2 < /dev/null`" ] && STATE="running" && RETVAL=0 # check for any configured devices (may indicate partial startup) - [ "`cat /proc/fs/lustre/devices 2> /dev/null`" ] && STATE="partial" && RETVAL=1 + [ "`cat /proc/fs/lustre/devices 2> /dev/null`" ] && STATE="partial" && RETVAL=150 # check for either a server or a client filesystem MDS="`ls /proc/fs/lustre/mds/*/recovery_status 2> /dev/null`" @@ -159,14 +167,17 @@ status() { # check for error in health_check HEALTH="/proc/fs/lustre/health_check" - [ -f "$HEALTH" ] && grep -q "NOT HEALTHY" $HEALTH && STATE="unhealthy" && RETVAL=2 + [ -f "$HEALTH" ] && grep -q "NOT HEALTHY" $HEALTH && STATE="unhealthy" && RETVAL=151 # check for LBUG - [ -f "$HEALTH" ] && grep -q "LBUG" $HEALTH && STATE="LBUG" && RETVAL=3 + [ -f "$HEALTH" ] && grep -q "LBUG" $HEALTH && STATE="LBUG" && RETVAL=152 - # Check if the service really exists - DUMMY=`lctl dl | grep $SERVICE` - [ $? -ne 0 ] && STATE="not_found" && RETVAL=5 + # If Lustre is up , check if the service really exists + # Skip this is we are not checking a specific service + if [ $RETVAL -eq 0 ] && [ $SERVICE != 'lustre' ]; then + DUMMY=`lctl dl | grep $SERVICE` + [ $? 
-ne 0 ] && STATE="not_found" && RETVAL=3 + fi echo $STATE } diff --git a/lustre/tests/cfg/local.sh b/lustre/tests/cfg/local.sh index 66d48f6..c68419b 100644 --- a/lustre/tests/cfg/local.sh +++ b/lustre/tests/cfg/local.sh @@ -19,9 +19,10 @@ PTLDEBUG=${PTLDEBUG:-0x3f0400} SUBSYSTEM=${SUBSYSTEM:- 0xffb7e3ff} PDSH=${PDSH:-no_dsh} -MDSDEV=${MDSDEV:-$ROOT/tmp/mds1-`hostname`} +TMP=${TMP:-/tmp} +MDSDEV=${MDSDEV:-$TMP/mds1-`hostname`} MDSSIZE=${MDSSIZE:-100000} -OSTDEV=${OSTDEV:-$ROOT/tmp/ost1-`hostname`} +OSTDEV=${OSTDEV:-$TMP/ost1-`hostname`} OSTSIZE=${OSTSIZE:-200000} FSTYPE=${FSTYPE:-ext3} TIMEOUT=${TIMEOUT:-20} diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 577aba3..37f33a4 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -92,7 +92,7 @@ setup() { } cleanup() { - umount_client $MOUNT || return 200 + umount_client $MOUNT $FORCE || return 200 stop_mds $FORCE || return 201 stop_ost $FORCE || return 202 # catch case where these return just fine, but modules are still not unloaded @@ -202,11 +202,22 @@ test_5() { # if all the modules have unloaded. umount $MOUNT & UMOUNT_PID=$! - sleep 2 + sleep 6 echo "killing umount" kill -TERM $UMOUNT_PID echo "waiting for umount to finish" wait $UMOUNT_PID + if grep " $MOUNT " /etc/mtab; then + echo "test 5: mtab after failed umount" + umount $MOUNT & + UMOUNT_PID=$! 
+ sleep 2 + echo "killing umount" + kill -TERM $UMOUNT_PID + echo "waiting for umount to finish" + wait $UMOUNT_PID + grep " $MOUNT " /etc/mtab && echo "test 5: mtab after second umount" && return 11 + fi # cleanup client modules $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null @@ -224,8 +235,11 @@ test_5b() { start_ost [ -d $MOUNT ] || mkdir -p $MOUNT + grep " $MOUNT " /etc/mtab && echo "test 5b: mtab before lconf" && return 9 $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null - llmount -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST://mds_svc/client_facet $MOUNT && exit 1 + grep " $MOUNT " /etc/mtab && echo "test 5b: mtab before mount" && return 10 + llmount -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST:/mds_svc/client_facet $MOUNT && return 1 + grep " $MOUNT " /etc/mtab && echo "test 5b: mtab after failed mount" && return 11 # cleanup client modules $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null @@ -245,8 +259,11 @@ test_5c() { start_mds [ -d $MOUNT ] || mkdir -p $MOUNT + grep " $MOUNT " /etc/mtab && echo "test 5c: mtab before lconf" && return 9 $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null - llmount -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST://wrong_mds_svc/client_facet $MOUNT && return 1 + grep " $MOUNT " /etc/mtab && echo "test 5c: mtab before mount" && return 10 + llmount -vv -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST:/wrong_mds_svc/client_facet $MOUNT && return 1 + grep " $MOUNT " /etc/mtab && echo "test 5c: mtab after failed mount" && return 11 # cleanup client modules $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null @@ -266,10 +283,13 @@ test_5d() { stop_ost --force [ -d $MOUNT ] || mkdir -p $MOUNT + grep " $MOUNT " /etc/mtab && echo "test 5d: mtab before lconf" && return 9 $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null - llmount -o nettype=$NETTYPE,$MOUNTOPT `facet_nid mds`://mds_svc/client_facet $MOUNT || return 1 + grep " $MOUNT " /etc/mtab && echo "test 5d: mtab before 
mount" && return 10 + llmount -vv -o nettype=$NETTYPE,$MOUNTOPT `facet_nid mds`:/mds_svc/client_facet $MOUNT || return 1 umount_client $MOUNT || return 2 + grep " $MOUNT " /etc/mtab && echo "test 5d: mtab after unmount" && return 11 stop_mds || return 3 @@ -279,6 +299,26 @@ test_5d() { } run_test 5d "ost down, don't crash during mount attempt" +test_5e() { + start_ost + start_mds + sleep 5 # give MDS a chance to connect to OSTs before delaying requests + +#define OBD_FAIL_PTLRPC_DELAY_SEND 0x506 + do_facet client "sysctl -w lustre.fail_loc=0x80000506" + grep " $MOUNT " /etc/mtab && echo "test 5e: mtab before mount" && return 10 + mount_client $MOUNT || echo "mount failed (not fatal)" + umount_client $MOUNT || return 2 + grep " $MOUNT " /etc/mtab && echo "test 5e: mtab after unmount" && return 11 + + stop_mds || return 3 + stop_ost || return 3 + + lsmod | grep -q lnet && return 4 + return 0 +} +run_test 5e "delayed connect, don't crash (bug 10268)" + test_6() { setup manual_umount_client @@ -324,8 +364,7 @@ test_9() { # check the result of lmc --ptldebug/subsystem start_ost start_mds - mount_client $MOUNT - CHECK_PTLDEBUG="`do_facet mds sysctl lnet.debug | sed -e 's/.* = //'`" + CHECK_PTLDEBUG="`do_facet mds sysctl lnet.debug|cut -d= -f2`" if [ "$CHECK_PTLDEBUG" ] && [ $CHECK_PTLDEBUG -eq 1 ]; then echo "lmc --debug success" else @@ -340,7 +379,6 @@ test_9() { echo "lmc --subsystem: want 2, have $CHECK_SUBSYS" return 1 fi - check_mount || return 41 cleanup || return $? # the new PTLDEBUG/SUBSYSTEM used for lconf --ptldebug/subsystem @@ -364,8 +402,6 @@ test_9() { echo "lconf --subsystem: want 20, have $CHECK_SUBSYS" return 1 fi - mount_client $MOUNT - check_mount || return 41 cleanup || return $? # resume the old configuration @@ -607,16 +643,18 @@ cleanup_15() { } test_15() { - start_ost - start_mds echo "mount lustre on ${MOUNT} with $MOUNTLUSTRE....." 
if [ -f "$MOUNTLUSTRE" ]; then echo "save $MOUNTLUSTRE to $MOUNTLUSTRE.sav" - mv $MOUNTLUSTRE $MOUNTLUSTRE.sav + mv $MOUNTLUSTRE $MOUNTLUSTRE.sav && trap cleanup_15 EXIT INT + if [ -f $MOUNTLUSTRE ]; then + echo "$MOUNTLUSTRE cannot be moved, skipping test" + return 0 + fi fi - [ -f "$MOUNTLUSTRE" ] && echo "can't move $MOUNTLUSTRE" && return 40 - trap cleanup_15 EXIT INT [ ! `cp $(which llmount) $MOUNTLUSTRE` ] || return $? + start_ost + start_mds do_facet client "mkdir -p $MOUNT 2> /dev/null" # load llite module on the client if it isn't in /lib/modules do_facet client "$LCONF --nosetup --node client_facet $XMLCONFIG" @@ -638,7 +676,7 @@ test_15() { run_test 15 "zconf-mount without /sbin/mount.lustre (should return error)" test_16() { - TMPMTPT="/mnt/conf16" + TMPMTPT="${MOUNT%/*}/conf16" if [ ! -f "$MDSDEV" ]; then echo "no $MDSDEV existing, so mount Lustre to create one" @@ -691,7 +729,7 @@ test_16() { run_test 16 "verify that lustre will correct the mode of OBJECTS/LOGS/PENDING" test_17() { - TMPMTPT="/mnt/conf17" + TMPMTPT="${MOUNT%/*}/conf17" if [ ! -f "$MDSDEV" ]; then echo "no $MDSDEV existing, so mount Lustre to create one" diff --git a/lustre/tests/directio.c b/lustre/tests/directio.c index 933c988..fb9c99b 100644 --- a/lustre/tests/directio.c +++ b/lustre/tests/directio.c @@ -53,8 +53,8 @@ int main(int argc, char **argv) return 1; } - if (argc == 6) - st.st_blksize = strtoul(argv[4], 0, 0); + if (argc >= 6) + st.st_blksize = strtoul(argv[5], 0, 0); else if (fstat64(fd, &st) < 0) { printf("Cannot stat %s: %s\n", argv[1], strerror(errno)); return 1; diff --git a/lustre/tests/opendevunlink.c b/lustre/tests/opendevunlink.c index 8250f96..9335eda 100644 --- a/lustre/tests/opendevunlink.c +++ b/lustre/tests/opendevunlink.c @@ -100,10 +100,14 @@ int main(int argc, char **argv) exit(1); } +#if 0 + /* We cannot do this any longer, we do not store open special nodes + * on MDS after unlink */ if (st1.st_mode != st2.st_mode) { // can we do this? 
fprintf(stderr, "fstat different value on %s and %s\n", dname1, dname2); exit(1); } +#endif fprintf(stderr, "Ok, everything goes well.\n"); return 0; diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 25a1bbd..25d613e 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -393,7 +393,7 @@ test_24() { # bug 2248 - eviction fails writeback but app doesn't see it } run_test 24 "fsync error (should return error)" -test_26() { # bug 5921 - evict dead exports +test_26() { # bug 5921 - evict dead exports by pinger # this test can only run from a client on a separate node. [ "`lsmod | grep obdfilter`" ] && \ echo "skipping test 26 (local OST)" && return @@ -419,6 +419,28 @@ test_26() { # bug 5921 - evict dead exports } run_test 26 "evict dead exports" +test_26b() { # bug 10140 - evict dead exports by pinger + zconf_mount `hostname` $MOUNT2 + MDS_FILE=/proc/fs/lustre/mds/mds_svc/num_exports + MDS_NEXP1="`do_facet mds cat $MDS_FILE | cut -d' ' -f2`" + OST_FILE=/proc/fs/lustre/obdfilter/ost_svc/num_exports + OST_NEXP1="`do_facet ost cat $OST_FILE | cut -d' ' -f2`" + echo starting with $OST_NEXP1 OST and $MDS_NEXP1 MDS exports + zconf_umount `hostname` $MOUNT2 -f + # evictor takes up to 2.25x to evict. But if there's a + # race to start the evictor from various obds, the loser + # might have to wait for the next ping. 
+ echo Waiting for $(($TIMEOUT * 4)) secs + sleep $(($TIMEOUT * 4)) + OST_NEXP2="`do_facet ost cat $OST_FILE | cut -d' ' -f2`" + MDS_NEXP2="`do_facet mds cat $MDS_FILE | cut -d' ' -f2`" + echo ending with $OST_NEXP2 OST and $MDS_NEXP2 MDS exports + [ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted from OST" + [ $MDS_NEXP1 -le $MDS_NEXP2 ] && error "client not evicted from MDS" + return 0 +} +run_test 26b "evict dead exports" + test_27() { [ "`lsmod | grep mds`" ] || \ { echo "skipping test 27 (non-local MDS)" && return 0; } diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 6df53a8..8352be3 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -15,7 +15,7 @@ init_test_env $@ # Skip these tests # bug number: 2766 9930 -ALWAYS_EXCEPT="0b 39 $REPLAY_SINGLE_EXCEPT" +ALWAYS_EXCEPT="0b $REPLAY_SINGLE_EXCEPT" gen_config() { rm -f $XMLCONFIG diff --git a/lustre/tests/rundbench b/lustre/tests/rundbench index 09a0549..fe80594 100755 --- a/lustre/tests/rundbench +++ b/lustre/tests/rundbench @@ -8,7 +8,7 @@ SRC=${SRC:-/usr/lib/dbench/client.txt} [ ! -s $TGT -a -s $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT SRC=/usr/lib/dbench/client_plain.txt [ ! -s $TGT -a -s $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT -[ ! -s $TGT ] && echo "$TGT doesn't exist" && exit 1 +[ ! 
-s $TGT ] && echo "$0: $TGT doesn't exist (SRC=$SRC)" && exit 1 cd $DIR echo "running 'dbench $@' on $PWD at `date`" dbench -c client.txt $@ diff --git a/lustre/tests/runregression-mds.sh b/lustre/tests/runregression-mds.sh deleted file mode 100755 index 1b05df8..0000000 --- a/lustre/tests/runregression-mds.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/sh - -SRCDIR="`dirname $0`" - -ENDRUN=endrun-`hostname` - -fail() { - echo "ERROR: $1" 1>&2 - [ $2 ] && RC=$2 || RC=1 - exit $RC -} - -export PATH=/sbin:/usr/sbin:$SRCDIR:$PATH - -cleanup() { - trap 0 - $LCONF --cleanup $OPTS -} - -[ "$COUNT" ] || COUNT=1000 - -[ "$LCONF" ] || LCONF=$SRCDIR/../utils/lconf - -[ -z "$*" ] && fail "usage: $0 [--reformat] .xml" 1 - -OSCMT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`" -if [ -z "$OSCMT" ]; then - $LCONF $@ || exit 1 - trap cleanup EXIT - OSCMT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`" - [ -z "$OSCMT" ] && fail "no lustre filesystem mounted" 1 -fi - -V="-10" -while [ "$1" ]; do - case $1 in - -v|--verbose) V="1";; - --reformat) : ;; - *) OPTS="$OPTS $1" ;; - esac - shift -done - -OSCTMP=`echo $OSCMT | tr "/" "."` -USED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -n 1` -USED=`expr $USED + 16` # Some space for the status file - -THREADS=1 -while [ $THREADS -lt 196 ]; do - echo "starting $THREADS threads at `date`" - [ $V -gt 0 ] || echo 0 > /proc/sys/lnet/debug - $SRCDIR/createdestroy /mnt/lustre/file-$$ $COUNT $V $THREADS - $SRCDIR/openclose /mnt/lustre/file-$$ $COUNT $THREADS - THREADS=`expr $THREADS + 5` - $LCONF --cleanup $OPTS || fail 10 - $LCONF $OPTS || fail 11 -done - -rm -f $ENDRUN - -NOWUSED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -n 1` -if [ $NOWUSED -gt $USED ]; then - echo "Space not all freed: now ${NOWUSED}kB, was ${USED}kB." 1>&2 - echo "This is normal on BA OSTs, because of subdirectories." 
1>&2 -fi - -cleanup diff --git a/lustre/tests/runtests b/lustre/tests/runtests index 75f8765..0969f23 100755 --- a/lustre/tests/runtests +++ b/lustre/tests/runtests @@ -36,20 +36,22 @@ while [ "$1" ]; do shift done -MOUNT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`" -if [ -z "$MOUNT" ]; then +EXISTING_MOUNT="`mount | awk '/ lustre(_lite)? / { print $3 }' | tail -n 1`" +if [ -z "$EXISTING_MOUNT" ]; then sh llmount.sh $OPTS - MOUNT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`" - [ -z "$MOUNT" ] && fail "no lustre filesystem mounted" 1 + EXISTING_MOUNT="`mount | awk '/ lustre(_lite)? / { print $3 }' | tail -n 1`" + [ -z "$EXISTING_MOUNT" ] && fail "no lustre filesystem mounted" 1 I_MOUNTED="yes" fi +MOUNT=$EXISTING_MOUNT OSCTMP=`echo $MOUNT | tr "/" "."` USED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -n 1` USED=`expr $USED + 16` # Some space for the status file # let's start slowly here... -log "touching $MOUNT" +START=`date +%s` +log "touching $MOUNT at `date`" touch $MOUNT || fail "can't touch $MOUNT" 2 HOSTS=$MOUNT/hosts.$$ @@ -79,16 +81,17 @@ mkdir $DST || fail "can't mkdir $DST" 10 # ok, that hopefully worked, so let's do a little more, with files that # haven't changed in the last day (hopefully they don't change during test) FILES=`find $SRC -type f -mtime +1 -ctime +1 | head -n $COUNT` -log "copying files from $SRC to $DST$SRC" +log "copying files from $SRC to $DST$SRC at `date`" tar cf - $FILES | tar xvf - -C $DST || fail "copying $SRC" 11 -log "comparing newly copied files" +log "comparing newly copied files at `date`" for f in $FILES; do [ $V ] && log "verifying $DST/$f" diff -q $f $DST/$f || ERROR=11 done [ "$ERROR" ] && fail "old and new files are different" $ERROR +log "finished at `date` ($(($(date +%s) - START)))" sh llmountcleanup.sh || exit 19 sh llrmount.sh $OPTS || exit 20 diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 1952835..72ecbc5 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh 
@@ -228,7 +228,7 @@ rm -rf $DIR/[Rdfs][1-9]* build_test_filter echo "preparing for tests involving mounts" -EXT2_DEV=${EXT2_DEV:-/tmp/SANITY.LOOP} +EXT2_DEV=${EXT2_DEV:-$TMP/SANITY.LOOP} touch $EXT2_DEV mke2fs -j -F $EXT2_DEV 8000 > /dev/null echo # add a newline after mke2fs. @@ -585,7 +585,7 @@ test_22() { mkdir $DIR/d22 chown $RUNAS_ID $DIR/d22 # Tar gets pissy if it can't access $PWD *sigh* - (cd /tmp; + (cd $TMP; $RUNAS tar cf - /etc/hosts /etc/sysconfig/network | \ $RUNAS tar xfC - $DIR/d22) ls -lR $DIR/d22/etc @@ -1030,7 +1030,7 @@ test_27o() { exhaust_all_precreations 0x215 sleep 5 - touch $DIR/d27/f27o && error + touch $DIR/d27/f27o && error "able to create $DIR/d27/f27o" reset_enospc } @@ -2805,37 +2805,49 @@ function get_named_value() done } +export CACHE_MAX=`cat /proc/fs/lustre/llite/*/max_cached_mb | head -n 1` +cleanup_101() { + for s in $LPROC/llite/*/max_cached_mb; do + echo $CACHE_MAX > $s + done + trap 0 +} + test_101() { local s local discard - local nreads + local nreads=10000 + local cache_limit=32 - for s in $LPROC/osc/OSC_*/rpc_stats ;do + for s in $LPROC/osc/OSC_*/rpc_stats; do echo 0 > $s done - for s in $LPROC/llite/*/read_ahead_stats ;do - echo 0 > $s + trap cleanup_101 EXIT + for s in $LPROC/llite/fs*; do + echo 0 > $s/read_ahead_stats + echo $cache_limit > $s/max_cached_mb done # - # randomly read 10000 of 64K chunks from 200M file. 
+ # randomly read 10000 of 64K chunks from file 3x 32MB in size # - nreads=10000 - $RANDOM_READS -f $DIR/f101 -s200000000 -b65536 -C -n$nreads -t 180 + echo "nreads: $nreads file size: $((cache_limit * 3))MB" + $RANDOM_READS -f $DIR/$tfile -s$((cache_limit * 3192 * 1024)) -b65536 -C -n$nreads -t 180 discard=0 - for s in $LPROC/llite/*/read_ahead_stats ;do - discard=$(($discard + $(cat $s | get_named_value 'read but discarded'))) + for s in $LPROC/llite/fs*; do + discard=$(($discard + $(cat $s/read_ahead_stats | get_named_value 'read but discarded'))) done + cleanup_101 if [ $(($discard * 10)) -gt $nreads ] ;then cat $LPROC/osc/OSC_*/rpc_stats cat $LPROC/llite/*/read_ahead_stats error "too many ($discard) discarded pages" fi - rm -f $DIR/f101 || true + rm -f $DIR/$tfile || true } -run_test 101 "check read-ahead for random reads ===========" +run_test 101 "check read-ahead for random reads ================" test_102() { local testfile=$DIR/xattr_testfile @@ -2844,7 +2856,7 @@ test_102() { touch $testfile [ "$UID" != 0 ] && echo "skipping $TESTNAME (must run as root)" && return - [ -z "`grep \ $LPROC/mdc/MDC*MNT*/connect_flags`" ] && echo "skipping $TESTNAME (must have user_xattr)" && return + [ -z "`grep xattr $LPROC/mdc/MDC*MNT*/connect_flags`" ] && echo "skipping $TESTNAME (must have user_xattr)" && return echo "set/get xattr..." 
setfattr -n trusted.name1 -v value1 $testfile || error [ "`getfattr -n trusted.name1 $testfile 2> /dev/null | \ @@ -2880,7 +2892,7 @@ test_102() { rm -f $testfile } -run_test 102 "user xattr test =====================" +run_test 102 "user xattr test ==================================" run_acl_subtest() { @@ -2896,7 +2908,7 @@ test_103 () { [ "$UID" != 0 ] && echo "skipping $TESTNAME (must run as root)" && return [ -z "`mount | grep " $DIR .*\"`" ] && echo "skipping $TESTNAME (must have acl)" && return [ -z "`grep acl $LPROC/mdc/MDC*MNT*/connect_flags`" ] && echo "skipping $TESTNAME (must have acl)" && return - $(which setfacl 2>/dev/null) || echo "skipping $TESTNAME (could not find setfacl)" && return + which setfacl 2>/dev/null || (echo "skipping $TESTNAME (could not find setfacl)" && return) echo "performing cp ..." run_acl_subtest cp || error @@ -2920,14 +2932,14 @@ test_103 () { cd $SAVED_PWD umask $SAVE_UMASK } -run_test 103 "==============acl test =============" +run_test 103 "acl test =========================================" test_104() { touch $DIR/$tfile lfs df || error "lfs df failed" lfs df -ih || error "lfs df -ih failed" - lfs df $DIR || error "lfs df $DIR failed" - lfs df -ih $DIR || error "lfs df -ih $DIR failed" + lfs df -h $DIR || error "lfs df -h $DIR failed" + lfs df -i $DIR || error "lfs df -i $DIR failed" lfs df $DIR/$tfile || error "lfs df $DIR/$tfile failed" lfs df -ih $DIR/$tfile || error "lfs df -ih $DIR/$tfile failed" @@ -2937,7 +2949,7 @@ test_104() { lctl --device %$OSC recover lfs df || error "lfs df with reactivated OSC failed" } -run_test 104 "lfs>df [-ih] [path] test ============" +run_test 104 "lfs df [-ih] [path] test =========================" TMPDIR=$OLDTMPDIR TMP=$OLDTMP diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh index a1df23a..57cfaa8 100644 --- a/lustre/tests/sanityN.sh +++ b/lustre/tests/sanityN.sh @@ -519,6 +519,45 @@ test_23() { # Bug 5972 } run_test 23 " others should see updated atime while another 
read====" +test_24() { + touch $DIR1/$tfile + lfs df || error "lfs df failed" + lfs df -ih || error "lfs df -ih failed" + lfs df -h $DIR1 || error "lfs df -h $DIR1 failed" + lfs df -i $DIR2 || error "lfs df -i $DIR2 failed" + lfs df $DIR1/$tfile || error "lfs df $DIR1/$tfile failed" + lfs df -ih $DIR2/$tfile || error "lfs df -ih $DIR2/$tfile failed" + + OSC=`lctl dl | awk '/OSC.*MNT/ {print $4}' | head -n 1` + lctl --device %$OSC deactivate + lfs df -i || error "lfs df -i with deactivated OSC failed" + lctl --device %$OSC recover + lfs df || error "lfs df with reactivated OSC failed" +} +run_test 24 "lfs df [-ih] [path] test =========================" + +test_25() { + [ -z "`mount | grep " $DIR1 .*\"`" ] && echo "skipping $TESTNAME ($DIR1 must have acl)" && return + [ -z "`mount | grep " $DIR2 .*\"`" ] && echo "skipping $TESTNAME ($DIR2 must have acl)" && return + + mkdir $DIR1/d25 || error + touch $DIR1/d25/f1 || error + chmod 0755 $DIR1/d25/f1 || error + + $RUNAS checkstat $DIR2/d25/f1 || error + setfacl -m u:$RUNAS_ID:--- $DIR1/d25 || error + $RUNAS checkstat $DIR2/d25/f1 && error + setfacl -m u:$RUNAS_ID:r-x $DIR1/d25 || error + $RUNAS checkstat $DIR2/d25/f1 || error + setfacl -m u:$RUNAS_ID:--- $DIR1/d25 || error + $RUNAS checkstat $DIR2/d25/f1 && error + setfacl -x u:$RUNAS_ID: $DIR1/d25 || error + $RUNAS checkstat $DIR2/d25/f1 || error + + rm -rf $DIR1/d25 +} +run_test 25 "change ACL on one mountpoint be seen on another ===" + log "cleanup: ======================================================" rm -rf $DIR1/[df][0-9]* $DIR1/lnk || true diff --git a/lustre/utils/Lustre/lustredb.py b/lustre/utils/Lustre/lustredb.py index 3283153..82409e1 100644 --- a/lustre/utils/Lustre/lustredb.py +++ b/lustre/utils/Lustre/lustredb.py @@ -478,6 +478,8 @@ class LustreDB_LDAP(LustreDB): def _get_val(self, k): ret = None + if k == 'name': + k = 'lustreName' if self._attrs.has_key(k): v = self._attrs[k] if type(v) == types.ListType: diff --git a/lustre/utils/l_getgroups.c 
b/lustre/utils/l_getgroups.c index 61c87ee..de4bac0 100644 --- a/lustre/utils/l_getgroups.c +++ b/lustre/utils/l_getgroups.c @@ -128,14 +128,15 @@ int main(int argc, char **argv) else progname++; - if (strcmp(argv[1], "-d") == 0) - debug = 1; - if (argc != 3) { fprintf(stderr, "%s: bad parameter count\n", progname); usage(stderr); return EINVAL; } + + if (strcmp(argv[1], "-d") == 0) + debug = 1; + param->mgd_uid = strtoul(argv[2], &end, 0); if (*end) { fprintf(stderr, "%s: invalid uid '%s'\n", progname, argv[2]); diff --git a/lustre/utils/lconf b/lustre/utils/lconf index c550731..44419e8 100755 --- a/lustre/utils/lconf +++ b/lustre/utils/lconf @@ -1253,7 +1253,10 @@ class MDSDEV(Module): self.nspath = self.db.get_val('nspath', '') self.mkfsoptions = '-i 4096 ' + self.db.get_val('mkfsoptions', '') self.mountfsoptions = self.db.get_val('mountfsoptions', '') - self.quota = self.db.get_val('quota', '') + if config.quota: + self.quota = config.quota + else: + self.quota = self.db.get_val('quota', '') # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid target_uuid = self.db.get_first_ref('target') mds = self.db.lookup(target_uuid) @@ -1511,14 +1514,19 @@ class OSD(Module): self.journal_size = self.db.get_val_int('journalsize', 0) # now as we store fids in EA on OST we need to make inode bigger - self.inode_size = self.db.get_val_int('inodesize', 256) + self.inode_size = self.db.get_val_int('inodesize', 0) + if self.inode_size == 0: + self.inode_size = 256 self.mkfsoptions = self.db.get_val('mkfsoptions', '') # Allocate fewer inodes on large OST devices. Most filesystems # can be much more aggressive than this, but by default we can't. 
if self.size > 1000000: self.mkfsoptions = '-i 16384 ' + self.mkfsoptions self.mountfsoptions = self.db.get_val('mountfsoptions', '') - self.quota = self.db.get_val('quota', '') + if config.quota: + self.quota = config.quota + else: + self.quota = self.db.get_val('quota', '') self.fstype = self.db.get_val('fstype', '') if sys_get_branch() == '2.4' and self.fstype == 'ldiskfs': @@ -1734,7 +1742,6 @@ class Client(Module): else: for srv in this_nets: lctl.connect(srv) - break if srv: lctl.add_conn(self.name, srv.nid_uuid); @@ -1787,8 +1794,10 @@ class COBD(Module): # virtual interface for OSC and LOV class VOSC(Module): - def __init__(self, db, uuid, fs_name, name_override = None): + def __init__(self, db, uuid, fs_name, name_override = None, quota = None): Module.__init__(self, 'VOSC', db) + if quota: + self.add_lustre_module('quota', 'lquota') if db.get_class() == 'lov': self.osc = LOV(db, uuid, fs_name, name_override) else: @@ -1802,9 +1811,11 @@ class VOSC(Module): def cleanup(self): self.osc.cleanup() def load_module(self): + Module.load_module(self) self.osc.load_module() def cleanup_module(self): self.osc.cleanup_module() + Module.cleanup_module(self) class ECHO_CLIENT(Module): @@ -1874,16 +1885,17 @@ class Mountpoint(Module): self.fs_uuid = self.db.get_first_ref('filesystem') fs = self.db.lookup(self.fs_uuid) self.mds_uuid = fs.get_first_ref('mds') + mds_db = self.db.lookup(self.mds_uuid) + if config.quota: + quota = config.quota + else: + quota = mds_db.get_val('quota', config.quota) self.obd_uuid = fs.get_first_ref('obd') obd = self.db.lookup(self.obd_uuid) client_uuid = generate_client_uuid(self.name) - self.vosc = VOSC(obd, client_uuid, self.name) + self.vosc = VOSC(obd, client_uuid, self.name, quota=quota) self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid) - mds_db = self.db.lookup(self.mds_uuid) - quota = mds_db.get_val('quota', '') - if quota: - self.add_lustre_module('quota', 'lquota') self.add_lustre_module('mdc', 'mdc') 
self.add_lustre_module('llite', 'llite') @@ -2742,6 +2754,7 @@ lconf_options = [ PARAMLIST), ('user_xattr', """Enable user_xattr support on MDS""", FLAG, 0), ('acl', """Enable ACL support on MDS""", FLAG, 0), + ('quota', "Enable quota support for client file system", PARAM), ] def main(): diff --git a/lustre/utils/lfs.c b/lustre/utils/lfs.c index 2a8fbf5..aa27001 100644 --- a/lustre/utils/lfs.c +++ b/lustre/utils/lfs.c @@ -384,42 +384,36 @@ static int path2mnt(char *path, FILE *fp, char *mntdir, int dir_len) if (out_len > 0) return 0; - + fprintf(stderr, "error: lfs df: %s isn't mounted on lustre\n", path); return -EINVAL; } static int showdf(char *mntdir, struct obd_statfs *stat, - struct obd_uuid *uuid, int ishow, int cooked, + char *uuid, int ishow, int cooked, char *type, int index, int rc) { __u64 avail, used, total; double ratio = 0; - int obd_type; char *suffix = "KMGTPEZY"; char tbuf[10], ubuf[10], abuf[10], rbuf[10]; - if (!uuid || !stat || !type) - return -EINVAL; - if (!strncmp(type, "MDT", 3)) { - obd_type = 0; - } else if(!strncmp(type, "OST", 3)){ - obd_type = 1; - } else { - fprintf(stderr, "error: lfs df: invalid type '%s'\n", type); + if (!uuid || !stat) return -EINVAL; - } - if (rc == 0) { + switch (rc) { + case 0: if (ishow) { avail = stat->os_ffree; used = stat->os_files - stat->os_ffree; total = stat->os_files; } else { - avail = stat->os_bavail * stat->os_bsize / 1024; + int shift = cooked ? 
0 : 10; + + avail = (stat->os_bavail * stat->os_bsize) >> shift; used = stat->os_blocks - stat->os_bavail; - used = used * stat->os_bsize / 1024; - total = stat->os_blocks * stat->os_bsize / 1024; + used = (used * stat->os_bsize) >> shift; + total = (stat->os_blocks * stat->os_bsize) >> shift; } if (total > 0) @@ -427,26 +421,26 @@ static int showdf(char *mntdir, struct obd_statfs *stat, if (cooked) { int i; - double total_d, used_d, avail_d; - - total_d = (double)total; - i = COOK(total_d); + double cook_val; + + cook_val = (double)total; + i = COOK(cook_val); if (i > 0) - sprintf(tbuf, HDF"%c", total_d, suffix[i - 1]); + sprintf(tbuf, HDF"%c", cook_val, suffix[i - 1]); else sprintf(tbuf, CDF, total); - used_d = (double)used; - i = COOK(used_d); + cook_val = (double)used; + i = COOK(cook_val); if (i > 0) - sprintf(ubuf, HDF"%c", used_d, suffix[i - 1]); + sprintf(ubuf, HDF"%c", cook_val, suffix[i - 1]); else sprintf(ubuf, CDF, used); - avail_d = (double)avail; - i = COOK(avail_d); + cook_val = (double)avail; + i = COOK(cook_val); if (i > 0) - sprintf(abuf, HDF"%c", avail_d, suffix[i - 1]); + sprintf(abuf, HDF"%c", cook_val, suffix[i - 1]); else sprintf(abuf, CDF, avail); } else { @@ -456,23 +450,19 @@ static int showdf(char *mntdir, struct obd_statfs *stat, } sprintf(rbuf, RDF, (int)(ratio * 100)); - if (obd_type == 0) - printf(UUF" "CSF" "CSF" "CSF" "RSF" %-s[MDT:%d]\n", - (char *)uuid, tbuf, ubuf, abuf, rbuf, - mntdir, index); + printf(UUF" "CSF" "CSF" "CSF" "RSF" %-s", + uuid, tbuf, ubuf, abuf, rbuf, mntdir); + if (type) + printf("[%s:%d]\n", type, index); else - printf(UUF" "CSF" "CSF" "CSF" "RSF" %-s[OST:%d]\n", - (char *)uuid, tbuf, ubuf, abuf, rbuf, - mntdir, index); + printf("\n"); - return 0; - } - switch (rc) { + break; case -ENODATA: - printf(UUF": inactive OST\n", (char *)uuid); + printf(UUF": inactive device\n", uuid); break; default: - printf(UUF": %s\n", (char *)uuid, strerror(-rc)); + printf(UUF": %s\n", uuid, strerror(-rc)); break; } @@ -481,12 
+471,9 @@ static int showdf(char *mntdir, struct obd_statfs *stat, static int mntdf(char *mntdir, int ishow, int cooked) { - struct obd_statfs stat_buf; + struct obd_statfs stat_buf, sum = { .os_bsize = 1 }; struct obd_uuid uuid_buf; __u32 index; - __u64 avail_sum, used_sum, total_sum; - char tbuf[10], ubuf[10], abuf[10], rbuf[10]; - double ratio_sum = 0; int rc; if (ishow) @@ -495,10 +482,9 @@ static int mntdf(char *mntdir, int ishow, int cooked) "IUse%", "Mounted on"); else printf(UUF" "CSF" "CSF" "CSF" "RSF" %-s\n", - "UUID", "1K-blocks", "Used", "Available", - "Use%", "Mounted on"); + "UUID", cooked ? "bytes" : "1K-blocks", + "Used", "Available", "Use%", "Mounted on"); - avail_sum = total_sum = 0; for (index = 0; ; index++) { memset(&stat_buf, 0, sizeof(struct obd_statfs)); memset(&uuid_buf, 0, sizeof(struct obd_uuid)); @@ -509,7 +495,7 @@ static int mntdf(char *mntdir, int ishow, int cooked) if (rc == -ENOTCONN || rc == -ETIMEDOUT || rc == -EIO || rc == -ENODATA || rc == 0) { - showdf(mntdir, &stat_buf, &uuid_buf, ishow, cooked, + showdf(mntdir, &stat_buf, uuid_buf.uuid, ishow, cooked, "MDT", index, rc); } else { fprintf(stderr, @@ -517,13 +503,13 @@ static int mntdf(char *mntdir, int ishow, int cooked) uuid_buf.uuid, strerror(-rc), rc); return rc; } - if (!rc && ishow) { - avail_sum += stat_buf.os_ffree; - total_sum += stat_buf.os_files; + if (rc == 0) { + sum.os_ffree += stat_buf.os_ffree; + sum.os_files += stat_buf.os_files; } } - for (index = 0;;index++) { + for (index = 0; ; index++) { memset(&stat_buf, 0, sizeof(struct obd_statfs)); memset(&uuid_buf, 0, sizeof(struct obd_uuid)); rc = llapi_obd_statfs(mntdir, LL_STATFS_LOV, index, @@ -533,7 +519,7 @@ static int mntdf(char *mntdir, int ishow, int cooked) if (rc == -ENOTCONN || rc == -ETIMEDOUT || rc == -EIO || rc == -ENODATA || rc == 0) { - showdf(mntdir, &stat_buf, &uuid_buf, ishow, cooked, + showdf(mntdir, &stat_buf, uuid_buf.uuid, ishow, cooked, "OST", index, rc); } else { fprintf(stderr, @@ -541,55 
+527,15 @@ static int mntdf(char *mntdir, int ishow, int cooked) strerror(-rc), rc); return rc; } - if (!rc && !ishow) { - __u64 avail, total; - avail = stat_buf.os_bavail * stat_buf.os_bsize; - avail /= 1024; - total = stat_buf.os_blocks * stat_buf.os_bsize; - total /= 1024; - - avail_sum += avail; - total_sum += total; + if (rc == 0) { + sum.os_blocks += stat_buf.os_blocks * stat_buf.os_bsize; + sum.os_bfree += stat_buf.os_bfree * stat_buf.os_bsize; + sum.os_bavail += stat_buf.os_bavail * stat_buf.os_bsize; } } - used_sum = total_sum - avail_sum; - if (total_sum > 0) - ratio_sum = (double)(total_sum - avail_sum) / (double)total_sum; - sprintf(rbuf, RDF, (int)(ratio_sum * 100)); - if (cooked) { - int i; - char *suffix = "KMGTPEZY"; - double total_sum_d, used_sum_d, avail_sum_d; - - total_sum_d = (double)total_sum; - i = COOK(total_sum_d); - if (i > 0) - sprintf(tbuf, HDF"%c", total_sum_d, suffix[i - 1]); - else - sprintf(tbuf, CDF, total_sum); - - used_sum_d = (double)used_sum; - i = COOK(used_sum_d); - if (i > 0) - sprintf(ubuf, HDF"%c", used_sum_d, suffix[i - 1]); - else - sprintf(ubuf, CDF, used_sum); - - avail_sum_d = (double)avail_sum; - i = COOK(avail_sum_d); - if (i > 0) - sprintf(abuf, HDF"%c", avail_sum_d, suffix[i - 1]); - else - sprintf(abuf, CDF, avail_sum); - } else { - sprintf(tbuf, CDF, total_sum); - sprintf(ubuf, CDF, used_sum); - sprintf(abuf, CDF, avail_sum); - } - - printf("\n"UUF" "CSF" "CSF" "CSF" "RSF" %-s\n", - "filesystem summary:", tbuf, ubuf, abuf, rbuf, mntdir); + printf("\n"); + showdf(mntdir, &sum, "filesystem summary:", ishow, cooked, NULL, 0,0); return 0; } diff --git a/lustre/utils/llmount.c b/lustre/utils/llmount.c index fc75f21..1c10faa 100644 --- a/lustre/utils/llmount.c +++ b/lustre/utils/llmount.c @@ -41,7 +41,9 @@ int verbose; int nomtab; int fake; int force; +int retry; static char *progname = NULL; +#define MAX_RETRIES 99 void usage(FILE *out) { @@ -59,6 +61,7 @@ void usage(FILE *out) "\t-v|--verbose: print verbose config 
settings\n" "\t-o: filesystem mount options:\n" "\t\tflock/noflock: enable/disable flock support\n" + "\t\troute=[-]:[-]: portal route to MDS\n" "\t\tuser_xattr/nouser_xattr: enable/disable user extended " "attributes\n" ); @@ -115,6 +118,9 @@ update_mtab_entry(char *spec, char *mtpt, char *type, char *opts, fprintf(stderr, "%s: addmntent: %s:", progname, strerror (errno)); rc = 16; + } else if (verbose > 1) { + fprintf(stderr, "%s: added %s on %s to %s\n", + progname, spec, mtpt, MOUNTED); } endmntent(fp); } @@ -141,6 +147,7 @@ print_options(FILE *out, struct lustre_mount_data *lmd, const char *options) fprintf(out, "mds name: %s\n", lmd->lmd_mds); fprintf(out, "profile: %s\n", lmd->lmd_profile); fprintf(out, "options: %s\n", options); + fprintf(out, "retry: %d\n", retry); return 0; } @@ -243,8 +250,11 @@ int parse_options(char *options, struct lustre_mount_data *lmd, int *flagp) if ((opteq = strchr(opt, '='))) { val = atoi(opteq + 1); *opteq = '\0'; - if (0) { - /* All the network options have gone :)) */ + if (!strcmp(opt, "retry")) { + if (val >= 0 && val < MAX_RETRIES) /* only 0..MAX_RETRIES-1 is accepted; anything else disables retries */ + retry = val; + else + retry = 0; } else { fprintf(stderr, "%s: unknown option '%s'. 
" "Ignoring.\n", progname, opt); @@ -353,12 +363,14 @@ int main(int argc, char *const argv[]) switch (opt) { case 1: ++force; - printf("force: %d\n", force); + if (verbose) + printf("force: %d\n", force); nargs++; break; case 'f': ++fake; - printf("fake: %d\n", fake); + if (verbose) + printf("fake: %d\n", fake); nargs++; break; case 'h': @@ -366,7 +378,8 @@ int main(int argc, char *const argv[]) break; case 'n': ++nomtab; - printf("nomtab: %d\n", nomtab); + if (verbose) + printf("nomtab: %d\n", nomtab); nargs++; break; case 'o': @@ -428,15 +441,29 @@ int main(int argc, char *const argv[]) return 1; } - if (!fake) - rc = mount(source, target, "lustre", flags, (void *)&lmd); + if (!fake) { + FILE *modpipe = popen("/sbin/modprobe -q llite", "r"); + if (modpipe != NULL) + pclose(modpipe); + /* use <= to include the initial mount before we retry */ + for (i = 0, rc = -EAGAIN; i <= retry && rc != 0; i++) + rc = mount(source, target, "lustre", flags, &lmd); + } if (rc) { fprintf(stderr, "%s: mount(%s, %s) failed: %s\n", progname, source, target, strerror(errno)); print_options(stderr, &lmd, options); - if (errno == ENODEV) + if (errno == ENODEV) { + struct utsname unamebuf; + char *modfile = "/etc/modutils.conf"; + + if (uname(&unamebuf) == 0 && + strncmp(unamebuf.release, "2.4", 3) == 0) + modfile = "/etc/modules.conf"; + fprintf(stderr, "Are the lustre modules loaded?\n" - "Check /etc/modules.conf and /proc/filesystems\n"); + "Check %s and /proc/filesystems\n", modfile); + } rc = 32; } else if (!nomtab) { rc = update_mtab_entry(source, target, "lustre", options,0,0,0); diff --git a/lustre/utils/lmc b/lustre/utils/lmc index ef0c7e1..fb80016 100755 --- a/lustre/utils/lmc +++ b/lustre/utils/lmc @@ -201,19 +201,20 @@ lmc_options = [ ('mdsuuid', "Optional argument to specify MDS UUID", PARAM,""), ('nspath', "Local mount point of server namespace.", PARAM,""), ('format', ""), - ('quota', "quotaon:enable quota, only u|g|ug is supported now. 
\ - iunit: the unit for slave to acquire/release inode quota from/to masteri.\ - Int type (>0), default value in Lustre is 5000 inodes.\ - bunit: the unit for slave to acquire/release block quota from/to master.\ - Mbytes (>0), default value in Lustre is 100(Mbytes).\ - itune: used to tune the threthold. When inode quota usage reach the threthold,\ - slave should acquire/release inode quota from/to master.\ - Int type (100 > btune > 0), default value in Lustre is 50 (percentge).\ - inode threthold = iunit * itune / 100.\ - btune: used to tune the threthold. When block quota usage reach the threthold,\ - slave should acquire/release block quota from/to master.\ - Int type (100 > btune > 0), default value in Lustre is 50 (percentage).\ - block threthold = bunit * btune / 100.", PARAM,""), + ('quota', """ + quotaon: enable quota, only u|g|ug is supported now. + iunit: the unit for slave to acquire/release inode quota from/to master. + Int type (>0), default value in Lustre is 5000 inodes. + bunit: the unit for slave to acquire/release block quota from/to master. + Mbytes (>0), default value in Lustre is 100(Mbytes). + itune: used to tune the threshold. When inode quota usage reaches the threshold, + slave should acquire/release inode quota from/to master. + Int type (100 > itune > 0), default value in Lustre is 50 (percentage). + inode threshold = iunit * itune / 100. + btune: used to tune the threshold. When block quota usage reaches the threshold, + slave should acquire/release block quota from/to master. + Int type (100 > btune > 0), default value in Lustre is 50 (percentage). 
+ block threshold = bunit * btune / 100.""", PARAM,""), # clients: mountpoint and echo ('echo_client', "", PARAM), ('path', "Specify the mountpoint for Lustre.", PARAM), diff --git a/lustre/utils/rmmod_all.sh b/lustre/utils/rmmod_all.sh index e948e31..9ae82bb 100755 --- a/lustre/utils/rmmod_all.sh +++ b/lustre/utils/rmmod_all.sh @@ -1,18 +1,8 @@ #!/bin/sh -rmmod llite -rmmod mdc -rmmod lov -rmmod osc -rmmod obdfilter -rmmod fsfilt_ext3 -rmmod fsfilt_ldiskfs -rmmod ldiskfs -rmmod ost -rmmod mds -rmmod ptlrpc -rmmod obdclass -rmmod lvfs -rmmod ksocklnd -rmmod lnet -rmmod libcfs +SRCDIR=`dirname $0` +PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH + +lctl modules | awk '{ print $2 }' | xargs rmmod >/dev/null 2>&1 +# do it again, in case we tried to unload ksocklnd too early +lctl modules | awk '{ print $2 }' | xargs rmmod diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index 0ba4cd1..27b12f7 100644 --- a/lustre/utils/wirecheck.c +++ b/lustre/utils/wirecheck.c @@ -367,7 +367,7 @@ check_mds_body(void) CHECK_VALUE(FMODE_READ); CHECK_VALUE(FMODE_WRITE); - CHECK_VALUE(FMODE_EXEC); + CHECK_VALUE(MDS_FMODE_EXEC); CHECK_CDEFINE(MDS_OPEN_CREAT); CHECK_CDEFINE(MDS_OPEN_EXCL); diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c index ee8d916..dd8664b 100644 --- a/lustre/utils/wiretest.c +++ b/lustre/utils/wiretest.c @@ -872,8 +872,8 @@ void lustre_assert_wire_constants(void) (long long)FMODE_READ); LASSERTF(FMODE_WRITE == 2, " found %lld\n", (long long)FMODE_WRITE); - LASSERTF(FMODE_EXEC == 4, " found %lld\n", - (long long)FMODE_EXEC); + LASSERTF(MDS_FMODE_EXEC == 4, " found %lld\n", + (long long)MDS_FMODE_EXEC); CLASSERT(MDS_OPEN_CREAT == 00000100); CLASSERT(MDS_OPEN_EXCL == 00000200); CLASSERT(MDS_OPEN_TRUNC == 00001000);