===================================================================
--- linux-2.6.12-rc6.orig/fs/ext3/extents.c 2005-06-14 16:31:25.756503133 +0200
+++ linux-2.6.12-rc6/fs/ext3/extents.c 2005-06-14 16:31:25.836581257 +0200
-@@ -0,0 +1,2347 @@
+@@ -0,0 +1,2353 @@
+/*
+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+
+static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
+{
-+ struct ext3_extent_header *neh;
-+ neh = EXT_ROOT_HDR(tree);
-+ neh->eh_generation++;
++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++ (EXT_GENERATION(neh) + 1);
+}
+
+static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
+
+ eh = EXT_ROOT_HDR(tree);
+ EXT_ASSERT(eh);
-+ if (ext3_ext_check_header(eh))
++ if (ext3_ext_check_header(eh)) {
++ /* don't free previously allocated path
++ * -- caller should take care */
++ path = NULL;
+ goto err;
++ }
+
+ i = depth = EXT_DEPTH(tree);
+ EXT_ASSERT(eh->eh_max);
+
+err:
+ printk(KERN_ERR "EXT3-fs: header is corrupted!\n");
-+ ext3_ext_drop_refs(path);
-+ kfree(path);
++ if (path) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ }
+ return ERR_PTR(-EIO);
+}
+
===================================================================
--- linux-2.6.12-rc6.orig/include/linux/ext3_extents.h 2005-06-14 16:31:25.780917195 +0200
+++ linux-2.6.12-rc6/include/linux/ext3_extents.h 2005-06-14 16:31:25.932284381 +0200
-@@ -0,0 +1,264 @@
+@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ __u16 eh_entries; /* number of valid entries */
+ __u16 eh_max; /* capacity of store in entries */
+ __u16 eh_depth; /* has tree real underlaying blocks? */
-+ __u32 eh_generation; /* generation of the tree */
++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */
+};
+
+#define EXT3_EXT_MAGIC 0xf30a
+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
+#define EXT_MAX_INDEX(__hdr__) \
+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */
+
-+#define EXT_ROOT_HDR(tree) \
-+ ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+ ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth)
+
+
+#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
===================================================================
--- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2005-02-17 22:07:57.023609040 +0300
+++ linux-2.6.5-sles9/fs/ext3/extents.c 2005-02-23 01:02:37.396435640 +0300
-@@ -0,0 +1,2349 @@
+@@ -0,0 +1,2355 @@
+/*
+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+
+static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
+{
-+ struct ext3_extent_header *neh;
-+ neh = EXT_ROOT_HDR(tree);
-+ neh->eh_generation++;
++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++ (EXT_GENERATION(neh) + 1);
+}
+
+static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
+
+ eh = EXT_ROOT_HDR(tree);
+ EXT_ASSERT(eh);
-+ if (ext3_ext_check_header(eh))
++ if (ext3_ext_check_header(eh)) {
++ /* don't free previously allocated path
++ * -- caller should take care */
++ path = NULL;
+ goto err;
++ }
+
+ i = depth = EXT_DEPTH(tree);
+ EXT_ASSERT(eh->eh_max);
+
+err:
+ printk(KERN_ERR "EXT3-fs: header is corrupted!\n");
-+ ext3_ext_drop_refs(path);
-+ kfree(path);
++ if (path) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ }
+ return ERR_PTR(-EIO);
+}
+
===================================================================
--- linux-2.6.5-sles9.orig/include/linux/ext3_extents.h 2005-02-17 22:07:57.023609040 +0300
+++ linux-2.6.5-sles9/include/linux/ext3_extents.h 2005-02-23 01:02:37.416432600 +0300
-@@ -0,0 +1,264 @@
+@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ __u16 eh_entries; /* number of valid entries */
+ __u16 eh_max; /* capacity of store in entries */
+ __u16 eh_depth; /* has tree real underlaying blocks? */
-+ __u32 eh_generation; /* generation of the tree */
++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */
+};
+
+#define EXT3_EXT_MAGIC 0xf30a
+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
+#define EXT_MAX_INDEX(__hdr__) \
+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */
+
-+#define EXT_ROOT_HDR(tree) \
-+ ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+ ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth)
+
+
+#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
===================================================================
--- linux-stage.orig/fs/ext3/extents.c 2005-02-25 15:33:48.890198160 +0200
+++ linux-stage/fs/ext3/extents.c 2005-02-25 15:33:48.917194056 +0200
-@@ -0,0 +1,2347 @@
+@@ -0,0 +1,2353 @@
+/*
+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+
+static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
+{
-+ struct ext3_extent_header *neh;
-+ neh = EXT_ROOT_HDR(tree);
-+ neh->eh_generation++;
++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++ (EXT_GENERATION(neh) + 1);
+}
+
+static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
+
+ eh = EXT_ROOT_HDR(tree);
+ EXT_ASSERT(eh);
-+ if (ext3_ext_check_header(eh))
++ if (ext3_ext_check_header(eh)) {
++ /* don't free previously allocated path
++ * -- caller should take care */
++ path = NULL;
+ goto err;
++ }
+
+ i = depth = EXT_DEPTH(tree);
+ EXT_ASSERT(eh->eh_max);
+
+err:
+ printk(KERN_ERR "EXT3-fs: header is corrupted!\n");
-+ ext3_ext_drop_refs(path);
-+ kfree(path);
++ if (path) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ }
+ return ERR_PTR(-EIO);
+}
+
===================================================================
--- linux-stage.orig/include/linux/ext3_extents.h 2005-02-25 15:33:48.891198008 +0200
+++ linux-stage/include/linux/ext3_extents.h 2005-02-25 15:33:48.944189952 +0200
-@@ -0,0 +1,264 @@
+@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ __u16 eh_entries; /* number of valid entries */
+ __u16 eh_max; /* capacity of store in entries */
+ __u16 eh_depth; /* has tree real underlaying blocks? */
-+ __u32 eh_generation; /* generation of the tree */
++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */
+};
+
+#define EXT3_EXT_MAGIC 0xf30a
+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
+#define EXT_MAX_INDEX(__hdr__) \
+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */
+
-+#define EXT_ROOT_HDR(tree) \
-+ ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+ ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth)
+
+
+#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
+ int freed;
+
+ sb = inode->i_sb;
-+ if (!test_opt(sb, MBALLOC))
++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info)
+ ext3_free_blocks_old(handle, inode, block, count);
+ else {
+ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
+ int freed;
+
+ sb = inode->i_sb;
-+ if (!test_opt(sb, MBALLOC))
++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info)
+ ext3_free_blocks_sb(handle, sb, block, count, &freed);
+ else
+ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
+ int freed;
+
+ sb = inode->i_sb;
-+ if (!test_opt(sb, MBALLOC))
++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info)
+ ext3_free_blocks_sb(handle, sb, block, count, &freed);
+ else
+ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
int err;
- if (dir->i_nlink >= EXT3_LINK_MAX)
-+ if (EXT3_DIR_LINK_MAXED(dir))
++ if (EXT3_DIR_LINK_MAX(dir))
return -EMLINK;
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
int err;
- if (inode->i_nlink >= EXT3_LINK_MAX)
-+ if (EXT3_DIR_LINK_MAXED(inode))
++ if (EXT3_DIR_LINK_MAX(inode))
return -EMLINK;
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
- if (!new_inode && new_dir!=old_dir &&
- new_dir->i_nlink >= EXT3_LINK_MAX)
+ if (!new_inode && new_dir != old_dir &&
-+ EXT3_DIR_LINK_MAXED(new_dir))
++ EXT3_DIR_LINK_MAX(new_dir))
goto end_rename;
}
if (!new_bh) {
/*
* Macro-instructions used to manage several block sizes
-@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 {
- */
-
- #ifdef CONFIG_EXT3_INDEX
-- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-- EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
- (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
--#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
--#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
-+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \
-+ (is_dx(dir) && (dir)->i_nlink == 1))
- #else
- #define is_dx(dir) 0
--#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
- #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
- #endif
-
}
static int ext3_add_nondir(handle_t *handle,
-@@ -1706,7 +1712,7 @@
+@@ -1706,7 +1712,7 @@ static int ext3_add_nondir(handle_t
struct ext3_dir_entry_2 * de;
int err, retries = 0;
- if (dir->i_nlink >= EXT3_LINK_MAX)
-+ if (EXT3_DIR_LINK_MAXED(dir))
++ if (EXT3_DIR_LINK_MAX(dir))
return -EMLINK;
retry:
-@@ -1729,7 +1735,7 @@
+@@ -1729,7 +1735,7 @@ static int ext3_mkdir(struct inode
inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
dir_block = ext3_bread (handle, inode, 0, 1, &err);
if (!dir_block) {
ext3_mark_inode_dirty(handle, inode);
iput (inode);
goto out_stop;
-@@ -1761,7 +1767,7 @@
+@@ -1761,7 +1767,7 @@ static int ext3_mkdir(struct inode
iput (inode);
goto out_stop;
}
ext3_update_dx_flag(dir);
ext3_mark_inode_dirty(handle, dir);
d_instantiate(dentry, inode);
-@@ -2026,10 +2032,10 @@
+@@ -2026,10 +2032,10 @@ static int ext3_rmdir (struct inode
retval = ext3_delete_entry(handle, dir, de, bh);
if (retval)
goto end_rmdir;
inode->i_version++;
inode->i_nlink = 0;
/* There's no need to set i_disksize: the fact that i_nlink is
-@@ -2039,7 +2045,7 @@
+@@ -2039,7 +2045,7 @@ static int ext3_rmdir (struct inode
ext3_orphan_add(handle, inode);
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
ext3_mark_inode_dirty(handle, inode);
ext3_update_dx_flag(dir);
ext3_mark_inode_dirty(handle, dir);
-@@ -2090,7 +2096,7 @@
+@@ -2090,7 +2096,7 @@ static int ext3_unlink(struct inode
dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
ext3_update_dx_flag(dir);
ext3_mark_inode_dirty(handle, dir);
if (!inode->i_nlink)
ext3_orphan_add(handle, inode);
inode->i_ctime = dir->i_ctime;
-@@ -2165,7 +2171,7 @@
+@@ -2165,7 +2171,7 @@ static int ext3_link (struct dentry
struct inode *inode = old_dentry->d_inode;
int err, retries = 0;
- if (inode->i_nlink >= EXT3_LINK_MAX)
-+ if (EXT3_DIR_LINK_MAXED(inode))
++ if (EXT3_DIR_LINK_MAX(inode))
return -EMLINK;
retry:
-@@ -2252,8 +2258,8 @@
+@@ -2252,8 +2258,8 @@ static int ext3_rename (struct inode
if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
goto end_rename;
retval = -EMLINK;
- if (!new_inode && new_dir!=old_dir &&
- new_dir->i_nlink >= EXT3_LINK_MAX)
+ if (!new_inode && new_dir != old_dir &&
-+ EXT3_DIR_LINK_MAXED(new_dir))
++ EXT3_DIR_LINK_MAX(new_dir))
goto end_rename;
}
if (!new_bh) {
-@@ -2310,7 +2316,7 @@
+@@ -2310,7 +2316,7 @@ static int ext3_rename (struct inode
}
if (new_inode) {
new_inode->i_ctime = CURRENT_TIME_SEC;
}
old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
-@@ -2321,11 +2327,13 @@
+@@ -2321,11 +2327,13 @@ static int ext3_rename (struct inode
PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
ext3_journal_dirty_metadata(handle, dir_bh);
/*
* Macro-instructions used to manage several block sizes
-@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 {
- */
-
- #ifdef CONFIG_EXT3_INDEX
-- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-- EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
- (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
--#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
--#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
-+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \
-+ (is_dx(dir) && (dir)->i_nlink == 1))
- #else
- #define is_dx(dir) 0
--#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
- #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
- #endif
-
with RPCs. In all cases it would slow down the write because
these RPCs are unnecessary.
+Severity : enhancement
+Bugzilla : 9340
+Description: allow number of MDS service threads to be changed at module load
+Details : It is now possible to change the number of MDS service threads
+ running. Adding "options mds mds_num_threads=N" will set the
+ number of threads for the next time Lustre is restarted (assuming
+ the "mds" module is also reloaded at that time). The default
+ number of threads will stay the same, 32 for most systems.
+
+Severity : major
+Frequency : rare
+Bugzilla : 10300
+Description: OST crash if filesystem is unformatted or corrupt
+Details : If an OST is started on a device that has never been formatted
+ or if the filesystem is corrupt and cannot even mount then the
+ error handling cleanup routines would dereference a NULL pointer.
+
+Severity : medium
+Frequency : rare
+Bugzilla : 10047
+Description: NULL pointer deref in llap_from_page.
+Details : get_cache_page_nowait can return a page with NULL (or otherwise
+ incorrect) mapping if the page was truncated/reclaimed while it was
+ searched for. Check for this condition and skip such pages when
+ doing readahead. Introduce extra check to llap_from_page() to
+ verify page->mapping->host is non-NULL (so page is not anonymous).
+
+Severity : minor
+Frequency : Sometimes when using sys_sendfile
+Bugzilla : 7020
+Description: "page not covered by a lock" warnings from ll_readpage
+Details : sendfile called ll_readpage without right page locks present.
+ Now we introduced ll_file_sendfile that does necessary locking
+ around call to generic_file_sendfile() much like we do in
+ ll_file_read().
+
+Severity : medium
+Frequency : with certain MDS communication failures at client mount time
+Bugzilla : 10268
+Description: NULL pointer deref after failed client mount
+Details : a client connection request may be delayed by the network layer
+ and not be sent until after the PTLRPC layer has timed out the
+ request. If the client fails the mount immediately it will try
+ to clean up before the network times out the request. Add a
+ reference from the request import to the obd device and delay
+ the cleanup until the network drops the request.
+
+Severity : medium
+Frequency : occasionally during client (re)connect
+Bugzilla : 9387
+Description: assertion failure during client (re)connect
+Details : processing a client connection request may be delayed by the
+ client or server longer than the client connect timeout. This
+ causes the client to resend the connection request. If the
+ original connection request is replied in this interval, the
+ client may trip an assertion failure in ptlrpc_connect_interpret()
+ which thought it would be the only running connect process.
+
+Severity : medium
+Frequency : only with obd_echo servers and clients that are rebooted
+Bugzilla : 10140
+Description: kernel BUG accessing uninitialized data structure
+Details : When running an obd_echo server it did not start the ping_evictor
+ thread, and when a client was evicted an uninitialized data
+ structure was accessed. Start the ping_evictor in the RPC
+ service startup instead of the OBD startup.
+
+Severity : enhancement
+Bugzilla : 10393 (patchless)
+Description: Remove dependency on various unexported kernel interfaces.
+Details : No longer need reparent_to_init, exit_mm, exit_files,
+ sock_getsockopt, filemap_populate, FMODE_EXEC, put_filp.
+
+Severity : minor
+Frequency : rare (only users of deprecated and unsupported LDAP config)
+Bugzilla : 9337
+Description: write_conf for zeroconf mount queried LDAP incorrectly for client
+Details : LDAP apparently contains 'lustreName' attributes instead of
+ 'name'. A simple remapping of the name is sufficient.
+
+Severity : major
+Frequency : rare (only with non-default dump_on_timeout debug enabled)
+Bugzilla : 10397
+Description: waiting_locks_callback trips kernel BUG if client is evicted
+Details : Running with the dump_on_timeout debug flag turned on makes
+ it possible that the waiting_locks_callback() can try to dump
+ the Lustre kernel debug logs from an interrupt handler. Defer
+ this log dumping to the expired_lock_main() thread.
+
+Severity : enhancement
+Bugzilla : 10420
+Description: Support NFS exporting on 2.6 kernels.
+Details : Implement non-rawops metadata methods for NFS server to use without
+ changing NFS server code.
+
+Severity : medium
+Frequency : very rare (synthetic metadata workload only)
+Bugzilla : 9974
+Description: two racing renames might cause an MDS thread to deadlock
+Details : Running the "racer" program may cause one MDS thread to rename
+ a file from being the source of a rename to being the target of
+ a rename at exactly the same time that another thread is doing
+ so, and the second thread has already enqueued these locks after
+ doing a lookup of the target and is trying to relock them in
+ order. Ensure that we don't try to re-lock the same resource.
+
+Severity : major
+Frequency : only very large systems with liblustre clients
+Bugzilla : 7304
+Description: slow eviction of liblustre clients with the "evict_by_nid" RPC
+Details : Use asynchronous set_info RPCs to send the "evict_by_nid" to
+ all OSTs in parallel. This allows the eviction of stale liblustre
+ clients to proceed much faster than if they were done in series,
+ and also offers similar improvements for other set_info RPCs.
+
+Severity : minor
+Bugzilla : 10265
+Description: excessive CPU usage during initial read phase on client
+Details : During the initial read phase on a client, it would aggressively
+ retry readahead on the file, consuming too much CPU and impacting
+ performance (since 1.4.5.8). Improve the readahead algorithm
+ to avoid this, and also improve some other common cases (read
+ of small files in particular, where "small" is files smaller than
+ /proc/fs/lustre/llite/*/max_read_ahead_whole_mb, 2MB by default).
+
+Severity : minor
+Bugzilla : 10450
+Description: MDS crash when receiving packet with unknown intent.
+Details : Do not LBUG in unknown intent case, just return -EFAULT
+
+
------------------------------------------------------------------------------
02-14-2006 Cluster File Systems, Inc. <info@clusterfs.com>
* version 1.4.6
* WIRE PROTOCOL CHANGE. This version of Lustre networking WILL NOT
- INTEROPERATE with older versions automatically. Please read the
+ INTEROPERATE with older versions automatically. Please read the
user documentation before upgrading any part of a live system.
* WARNING: Lustre networking configuration changes are required with
this release. See https://bugzilla.clusterfs.com/show_bug.cgi?id=10052
for details.
* bug fixes
- * Support for newer kernels: 2.6.9-22.0.2.EL (RHEL 4),
- 2.6.5-7.244 (SLES 9) - same as 1.4.5.2.
- 2.6.12.6 vanilla (kernel.org)
+ * Support for newer kernels:
+ 2.6.9-22.0.2.EL (RHEL 4),
+ 2.6.5-7.244 (SLES 9) - same as 1.4.5.2.
+ 2.6.12.6 vanilla (kernel.org)
Severity : enhancement
created for this new infrastructure.
Severity : enhancement
+Description: Introduced Access control lists
+Details : clients can set ACLs on files and directories in order to have
+ more fine-grained permissions than the standard Unix UGO+RWX.
+ The MDS must be started with the "-o acl" mount option.
+
+Severity : enhancement
+Description: Introduced filesystem quotas
+Details : Administrators may now establish per-user quotas on the
+ filesystem.
+
+Severity : enhancement
Bugzilla : 7982
Description: Configuration change for the XT3
The PTLLND is now used to run Lustre over Portals on the XT3
])
])
+AC_DEFUN([LC_STRUCT_FILE_OPS_UNLOCKED_IOCTL],
+[AC_MSG_CHECKING([if struct file_operations has an unlocked_ioctl field])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+],[
+ struct file_operations fops;
+ &fops.unlocked_ioctl;
+],[
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(HAVE_UNLOCKED_IOCTL, 1, [struct file_operations has an unlocked_ioctl field])
+],[
+ AC_MSG_RESULT([no])
+])
+])
+
+AC_DEFUN([LC_FILEMAP_POPULATE],
+[AC_MSG_CHECKING([for exported filemap_populate])
+LB_LINUX_TRY_COMPILE([
+ #include <asm/page.h>
+ #include <linux/mm.h>
+],[
+ filemap_populate(NULL, 0, 0, __pgprot(0), 0, 0);
+],[
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(HAVE_FILEMAP_POPULATE, 1, [Kernel exports filemap_populate])
+],[
+ AC_MSG_RESULT([no])
+])
+])
+
+AC_DEFUN([LC_D_ADD_UNIQUE],
+[AC_MSG_CHECKING([for d_add_unique])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/dcache.h>
+],[
+ d_add_unique(NULL, NULL);
+],[
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(HAVE_D_ADD_UNIQUE, 1, [Kernel has d_add_unique])
+],[
+ AC_MSG_RESULT([no])
+])
+])
#
# LC_PROG_LINUX
LC_FUNC_FILEMAP_FDATAWRITE
LC_STRUCT_STATFS
LC_FUNC_PAGE_MAPPED
+LC_STRUCT_FILE_OPS_UNLOCKED_IOCTL
+LC_FILEMAP_POPULATE
+LC_D_ADD_UNIQUE
])
#
#define KERNEL_VERSION(a,b,c) ((a)*100+(b)*10+c)
#define LINUX_VERSION_CODE KERNEL_VERSION(2,5,0)
+#ifndef page_private
+#define page_private(page) ((page)->private)
+#define set_page_private(page, v) ((page)->private = (v))
+#endif
+
+
static inline void inter_module_put(void *a)
{
return;
time_t ia_ctime;
unsigned int ia_attr_flags;
};
+#define ll_iattr_struct iattr
#define IT_OPEN 0x0001
#define IT_CREAT 0x0002
#include <libcfs/linux/portals_compat25.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14)
+struct ll_iattr_struct {
+ struct iattr iattr;
+ unsigned int ia_attr_flags;
+};
+#else
+#define ll_iattr_struct iattr
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
+#define UNLOCK_INODE_MUTEX(inode) do {mutex_unlock(&(inode)->i_mutex); } while(0)
+#define LOCK_INODE_MUTEX(inode) do {mutex_lock(&(inode)->i_mutex); } while(0)
+#define TRYLOCK_INODE_MUTEX(inode) mutex_trylock(&(inode)->i_mutex)
+#define d_child d_u.d_child
+#define d_rcu d_u.d_rcu
+#else
+#define UNLOCK_INODE_MUTEX(inode) do {up(&(inode)->i_sem); } while(0)
+#define LOCK_INODE_MUTEX(inode) do {down(&(inode)->i_sem); } while(0)
+#define TRYLOCK_INODE_MUTEX(inode) (!down_trylock(&(inode)->i_sem))
+#endif
+
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4)
#define NGROUPS_SMALL NGROUPS
#define NGROUPS_PER_BLOCK ((int)(EXEC_PAGESIZE / sizeof(gid_t)))
#endif
+#ifndef page_private
+#define page_private(page) ((page)->private)
+#define set_page_private(page, v) ((page)->private = (v))
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,15)
+#define gfp_t int
+#endif
+
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
#define lock_dentry(___dentry) spin_lock(&(___dentry)->d_lock)
#include <linux/writeback.h>
-static inline void lustre_daemonize_helper(void)
-{
- LASSERT(current->signal != NULL);
- current->signal->session = 1;
- if (current->group_leader)
- current->group_leader->signal->pgrp = 1;
- else
- CERROR("we aren't group leader\n");
- current->signal->tty = NULL;
-}
-
static inline int cleanup_group_info(void)
{
struct group_info *ginfo;
do { \
page_cache_get(page); \
SetPagePrivate(page); \
- page->private = (unsigned long)llap; \
+ set_page_private(page, (unsigned long)llap); \
} while (0)
#define __clear_page_ll_data(page) \
do { \
ClearPagePrivate(page); \
- page->private = 0; \
+ set_page_private(page, 0); \
page_cache_release(page); \
} while(0)
#define ILOOKUP(sb, ino, test, data) ilookup4(sb, ino, test, data);
#define DCACHE_DISCONNECTED DCACHE_NFSD_DISCONNECTED
#define ll_dev_t int
+#define old_encode_dev(dev) (dev)
/* 2.5 uses hlists for some things, like the d_hash. we'll treat them
* as 2.5 and let macros drop back.. */
static inline void __d_drop(struct dentry *dentry)
{
- list_del(&dentry->d_hash);
- INIT_LIST_HEAD(&dentry->d_hash);
-}
-
-static inline void lustre_daemonize_helper(void)
-{
- current->session = 1;
- current->pgrp = 1;
- current->tty = NULL;
+ list_del_init(&dentry->d_hash);
}
static inline int cleanup_group_info(void)
#define PDE(ii) ((ii)->u.generic_ip)
#endif
-#define __set_page_ll_data(page, llap) page->private = (unsigned long)llap
-#define __clear_page_ll_data(page) page->private = 0
+#define __set_page_ll_data(page, llap) set_page_private(page, (unsigned long)llap)
+#define __clear_page_ll_data(page) set_page_private(page, 0)
#define PageWriteback(page) 0
#define set_page_writeback(page) do {} while (0)
#define end_page_writeback(page) do {} while (0)
}
#endif /* !HAVE_PAGE_MAPPED */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16))
+static inline void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
+{
+ update_atime(dentry->d_inode);
+}
+#endif
+
static inline void file_accessed(struct file *file)
{
#ifdef O_NOATIME
if (file->f_flags & O_NOATIME)
return;
#endif
- update_atime(file->f_dentry->d_inode);
+ touch_atime(file->f_vfsmnt, file->f_dentry);
}
#endif /* end of 2.4 compat macros */
#define LL_CDEBUG_PAGE(mask, page, fmt, arg...) \
CDEBUG(mask, "page %p map %p index %lu flags %lx count %u priv %0lx: "\
fmt, page, page->mapping, page->index, (long)page->flags, \
- page_count(page), page->private, ## arg)
+ page_count(page), page_private(page), ## arg)
#else
#define LL_CDEBUG_PAGE(mask, page, fmt, arg...) \
CDEBUG(mask, "page %p index %lu priv %0lx: "\
- fmt, page, page->index, page->private, ## arg)
+ fmt, page, page->index, page_private(page), ## arg)
#endif
#endif
do { \
if (time_before(jiffies, start + 15 * HZ)) \
break; \
+ else if (time_before(jiffies, start + 30 * HZ)) \
+ CDEBUG(D_VFSTRACE,"slow %s %lus\n", msg,(jiffies-start)/HZ);\
else if (time_before(jiffies, start + timeout / 2 * HZ)) \
CWARN("slow %s %lus\n", msg, (jiffies - start) / HZ); \
else \
{
struct dentry *dchild;
- down(&dparent->d_inode->i_sem);
+ LOCK_INODE_MUTEX(dparent->d_inode);
dchild = lookup_one_len(fid_name, dparent, fid_namelen);
- up(&dparent->d_inode->i_sem);
+ UNLOCK_INODE_MUTEX(dparent->d_inode);
if (IS_ERR(dchild) || dchild->d_inode == NULL)
return dchild;
#define OBD_CONNECT_TRANSNO 0x800ULL /* replay is sending initial transno */
#define OBD_CONNECT_IBITS 0x1000ULL /* support for inodebits locks */
#define OBD_CONNECT_JOIN 0x2000ULL /* files can be concatenated */
+#define OBD_CONNECT_NODEVOH 0x8000ULL /* No open handle for special nodes */
/* also update obd_connect_names[] for lprocfs_rd_connect_flags() */
#define MDS_CONNECT_SUPPORTED (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \
OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \
- OBD_CONNECT_IBITS | OBD_CONNECT_JOIN)
+ OBD_CONNECT_IBITS | OBD_CONNECT_JOIN | \
+ OBD_CONNECT_NODEVOH)
#define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX)
#define FMODE_READ 00000001
#define FMODE_WRITE 00000002
#endif
-#ifndef FMODE_EXEC
-#define FMODE_EXEC 00000004
-#endif
+#define MDS_FMODE_EXEC 00000004
#define MDS_OPEN_CREAT 00000100
#define MDS_OPEN_EXCL 00000200
#define MDS_OPEN_TRUNC 00001000
#define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long)
#endif
+struct obd_statfs;
+
#define LL_IOC_GETFLAGS _IOR ('f', 151, long)
#define LL_IOC_SETFLAGS _IOW ('f', 152, long)
#define LL_IOC_CLRFLAGS _IOW ('f', 153, long)
#define LL_STATFS_LOV 2
#define IOC_MDC_TYPE 'i'
+#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *)
#define IOC_MDC_GETSTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_mds_md *)
#define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_mds_data *)
void ptlrpc_activate_import(struct obd_import *imp);
void ptlrpc_deactivate_import(struct obd_import *imp);
void ptlrpc_invalidate_import(struct obd_import *imp);
-void ptlrpc_fail_import(struct obd_import *imp, int generation);
+void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt);
#endif
struct obd_device *imp_obd;
cfs_waitq_t imp_recovery_waitq;
- __u64 imp_last_replay_transno;
+
atomic_t imp_inflight;
atomic_t imp_replay_inflight;
enum lustre_imp_state imp_state;
int imp_generation;
__u32 imp_conn_cnt;
- __u64 imp_max_transno;
+ int imp_last_generation_checked;
+ __u64 imp_last_replay_transno;
__u64 imp_peer_committed_transno;
- struct obd_uuid imp_target_uuid; /* XXX -> lustre_name */
+ __u64 imp_last_transno_checked;
struct lustre_handle imp_remote_handle;
cfs_time_t imp_next_ping; /* jiffies */
__u32 imp_connect_op;
struct obd_connect_data imp_connect_data;
__u64 imp_connect_flags_orig;
+
+ struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */
};
typedef void (*obd_import_callback)(struct obd_import *imp, void *closure,
*/
#define LDLM_NUM_THREADS min((int)(smp_num_cpus * smp_num_cpus * 8), 64)
-#define LDLM_NBUFS 64
+#define LDLM_NBUFS (64 * smp_num_cpus)
#define LDLM_BUFSIZE (8 * 1024)
#define LDLM_MAXREQSIZE (5 * 1024)
#define LDLM_MAXREPSIZE (1024)
-#define MDT_MAX_THREADS 32UL
-#define MDT_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \
- MDT_MAX_THREADS), 2UL)
+#define MDS_MAX_THREADS 512UL
+#define MDS_DEF_THREADS max(2UL, min_t(unsigned long, 32, \
+ num_physpages * smp_num_cpus >> (26 - PAGE_SHIFT)))
#define MDS_NBUFS (64 * smp_num_cpus)
#define MDS_BUFSIZE (8 * 1024)
/* Assume file name length = FNAME_MAX = 256 (true for ext3).
REQ_FLAGS_FMT"/%x/%x rc %d/%d\n" , ## args, req, req->rq_xid, \
req->rq_transno, \
req->rq_reqmsg ? req->rq_reqmsg->opc : -1, \
- req->rq_import ? (char *)req->rq_import->imp_target_uuid.uuid : "<?>", \
+ req->rq_import ? obd2cli_tgt(req->rq_import->imp_obd) : "<?>", \
req->rq_import ? \
(char *)req->rq_import->imp_connection->c_remote_uuid.uuid : "<?>", \
(req->rq_import && req->rq_import->imp_client) ? \
char *name, int id);
int ptlrpc_unregister_service(struct ptlrpc_service *service);
int liblustre_check_services (void *arg);
-void ptlrpc_daemonize(void);
+void ptlrpc_daemonize(char *name);
int ptlrpc_service_health_check(struct ptlrpc_service *);
/* ptlrpc/pinger.c */
int ptlrpc_pinger_add_import(struct obd_import *imp);
int ptlrpc_pinger_del_import(struct obd_import *imp);
+#ifdef __KERNEL__
+void ping_evictor_start(void);
+void ping_evictor_stop(void);
+#else
+#define ping_evictor_start() do {} while (0)
+#define ping_evictor_stop() do {} while (0)
+#endif
/* ptlrpc/ptlrpcd.c */
void ptlrpcd_wake(struct ptlrpc_request *req);
#define IOC_MDC_TYPE 'i'
#define IOC_MDC_MIN_NR 20
-#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *)
/* Moved to lustre_user.h
+#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *)
#define IOC_MDC_GETSTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_mds_md *) */
#define IOC_MDC_MAX_NR 50
enum async_flags {
ASYNC_READY = 0x1, /* ap_make_ready will not be called before this
page is added to an rpc */
- ASYNC_URGENT = 0x2,
+ ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */
ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called
to give the caller a chance to update
or cancel the size of the io */
#define OSC_MAX_DIRTY_MB_MAX 2048 /* totally arbitrary */
struct mdc_rpc_lock;
+struct obd_import;
struct client_obd {
- struct obd_import *cl_import;
struct semaphore cl_sem;
+ struct obd_uuid cl_target_uuid;
+ struct obd_import *cl_import; /* ptlrpc connection state */
int cl_conn_count;
/* max_mds_easize is purely a performance thing so we don't have to
* call obd_size_diskmd() all the time. */
/* used by quotacheck */
int cl_qchk_stat; /* quotacheck stat of the peer */
- struct ptlrpc_request_pool *cl_rq_pool; /* emergency pool of requests */
};
+#define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid)
#define CL_NOT_QUOTACHECKED 1 /* client->cl_qchk_stat init value */
#define OBD_LLOG_FL_SENDNOW 0x0001
+enum obd_cleanup_stage {
/* Special case hack for MDS LOVs */
-#define OBD_CLEANUP_EARLY 0
+ OBD_CLEANUP_EARLY,
/* Precleanup stage 1, we must make sure all exports (other than the
self-export) get destroyed. */
-#define OBD_CLEANUP_EXPORTS 1
+ OBD_CLEANUP_EXPORTS,
/* Precleanup stage 2, do other type-specific cleanup requiring the
self-export. */
-#define OBD_CLEANUP_SELF_EXP 2
+ OBD_CLEANUP_SELF_EXP,
/* FIXME we should eliminate the "precleanup" function and make them stages
of the "cleanup" function. */
-#define OBD_CLEANUP_OBD 3
+ OBD_CLEANUP_OBD,
+};
struct obd_ops {
struct module *o_owner;
void *karg, void *uarg);
int (*o_get_info)(struct obd_export *, __u32 keylen, void *key,
__u32 *vallen, void *val);
- int (*o_set_info)(struct obd_export *, __u32 keylen, void *key,
- __u32 vallen, void *val);
+ int (*o_set_info_async)(struct obd_export *, __u32 keylen, void *key,
+ __u32 vallen, void *val,
+ struct ptlrpc_request_set *set);
int (*o_attach)(struct obd_device *dev, obd_count len, void *data);
int (*o_detach)(struct obd_device *dev);
int (*o_setup) (struct obd_device *dev, obd_count len, void *data);
- int (*o_precleanup)(struct obd_device *dev, int cleanup_stage);
+ int (*o_precleanup)(struct obd_device *dev,
+ enum obd_cleanup_stage cleanup_stage);
int (*o_cleanup)(struct obd_device *dev);
int (*o_process_config)(struct obd_device *dev, obd_count len,
void *data);
struct oig_callback_context *occ, int rc);
void oig_release(struct obd_io_group *oig);
int oig_wait(struct obd_io_group *oig);
-/* ping evictor */
-#ifdef __KERNEL__
-void ping_evictor_start(void);
-void ping_evictor_stop(void);
-#else
-#define ping_evictor_start() do {} while (0)
-#define ping_evictor_stop() do {} while (0)
-#endif
-
char *obd_export_nid2str(struct obd_export *exp);
int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg);
int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg);
+struct obd_device *class_incref(struct obd_device *obd);
void class_decref(struct obd_device *obd);
/* Passed as data param to class_config_parse_llog */
struct obd_export *class_new_export(struct obd_device *obddev,
struct obd_uuid *cluuid);
void class_unlink_export(struct obd_export *exp);
-void class_update_export_timer(struct obd_export *exp, time_t extra_delay);
struct obd_import *class_import_get(struct obd_import *);
void class_import_put(struct obd_import *);
-struct obd_import *class_new_import(void);
+struct obd_import *class_new_import(struct obd_device *obd);
void class_destroy_import(struct obd_import *exp);
struct obd_type *class_get_type(char *name);
RETURN(rc);
}
-static inline int obd_set_info(struct obd_export *exp, obd_count keylen,
- void *key, obd_count vallen, void *val)
+static inline int obd_set_info_async(struct obd_export *exp, obd_count keylen,
+ void *key, obd_count vallen, void *val,
+ struct ptlrpc_request_set *set)
{
int rc;
ENTRY;
- EXP_CHECK_OP(exp, set_info);
- OBD_COUNTER_INCREMENT(exp->exp_obd, set_info);
+ EXP_CHECK_OP(exp, set_info_async);
+ OBD_COUNTER_INCREMENT(exp->exp_obd, set_info_async);
- rc = OBP(exp->exp_obd, set_info)(exp, keylen, key, vallen, val);
+ rc = OBP(exp->exp_obd, set_info_async)(exp, keylen, key, vallen, val,
+ set);
RETURN(rc);
}
RETURN(rc);
}
-static inline int obd_precleanup(struct obd_device *obd, int cleanup_stage)
+static inline int obd_precleanup(struct obd_device *obd,
+ enum obd_cleanup_stage cleanup_stage)
{
int rc;
ENTRY;
* <shaver> // XXX do not look into _superhack with remaining eye
* <shaver> // XXX if this were any uglier, I'd get my own show on MTV */
extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
-extern void (*ptlrpc_abort_inflight_superhack)(struct obd_import *imp);
/* sysctl.c */
extern void obd_sysctl_init (void);
extern unsigned int ldlm_timeout;
extern unsigned int obd_health_check_timeout;
extern char obd_lustre_upcall[128];
-extern unsigned int obd_sync_filter;
extern cfs_waitq_t obd_race_waitq;
#define OBD_FAIL_MDS 0x100
#define OBD_FAIL_PTLRPC_BULK_GET_NET 0x503
#define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504
#define OBD_FAIL_PTLRPC_DROP_RPC 0x505
+#define OBD_FAIL_PTLRPC_DELAY_SEND 0x506
#define OBD_FAIL_OBD_PING_NET 0x600
#define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601
#
# Processor type and features
#
-CONFIG_MK8=y
+# CONFIG_MK8 is not set
# CONFIG_IA32E is not set
-# CONFIG_GENERIC_CPU is not set
+CONFIG_GENERIC_CPU=y
CONFIG_X86_L1_CACHE_BYTES=64
CONFIG_X86_L1_CACHE_SHIFT=6
CONFIG_X86_TSC=y
#
# Processor type and features
#
-CONFIG_MK8=y
+# CONFIG_MK8 is not set
# CONFIG_IA32E is not set
-# CONFIG_GENERIC_CPU is not set
+CONFIG_GENERIC_CPU=y
CONFIG_X86_L1_CACHE_BYTES=64
CONFIG_X86_L1_CACHE_SHIFT=6
CONFIG_X86_TSC=y
CONFIG_SCSI_SPI_ATTRS=m
CONFIG_SCSI_FC_ATTRS=m
CONFIG_SCSI_ISCSI_ATTRS=m
+CONFIG_SAS_CLASS=m
+# CONFIG_SAS_DEBUG is not set
#
# SCSI low-level drivers
# CONFIG_AIC7XXX_DEBUG_ENABLE is not set
CONFIG_AIC7XXX_DEBUG_MASK=0
# CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set
+# CONFIG_SCSI_AIC94XX is not set
CONFIG_SCSI_AIC7XXX_OLD=m
CONFIG_SCSI_AIC79XX=m
CONFIG_AIC79XX_CMDS_PER_DEVICE=4
CONFIG_MEGARAID_NEWGEN=y
CONFIG_MEGARAID_MM=m
CONFIG_MEGARAID_MAILBOX=m
+CONFIG_MEGARAID_SAS=m
CONFIG_SCSI_SATA=y
CONFIG_SCSI_SATA_AHCI=m
CONFIG_SCSI_SATA_SVW=m
#
# Fusion MPT device support
#
-CONFIG_FUSION=m
+CONFIG_FUSION=y
+CONFIG_FUSION_SPI=m
+CONFIG_FUSION_FC=m
+CONFIG_FUSION_SAS=m
CONFIG_FUSION_MAX_SGE=40
CONFIG_FUSION_CTL=m
CONFIG_FUSION_LAN=m
+CONFIG_FUSION_OLD_MODULE_COMPAT=m
#
# IEEE 1394 (FireWire) support
# CONFIG_YELLOWFIN is not set
CONFIG_R8169=m
CONFIG_R8169_NAPI=y
+CONFIG_SKY2=m
CONFIG_SK98LIN=m
CONFIG_VIA_VELOCITY=m
CONFIG_TIGON3=m
+CONFIG_BNX2=m
#
# Ethernet (10000 Mbit)
# Active AVM cards
#
CONFIG_CAPI_AVM=y
+CONFIG_ISDN_DRV_AVMB1_B1PCI=m
+CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y
+CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m
+CONFIG_ISDN_DRV_AVMB1_AVM_CS=m
+CONFIG_ISDN_DRV_AVMB1_T1PCI=m
+CONFIG_ISDN_DRV_AVMB1_C4=m
#
# Active Eicon DIVA Server cards
#
CONFIG_SERIAL_CORE=y
CONFIG_SERIAL_CORE_CONSOLE=y
+# CONFIG_SERIAL_JSM is not set
CONFIG_UNIX98_PTYS=y
# CONFIG_LEGACY_PTYS is not set
CONFIG_CRASH=m
# CONFIG_USB_GADGET is not set
#
+# InfiniBand support
+#
+# CONFIG_INFINIBAND is not set
+
+#
+# EDAC - error detection and reporting (RAS)
+#
+# CONFIG_EDAC is not set
+
+#
# Firmware Drivers
#
CONFIG_EDD=m
+CONFIG_DELL_RBU=m
#
# File systems
CONFIG_LOCKD=m
CONFIG_LOCKD_V4=y
CONFIG_EXPORTFS=m
+CONFIG_NFS_COMMON=y
CONFIG_SUNRPC=m
CONFIG_SUNRPC_GSS=m
CONFIG_RPCSEC_GSS_KRB5=m
# CONFIG_MINIX_FS is not set
# CONFIG_ROMFS_FS is not set
CONFIG_QUOTA=y
+CONFIG_QFMT_V1=m
CONFIG_QFMT_V2=y
CONFIG_QUOTACTL=y
CONFIG_DNOTIFY=y
+++ /dev/null
-Index: linux-2.6.7/mm/filemap.c
-===================================================================
---- linux-2.6.7.orig/mm/filemap.c 2004-11-15 12:02:35.000000000 +0800
-+++ linux-2.6.7/mm/filemap.c 2004-11-15 12:04:38.000000000 +0800
-@@ -1409,6 +1409,7 @@
-
- return 0;
- }
-+EXPORT_SYMBOL_GPL(filemap_populate);
-
- static struct vm_operations_struct generic_file_vm_ops = {
- .nopage = filemap_nopage,
-Index: linux-2.6.7/include/linux/mm.h
-===================================================================
---- linux-2.6.7.orig/include/linux/mm.h 2004-11-15 12:02:43.000000000 +0800
-+++ linux-2.6.7/include/linux/mm.h 2004-11-15 12:04:23.000000000 +0800
-@@ -661,6 +661,8 @@
-
- /* generic vm_area_ops exported for stackable file systems */
- struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *);
-+int filemap_populate(struct vm_area_struct *, unsigned long, unsigned long,
-+ pgprot_t, unsigned long, int);
-
- /* mm/page-writeback.c */
- int write_one_page(struct page *page, int wait);
/*
* second extended-fs super-block data in memory
*/
-Index: linux-2.6.9-5.0.3.EL/net/core/sock.c
-===================================================================
---- linux-2.6.9-5.0.3.EL.orig/net/core/sock.c 2005-02-26 13:24:35.490810168 +0200
-+++ linux-2.6.9-5.0.3.EL/net/core/sock.c 2005-02-26 13:53:13.801587224 +0200
-@@ -602,6 +602,7 @@
- return -EFAULT;
- return 0;
- }
-+EXPORT_SYMBOL(sock_getsockopt);
-
- static kmem_cache_t *sk_cachep;
-
Index: linux-2.6.9-5.0.3.EL/fs/namespace.c
===================================================================
--- linux-2.6.9-5.0.3.EL.orig/fs/namespace.c 2005-02-26 13:47:31.282658016 +0200
void __set_special_pids(pid_t session, pid_t pgrp)
{
struct task_struct *curr = current;
-@@ -428,6 +430,8 @@
- __exit_files(tsk);
- }
-
-+EXPORT_SYMBOL(exit_files);
-+
- static inline void __put_fs_struct(struct fs_struct *fs)
- {
- /* No need to hold fs->lock if we are killing it */
-@@ -516,6 +516,7 @@
- {
- __exit_mm(tsk);
- }
-+EXPORT_SYMBOL(exit_mm);
-
- static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper)
- {
Index: linux-2.6.9-5.0.3.EL/fs/dcache.c
===================================================================
--- linux-2.6.9-5.0.3.EL.orig/fs/dcache.c 2005-02-26 13:49:04.365507272 +0200
void d_genocide(struct dentry *root)
{
-Index: linux-2.6.9-5.0.3.EL/mm/filemap.c
-===================================================================
---- linux-2.6.9-5.0.3.EL.orig/mm/filemap.c 2005-02-26 13:24:35.502808344 +0200
-+++ linux-2.6.9-5.0.3.EL/mm/filemap.c 2005-02-26 13:53:59.787596288 +0200
-@@ -1473,7 +1473,7 @@
- return NULL;
- }
-
--static int filemap_populate(struct vm_area_struct *vma,
-+int filemap_populate(struct vm_area_struct *vma,
- unsigned long addr,
- unsigned long len,
- pgprot_t prot,
-@@ -1520,6 +1520,7 @@
-
- return 0;
- }
-+EXPORT_SYMBOL_GPL(filemap_populate);
-
- struct vm_operations_struct generic_file_vm_ops = {
- .nopage = filemap_nopage,
-Index: linux-2.6.9-5.0.3.EL/fs/file_table.c
-===================================================================
---- linux-2.6.9-5.0.3.EL.orig/fs/file_table.c 2005-02-26 13:24:35.512806824 +0200
-+++ linux-2.6.9-5.0.3.EL/fs/file_table.c 2005-02-26 13:53:13.811585704 +0200
-@@ -196,6 +196,7 @@
- file_free(file);
- }
- }
-+EXPORT_SYMBOL(put_filp);
-
- void file_move(struct file *file, struct list_head *list)
- {
-Index: linux-2.6.9-5.0.3.EL/include/linux/mm.h
-===================================================================
---- linux-2.6.9-5.0.3.EL.orig/include/linux/mm.h 2005-02-26 13:49:05.823285656 +0200
-+++ linux-2.6.9-5.0.3.EL/include/linux/mm.h 2005-02-26 13:53:54.181448552 +0200
-@@ -721,6 +721,9 @@
-
- /* generic vm_area_ops exported for stackable file systems */
- struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *);
-+int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
-+ unsigned long len, pgprot_t prot, unsigned long pgoff,
-+ int nonblock);
-
- /* mm/page-writeback.c */
- int write_one_page(struct page *page, int wait);
void __set_special_pids(pid_t session, pid_t pgrp)
{
struct task_struct *curr = current;
-@@ -429,6 +431,8 @@
- __exit_files(tsk);
- }
-
-+EXPORT_SYMBOL(exit_files);
-+
- static inline void __put_fs_struct(struct fs_struct *fs)
- {
- /* No need to hold fs->lock if we are killing it */
#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
extern int vfs_readlink(struct dentry *, char __user *, int, const char *);
-Index: linux-2.6.12-rc6/net/core/sock.c
-===================================================================
---- linux-2.6.12-rc6.orig/net/core/sock.c 2005-06-06 17:22:29.000000000 +0200
-+++ linux-2.6.12-rc6/net/core/sock.c 2005-06-14 15:53:58.349304101 +0200
-@@ -613,6 +613,7 @@
- return -EFAULT;
- return 0;
- }
-+EXPORT_SYMBOL(sock_getsockopt);
-
- /**
- * sk_alloc - All socket objects are allocated here
Index: linux-2.6.12-rc6/fs/namespace.c
===================================================================
--- linux-2.6.12-rc6.orig/fs/namespace.c 2005-06-14 15:53:17.868835847 +0200
void __set_special_pids(pid_t session, pid_t pgrp)
{
struct task_struct *curr = current;
-@@ -432,6 +434,8 @@
- __exit_files(tsk);
- }
-
-+EXPORT_SYMBOL(exit_files);
-+
- static inline void __put_fs_struct(struct fs_struct *fs)
- {
- /* No need to hold fs->lock if we are killing it */
-@@ -515,6 +515,7 @@
- task_unlock(tsk);
- mmput(mm);
- }
-+EXPORT_SYMBOL(exit_mm);
-
- static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper)
- {
Index: linux-2.6.12-rc6/fs/dcache.c
===================================================================
--- linux-2.6.12-rc6.orig/fs/dcache.c 2005-06-14 15:53:19.812195198 +0200
void d_genocide(struct dentry *root)
{
-Index: linux-2.6.12-rc6/fs/file_table.c
-===================================================================
---- linux-2.6.12-rc6.orig/fs/file_table.c 2005-06-06 17:22:29.000000000 +0200
-+++ linux-2.6.12-rc6/fs/file_table.c 2005-06-14 15:53:58.396179101 +0200
-@@ -197,6 +197,7 @@
- file_free(file);
- }
- }
-+EXPORT_SYMBOL(put_filp);
-
- void file_move(struct file *file, struct list_head *list)
- {
+
+static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
+{
-+ struct ext3_extent_header *neh;
-+ neh = EXT_ROOT_HDR(tree);
-+ neh->eh_generation++;
++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++ (EXT_GENERATION(neh) + 1);
+}
+
+static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
===================================================================
--- linux-2.4.21-rhel.orig/include/linux/ext3_extents.h 2005-03-02 22:42:20.659360368 +0300
+++ linux-2.4.21-rhel/include/linux/ext3_extents.h 2005-03-04 02:34:52.000000000 +0300
-@@ -0,0 +1,263 @@
+@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ __u16 eh_entries; /* number of valid entries */
+ __u16 eh_max; /* capacity of store in entries */
+ __u16 eh_depth; /* has tree real underlaying blocks? */
-+ __u32 eh_generation; /* generation of the tree */
++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */
+};
+
+#define EXT3_EXT_MAGIC 0xf30a
+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
+#define EXT_MAX_INDEX(__hdr__) \
+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */
+
-+#define EXT_ROOT_HDR(tree) \
-+ ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+ ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth)
+
+
+#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
+
+static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
+{
-+ struct ext3_extent_header *neh;
-+ neh = EXT_ROOT_HDR(tree);
-+ neh->eh_generation++;
++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++ (EXT_GENERATION(neh) + 1);
+}
+
+static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
===================================================================
--- linux-2.4.21-suse2.orig/include/linux/ext3_extents.h 2003-01-30 13:24:37.000000000 +0300
+++ linux-2.4.21-suse2/include/linux/ext3_extents.h 2004-11-02 20:34:00.000000000 +0300
-@@ -0,0 +1,264 @@
+@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ __u16 eh_entries; /* number of valid entries */
+ __u16 eh_max; /* capacity of store in entries */
+ __u16 eh_depth; /* has tree real underlaying blocks? */
-+ __u32 eh_generation; /* generation of the tree */
++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */
+};
+
+#define EXT3_EXT_MAGIC 0xf30a
+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
+#define EXT_MAX_INDEX(__hdr__) \
+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */
+
-+#define EXT_ROOT_HDR(tree) \
-+ ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+ ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth)
+
+
+#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
+
+
+#endif /* _LINUX_EXT3_EXTENTS */
-+
Index: linux-2.4.21-suse2/include/linux/ext3_fs_i.h
===================================================================
--- linux-2.4.21-suse2.orig/include/linux/ext3_fs_i.h 2004-11-02 20:31:37.000000000 +0300
+
+static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
+{
-+ struct ext3_extent_header *neh;
-+ neh = EXT_ROOT_HDR(tree);
-+ neh->eh_generation++;
++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++ (EXT_GENERATION(neh) + 1);
+}
+
+static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
===================================================================
--- linux-2.4.24.orig/include/linux/ext3_extents.h 2003-01-30 13:24:37.000000000 +0300
+++ linux-2.4.24/include/linux/ext3_extents.h 2004-11-02 20:32:17.000000000 +0300
-@@ -0,0 +1,263 @@
+@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ __u16 eh_entries; /* number of valid entries */
+ __u16 eh_max; /* capacity of store in entries */
+ __u16 eh_depth; /* has tree real underlaying blocks? */
-+ __u32 eh_generation; /* generation of the tree */
++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */
+};
+
+#define EXT3_EXT_MAGIC 0xf30a
+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
+#define EXT_MAX_INDEX(__hdr__) \
+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */
+
-+#define EXT_ROOT_HDR(tree) \
-+ ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+ ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth)
+
+
+#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
+
+static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
+{
-+ struct ext3_extent_header *neh;
-+ neh = EXT_ROOT_HDR(tree);
-+ neh->eh_generation++;
++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++ (EXT_GENERATION(neh) + 1);
+}
+
+static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
===================================================================
--- linux-2.4.29.orig/include/linux/ext3_extents.h 2005-05-03 16:52:08.724069800 +0300
+++ linux-2.4.29/include/linux/ext3_extents.h 2005-05-03 16:52:08.819055360 +0300
-@@ -0,0 +1,263 @@
+@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ __u16 eh_entries; /* number of valid entries */
+ __u16 eh_max; /* capacity of store in entries */
+ __u16 eh_depth; /* has tree real underlaying blocks? */
-+ __u32 eh_generation; /* generation of the tree */
++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */
+};
+
+#define EXT3_EXT_MAGIC 0xf30a
+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
+#define EXT_MAX_INDEX(__hdr__) \
+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */
+
-+#define EXT_ROOT_HDR(tree) \
-+ ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+ ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth)
+
+
+#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
===================================================================
--- linux-2.6.12-rc6.orig/fs/ext3/extents.c 2005-06-14 16:31:25.756503133 +0200
+++ linux-2.6.12-rc6/fs/ext3/extents.c 2005-06-14 16:31:25.836581257 +0200
-@@ -0,0 +1,2347 @@
+@@ -0,0 +1,2353 @@
+/*
+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+
+static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
+{
-+ struct ext3_extent_header *neh;
-+ neh = EXT_ROOT_HDR(tree);
-+ neh->eh_generation++;
++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++ (EXT_GENERATION(neh) + 1);
+}
+
+static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
+
+ eh = EXT_ROOT_HDR(tree);
+ EXT_ASSERT(eh);
-+ if (ext3_ext_check_header(eh))
++ if (ext3_ext_check_header(eh)) {
++ /* don't free previously allocated path
++ * -- caller should take care */
++ path = NULL;
+ goto err;
++ }
+
+ i = depth = EXT_DEPTH(tree);
+ EXT_ASSERT(eh->eh_max);
+
+err:
+ printk(KERN_ERR "EXT3-fs: header is corrupted!\n");
-+ ext3_ext_drop_refs(path);
-+ kfree(path);
++ if (path) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ }
+ return ERR_PTR(-EIO);
+}
+
===================================================================
--- linux-2.6.12-rc6.orig/include/linux/ext3_extents.h 2005-06-14 16:31:25.780917195 +0200
+++ linux-2.6.12-rc6/include/linux/ext3_extents.h 2005-06-14 16:31:25.932284381 +0200
-@@ -0,0 +1,264 @@
+@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ __u16 eh_entries; /* number of valid entries */
+ __u16 eh_max; /* capacity of store in entries */
+ __u16 eh_depth; /* has tree real underlaying blocks? */
-+ __u32 eh_generation; /* generation of the tree */
++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */
+};
+
+#define EXT3_EXT_MAGIC 0xf30a
+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
+#define EXT_MAX_INDEX(__hdr__) \
+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */
+
-+#define EXT_ROOT_HDR(tree) \
-+ ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+ ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth)
+
+
+#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
===================================================================
--- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2005-02-17 22:07:57.023609040 +0300
+++ linux-2.6.5-sles9/fs/ext3/extents.c 2005-02-23 01:02:37.396435640 +0300
-@@ -0,0 +1,2349 @@
+@@ -0,0 +1,2355 @@
+/*
+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+
+static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
+{
-+ struct ext3_extent_header *neh;
-+ neh = EXT_ROOT_HDR(tree);
-+ neh->eh_generation++;
++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++ (EXT_GENERATION(neh) + 1);
+}
+
+static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
+
+ eh = EXT_ROOT_HDR(tree);
+ EXT_ASSERT(eh);
-+ if (ext3_ext_check_header(eh))
++ if (ext3_ext_check_header(eh)) {
++ /* don't free previously allocated path
++ * -- caller should take care */
++ path = NULL;
+ goto err;
++ }
+
+ i = depth = EXT_DEPTH(tree);
+ EXT_ASSERT(eh->eh_max);
+
+err:
+ printk(KERN_ERR "EXT3-fs: header is corrupted!\n");
-+ ext3_ext_drop_refs(path);
-+ kfree(path);
++ if (path) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ }
+ return ERR_PTR(-EIO);
+}
+
===================================================================
--- linux-2.6.5-sles9.orig/include/linux/ext3_extents.h 2005-02-17 22:07:57.023609040 +0300
+++ linux-2.6.5-sles9/include/linux/ext3_extents.h 2005-02-23 01:02:37.416432600 +0300
-@@ -0,0 +1,264 @@
+@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ __u16 eh_entries; /* number of valid entries */
+ __u16 eh_max; /* capacity of store in entries */
+ __u16 eh_depth; /* has tree real underlaying blocks? */
-+ __u32 eh_generation; /* generation of the tree */
++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */
+};
+
+#define EXT3_EXT_MAGIC 0xf30a
+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
+#define EXT_MAX_INDEX(__hdr__) \
+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */
+
-+#define EXT_ROOT_HDR(tree) \
-+ ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+ ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth)
+
+
+#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
===================================================================
--- linux-stage.orig/fs/ext3/extents.c 2005-02-25 15:33:48.890198160 +0200
+++ linux-stage/fs/ext3/extents.c 2005-02-25 15:33:48.917194056 +0200
-@@ -0,0 +1,2347 @@
+@@ -0,0 +1,2353 @@
+/*
+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+
+static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
+{
-+ struct ext3_extent_header *neh;
-+ neh = EXT_ROOT_HDR(tree);
-+ neh->eh_generation++;
++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++ (EXT_GENERATION(neh) + 1);
+}
+
+static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
+
+ eh = EXT_ROOT_HDR(tree);
+ EXT_ASSERT(eh);
-+ if (ext3_ext_check_header(eh))
++ if (ext3_ext_check_header(eh)) {
++ /* don't free previously allocated path
++ * -- caller should take care */
++ path = NULL;
+ goto err;
++ }
+
+ i = depth = EXT_DEPTH(tree);
+ EXT_ASSERT(eh->eh_max);
+
+err:
+ printk(KERN_ERR "EXT3-fs: header is corrupted!\n");
-+ ext3_ext_drop_refs(path);
-+ kfree(path);
++ if (path) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ }
+ return ERR_PTR(-EIO);
+}
+
===================================================================
--- linux-stage.orig/include/linux/ext3_extents.h 2005-02-25 15:33:48.891198008 +0200
+++ linux-stage/include/linux/ext3_extents.h 2005-02-25 15:33:48.944189952 +0200
-@@ -0,0 +1,264 @@
+@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ __u16 eh_entries; /* number of valid entries */
+ __u16 eh_max; /* capacity of store in entries */
+ __u16 eh_depth; /* has tree real underlaying blocks? */
-+ __u32 eh_generation; /* generation of the tree */
++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */
+};
+
+#define EXT3_EXT_MAGIC 0xf30a
+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
+#define EXT_MAX_INDEX(__hdr__) \
+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */
+
-+#define EXT_ROOT_HDR(tree) \
-+ ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+ ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_) \
-+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth)
+
+
+#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
+ int freed;
+
+ sb = inode->i_sb;
-+ if (!test_opt(sb, MBALLOC))
++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info)
+ ext3_free_blocks_old(handle, inode, block, count);
+ else {
+ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
+ int freed;
+
+ sb = inode->i_sb;
-+ if (!test_opt(sb, MBALLOC))
++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info)
+ ext3_free_blocks_sb(handle, sb, block, count, &freed);
+ else
+ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
+ int freed;
+
+ sb = inode->i_sb;
-+ if (!test_opt(sb, MBALLOC))
++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info)
+ ext3_free_blocks_sb(handle, sb, block, count, &freed);
+ else
+ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
int err;
- if (dir->i_nlink >= EXT3_LINK_MAX)
-+ if (EXT3_DIR_LINK_MAXED(dir))
++ if (EXT3_DIR_LINK_MAX(dir))
return -EMLINK;
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
return -EPERM;
- if (inode->i_nlink >= EXT3_LINK_MAX) {
-+ if (EXT3_DIR_LINK_MAXED(inode))
++ if (EXT3_DIR_LINK_MAX(inode))
return -EMLINK;
- }
- if (!new_inode && new_dir!=old_dir &&
- new_dir->i_nlink >= EXT3_LINK_MAX)
+ if (!new_inode && new_dir != old_dir &&
-+ EXT3_DIR_LINK_MAXED(new_dir))
++ EXT3_DIR_LINK_MAX(new_dir))
goto end_rename;
}
if (!new_bh) {
/*
* Macro-instructions used to manage several block sizes
-@@ -580,14 +580,15 @@
- */
-
- #ifdef CONFIG_EXT3_INDEX
-- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-- EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
- (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
--#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
--#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
-+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \
-+ (is_dx(dir) && (dir)->i_nlink == 1))
- #else
- #define is_dx(dir) 0
--#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
- #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
- #endif
-
int err;
- if (dir->i_nlink >= EXT3_LINK_MAX)
-+ if (EXT3_DIR_LINK_MAXED(dir))
++ if (EXT3_DIR_LINK_MAX(dir))
return -EMLINK;
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
return -EPERM;
- if (inode->i_nlink >= EXT3_LINK_MAX) {
-+ if (EXT3_DIR_LINK_MAXED(inode))
++ if (EXT3_DIR_LINK_MAX(inode))
return -EMLINK;
- }
- if (!new_inode && new_dir!=old_dir &&
- new_dir->i_nlink >= EXT3_LINK_MAX)
+ if (!new_inode && new_dir != old_dir &&
-+ EXT3_DIR_LINK_MAXED(new_dir))
++ EXT3_DIR_LINK_MAX(new_dir))
goto end_rename;
}
if (!new_bh) {
/*
* Macro-instructions used to manage several block sizes
-@@ -582,14 +582,15 @@
- */
-
- #ifdef CONFIG_EXT3_INDEX
-- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-- EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
- (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
--#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
--#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
-+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \
-+ (is_dx(dir) && (dir)->i_nlink == 1))
- #else
- #define is_dx(dir) 0
--#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
- #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
- #endif
-
int err;
- if (dir->i_nlink >= EXT3_LINK_MAX)
-+ if (EXT3_DIR_LINK_MAXED(dir))
++ if (EXT3_DIR_LINK_MAX(dir))
return -EMLINK;
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
return -EPERM;
- if (inode->i_nlink >= EXT3_LINK_MAX) {
-+ if (EXT3_DIR_LINK_MAXED(inode))
++ if (EXT3_DIR_LINK_MAX(inode))
return -EMLINK;
- }
- if (!new_inode && new_dir!=old_dir &&
- new_dir->i_nlink >= EXT3_LINK_MAX)
+ if (!new_inode && new_dir != old_dir &&
-+ EXT3_DIR_LINK_MAXED(new_dir))
++ EXT3_DIR_LINK_MAX(new_dir))
goto end_rename;
}
if (!new_bh) {
/*
* Macro-instructions used to manage several block sizes
-@@ -581,14 +581,15 @@
- */
-
- #ifdef CONFIG_EXT3_INDEX
-- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-- EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
- (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
--#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
--#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
-+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \
-+ (is_dx(dir) && (dir)->i_nlink == 1))
- #else
- #define is_dx(dir) 0
--#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
- #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
- #endif
-
int err;
- if (dir->i_nlink >= EXT3_LINK_MAX)
-+ if (EXT3_DIR_LINK_MAXED(dir))
++ if (EXT3_DIR_LINK_MAX(dir))
return -EMLINK;
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
int err;
- if (inode->i_nlink >= EXT3_LINK_MAX)
-+ if (EXT3_DIR_LINK_MAXED(inode))
++ if (EXT3_DIR_LINK_MAX(inode))
return -EMLINK;
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
- if (!new_inode && new_dir!=old_dir &&
- new_dir->i_nlink >= EXT3_LINK_MAX)
+ if (!new_inode && new_dir != old_dir &&
-+ EXT3_DIR_LINK_MAXED(new_dir))
++ EXT3_DIR_LINK_MAX(new_dir))
goto end_rename;
}
if (!new_bh) {
/*
* Macro-instructions used to manage several block sizes
-@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 {
- */
-
- #ifdef CONFIG_EXT3_INDEX
-- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-- EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
- (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
--#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
--#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
-+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \
-+ (is_dx(dir) && (dir)->i_nlink == 1))
- #else
- #define is_dx(dir) 0
--#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
- #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
- #endif
-
}
static int ext3_add_nondir(handle_t *handle,
-@@ -1706,7 +1712,7 @@
+@@ -1706,7 +1712,7 @@ static int ext3_add_nondir(handle_t
struct ext3_dir_entry_2 * de;
int err, retries = 0;
- if (dir->i_nlink >= EXT3_LINK_MAX)
-+ if (EXT3_DIR_LINK_MAXED(dir))
++ if (EXT3_DIR_LINK_MAX(dir))
return -EMLINK;
retry:
-@@ -1729,7 +1735,7 @@
+@@ -1729,7 +1735,7 @@ static int ext3_mkdir(struct inode
inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
dir_block = ext3_bread (handle, inode, 0, 1, &err);
if (!dir_block) {
ext3_mark_inode_dirty(handle, inode);
iput (inode);
goto out_stop;
-@@ -1761,7 +1767,7 @@
+@@ -1761,7 +1767,7 @@ static int ext3_mkdir(struct inode
iput (inode);
goto out_stop;
}
ext3_update_dx_flag(dir);
ext3_mark_inode_dirty(handle, dir);
d_instantiate(dentry, inode);
-@@ -2026,10 +2032,10 @@
+@@ -2026,10 +2032,10 @@ static int ext3_rmdir (struct inode
retval = ext3_delete_entry(handle, dir, de, bh);
if (retval)
goto end_rmdir;
inode->i_version++;
inode->i_nlink = 0;
/* There's no need to set i_disksize: the fact that i_nlink is
-@@ -2039,7 +2045,7 @@
+@@ -2039,7 +2045,7 @@ static int ext3_rmdir (struct inode
ext3_orphan_add(handle, inode);
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
ext3_mark_inode_dirty(handle, inode);
ext3_update_dx_flag(dir);
ext3_mark_inode_dirty(handle, dir);
-@@ -2090,7 +2096,7 @@
+@@ -2090,7 +2096,7 @@ static int ext3_unlink(struct inode
dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
ext3_update_dx_flag(dir);
ext3_mark_inode_dirty(handle, dir);
if (!inode->i_nlink)
ext3_orphan_add(handle, inode);
inode->i_ctime = dir->i_ctime;
-@@ -2165,7 +2171,7 @@
+@@ -2165,7 +2171,7 @@ static int ext3_link (struct dentry
struct inode *inode = old_dentry->d_inode;
int err, retries = 0;
- if (inode->i_nlink >= EXT3_LINK_MAX)
-+ if (EXT3_DIR_LINK_MAXED(inode))
++ if (EXT3_DIR_LINK_MAX(inode))
return -EMLINK;
retry:
-@@ -2252,8 +2258,8 @@
+@@ -2252,8 +2258,8 @@ static int ext3_rename (struct inode
if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
goto end_rename;
retval = -EMLINK;
- if (!new_inode && new_dir!=old_dir &&
- new_dir->i_nlink >= EXT3_LINK_MAX)
+ if (!new_inode && new_dir != old_dir &&
-+ EXT3_DIR_LINK_MAXED(new_dir))
++ EXT3_DIR_LINK_MAX(new_dir))
goto end_rename;
}
if (!new_bh) {
-@@ -2310,7 +2316,7 @@
+@@ -2310,7 +2316,7 @@ static int ext3_rename (struct inode
}
if (new_inode) {
new_inode->i_ctime = CURRENT_TIME_SEC;
}
old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
-@@ -2321,11 +2327,13 @@
+@@ -2321,11 +2327,13 @@ static int ext3_rename (struct inode
PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
ext3_journal_dirty_metadata(handle, dir_bh);
/*
* Macro-instructions used to manage several block sizes
-@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 {
- */
-
- #ifdef CONFIG_EXT3_INDEX
-- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-- EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
- (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
--#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
--#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
-+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \
-+ (is_dx(dir) && (dir)->i_nlink == 1))
- #else
- #define is_dx(dir) 0
--#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
- #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
- #endif
-
--- /dev/null
+Index: linux-2.6.9/fs/attr.c
+===================================================================
+--- linux-2.6.9/fs.orig/attr.c 2006-03-10 17:20:39.000000000 +0200
++++ linux-2.6.9/fs/attr.c 2006-04-09 01:21:44.000000000 +0300
+@@ -177,6 +177,9 @@
+ if (!attr->ia_valid)
+ return 0;
+
++ if (ia_valid & ATTR_SIZE)
++ down_write(&dentry->d_inode->i_alloc_sem);
++
+ if (inode->i_op && inode->i_op->setattr) {
+ audit_notify_watch(inode, MAY_WRITE);
+ error = security_inode_setattr(dentry, attr);
+@@ -194,6 +197,10 @@
+ error = inode_setattr(inode, attr);
+ }
+ }
++
++ if (ia_valid & ATTR_SIZE)
++ up_write(&dentry->d_inode->i_alloc_sem);
++
+ if (!error) {
+ unsigned long dn_mask = setattr_mask(ia_valid);
+ if (dn_mask)
+Index: linux-2.6.9/fs/open.c
+===================================================================
+--- linux-2.6.9/fs.orig/open.c 2006-04-09 01:18:08.000000000 +0300
++++ linux-2.6.9/fs/open.c 2006-04-09 01:22:29.000000000 +0300
+@@ -205,16 +205,16 @@
+ newattrs.ia_size = length;
+ newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
+ down(&dentry->d_inode->i_sem);
+- down_write(&dentry->d_inode->i_alloc_sem);
+ if (called_from_open)
+ newattrs.ia_valid |= ATTR_FROM_OPEN;
+ if (op->setattr_raw) {
+ newattrs.ia_valid |= ATTR_RAW;
+ newattrs.ia_ctime = CURRENT_TIME;
++ down_write(&dentry->d_inode->i_alloc_sem);
+ err = op->setattr_raw(dentry->d_inode, &newattrs);
++ up_write(&dentry->d_inode->i_alloc_sem);
+ } else
+ err = notify_change(dentry, &newattrs);
+- up_write(&dentry->d_inode->i_alloc_sem);
+ up(&dentry->d_inode->i_sem);
+ return err;
+ }
-Index: uml/fs/cifs/dir.c
+Index: linux-2.6.10/fs/cifs/dir.c
===================================================================
---- uml.orig/fs/cifs/dir.c 2004-12-24 16:35:01.000000000 -0500
-+++ uml/fs/cifs/dir.c 2005-04-13 23:43:03.681625568 -0400
-@@ -199,23 +199,23 @@
+--- linux-2.6.10.orig/fs/cifs/dir.c
++++ linux-2.6.10/fs/cifs/dir.c
+@@ -199,23 +199,23 @@ cifs_create(struct inode *inode, struct
}
if(nd) {
disposition = FILE_OPEN_IF;
else {
cFYI(1,("Create flag not set in create function"));
-Index: uml/fs/nfs/nfs4proc.c
+Index: linux-2.6.10/fs/nfs/nfs4proc.c
===================================================================
---- uml.orig/fs/nfs/nfs4proc.c 2004-12-24 16:35:23.000000000 -0500
-+++ uml/fs/nfs/nfs4proc.c 2005-04-13 23:43:26.409770503 -0400
-@@ -775,17 +775,17 @@
+--- linux-2.6.10.orig/fs/nfs/nfs4proc.c
++++ linux-2.6.10/fs/nfs/nfs4proc.c
+@@ -775,17 +775,17 @@ nfs4_atomic_open(struct inode *dir, stru
struct nfs4_state *state;
if (nd->flags & LOOKUP_CREATE) {
put_rpccred(cred);
if (IS_ERR(state))
return (struct inode *)state;
-Index: uml/fs/nfs/dir.c
+Index: linux-2.6.10/fs/nfs/dir.c
===================================================================
---- uml.orig/fs/nfs/dir.c 2005-04-13 23:42:21.792883770 -0400
-+++ uml/fs/nfs/dir.c 2005-04-13 23:43:03.685625066 -0400
-@@ -791,7 +791,7 @@
+--- linux-2.6.10.orig/fs/nfs/dir.c
++++ linux-2.6.10/fs/nfs/dir.c
+@@ -718,7 +718,7 @@ int nfs_is_exclusive_create(struct inode
+ return 0;
+ if (!nd || (nd->flags & LOOKUP_CONTINUE) || !(nd->flags & LOOKUP_CREATE))
+ return 0;
+- return (nd->intent.open.flags & O_EXCL) != 0;
++ return (nd->intent.it_flags & O_EXCL) != 0;
+ }
+
+ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
+@@ -791,7 +791,7 @@ static int is_atomic_open(struct inode *
if (nd->flags & LOOKUP_DIRECTORY)
return 0;
/* Are we trying to write to a read only partition? */
return 0;
return 1;
}
-@@ -812,7 +812,7 @@
+@@ -812,7 +812,7 @@ static struct dentry *nfs_atomic_lookup(
dentry->d_op = NFS_PROTO(dir)->dentry_ops;
/* Let vfs_create() deal with O_EXCL */
goto no_entry;
/* Open the file on the server */
-@@ -820,7 +820,7 @@
+@@ -820,7 +820,7 @@ static struct dentry *nfs_atomic_lookup(
/* Revalidate parent directory attribute cache */
nfs_revalidate_inode(NFS_SERVER(dir), dir);
nfs_begin_data_update(dir);
inode = nfs4_atomic_open(dir, dentry, nd);
nfs_end_data_update(dir);
-@@ -836,7 +836,7 @@
+@@ -836,7 +836,7 @@ static struct dentry *nfs_atomic_lookup(
break;
/* This turned out not to be a regular file */
case -ELOOP:
goto no_open;
/* case -EISDIR: */
/* case -EINVAL: */
-@@ -875,7 +875,7 @@
+@@ -875,7 +875,7 @@ static int nfs_open_revalidate(struct de
/* NFS only supports OPEN on regular files */
if (!S_ISREG(inode->i_mode))
goto no_open;
/* We cannot do exclusive creation on a positive dentry */
if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
goto no_open;
+@@ -1043,7 +1043,8 @@ static int nfs_create(struct inode *dir,
+ attr.ia_valid = ATTR_MODE;
+
+ if (nd && (nd->flags & LOOKUP_CREATE))
+- open_flags = nd->intent.open.flags;
++ open_flags = nd->intent.it_flags;
++
+
+ /*
+ * The 0 argument passed into the create function should one day
===================================================================
--- linux-2.6.5-7.108.orig/fs/nfs/dir.c 2004-09-15 19:26:43.012732408 +0300
+++ linux-2.6.5-7.108/fs/nfs/dir.c 2004-09-15 20:03:32.882781096 +0300
+@@ -709,7 +709,7 @@
+ return 0;
+ if (!nd || (nd->flags & LOOKUP_CONTINUE) || !(nd->flags & LOOKUP_CREATE))
+ return 0;
+- return (nd->intent.open.flags & O_EXCL) != 0;
++ return (nd->intent.it_flags & O_EXCL) != 0;
+ }
+
+ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
@@ -782,7 +782,7 @@
if (nd->flags & LOOKUP_DIRECTORY)
return 0;
if (openflags & O_CREAT) {
/* If this is a negative dentry, just drop it */
if (!inode)
+@@ -1026,7 +1026,7 @@
+ attr.ia_valid = ATTR_MODE;
+
+ if (nd && (nd->flags & LOOKUP_CREATE))
+- open_flags = nd->intent.open.flags;
++ open_flags = nd->intent.it_flags;
+
+ /*
+ * The 0 argument passed into the create function should one day
Index: linux-2.6.5-7.108/fs/nfs/nfs4proc.c
===================================================================
--- linux-2.6.5-7.108.orig/fs/nfs/nfs4proc.c 2004-04-04 06:37:39.000000000 +0300
===================================================================
--- linux-2.6.12-rc6.orig/fs/nfs/dir.c 2005-06-14 14:22:14.585699648 +0200
+++ linux-2.6.12-rc6/fs/nfs/dir.c 2005-06-14 14:26:39.884524523 +0200
+@@ -727,7 +727,7 @@
+ return 0;
+ if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0)
+ return 0;
+- return (nd->intent.open.flags & O_EXCL) != 0;
++ return (nd->intent.it_flags & O_EXCL) != 0;
+ }
+
+ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
@@ -783,7 +783,7 @@
if (nd->flags & LOOKUP_DIRECTORY)
return 0;
/* We cannot do exclusive creation on a positive dentry */
if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
goto no_open;
+@@ -1028,7 +1028,7 @@
+ attr.ia_valid = ATTR_MODE;
+
+ if (nd && (nd->flags & LOOKUP_CREATE))
+- open_flags = nd->intent.open.flags;
++ open_flags = nd->intent.it_flags;
+
+ lock_kernel();
+ nfs_begin_data_update(dir);
Index: linux-2.6.12-rc6/fs/nfs/nfs4proc.c
===================================================================
--- linux-2.6.12-rc6.orig/fs/nfs/nfs4proc.c 2005-06-06 17:22:29.000000000 +0200
--- /dev/null
+diff -Nur linux-2.6.12.6-orig/include/linux/skbuff.h linux-2.6.12.6/include/linux/skbuff.h
+--- linux-2.6.12.6-orig/include/linux/skbuff.h 2006-03-14 19:40:26.000000000 +0800
++++ linux-2.6.12.6/include/linux/skbuff.h 2006-03-16 17:04:51.000000000 +0800
+@@ -128,6 +128,30 @@
+ __u16 size;
+ };
+
++/* Support for callback when skb data has been released */
++typedef struct zccd /* Zero Copy Callback Descriptor */
++{ /* (embed as first member of custom struct) */
++ atomic_t zccd_count; /* reference count */
++ void (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
++} zccd_t;
++
++static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
++{
++ atomic_set (&d->zccd_count, 1);
++ d->zccd_destructor = callback;
++}
++
++static inline void zccd_get (zccd_t *d) /* take a reference */
++{
++ atomic_inc (&d->zccd_count);
++}
++
++static inline void zccd_put (zccd_t *d) /* release a reference */
++{
++ if (atomic_dec_and_test (&d->zccd_count))
++ (d->zccd_destructor)(d);
++}
++
+ /* This data is invariant across clones and lives at
+ * the end of the header data, ie. at skb->end.
+ */
+@@ -137,6 +161,13 @@
+ unsigned short tso_size;
+ unsigned short tso_segs;
+ struct sk_buff *frag_list;
++ zccd_t *zccd; /* zero copy descriptor */
++ zccd_t *zccd2; /* 2nd zero copy descriptor */
++ /* NB we expect zero-copy data to be at least 1 packet, so
++ * having 2 zccds means we don't unnecessarily split the packet
++ * where consecutive zero-copy sends abut.
++ */
++
+ skb_frag_t frags[MAX_SKB_FRAGS];
+ };
+
+diff -Nur linux-2.6.12.6-orig/include/net/tcp.h linux-2.6.12.6/include/net/tcp.h
+--- linux-2.6.12.6-orig/include/net/tcp.h 2005-06-18 03:48:29.000000000 +0800
++++ linux-2.6.12.6/include/net/tcp.h 2006-03-16 17:05:02.000000000 +0800
+@@ -783,6 +783,9 @@
+ extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk,
+ struct msghdr *msg, size_t size);
+ extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
++extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
++ int flags, zccd_t *zccd);
++
+
+ extern int tcp_ioctl(struct sock *sk,
+ int cmd,
+@@ -879,6 +882,9 @@
+ struct msghdr *msg,
+ size_t len, int nonblock,
+ int flags, int *addr_len);
++extern int tcp_recvpackets(struct sock *sk,
++ struct sk_buff_head *packets,
++ int len, int nonblock);
+
+ extern int tcp_listen_start(struct sock *sk);
+
+diff -Nur linux-2.6.12.6-orig/net/core/dev.c linux-2.6.12.6/net/core/dev.c
+--- linux-2.6.12.6-orig/net/core/dev.c 2005-06-18 03:48:29.000000000 +0800
++++ linux-2.6.12.6/net/core/dev.c 2006-03-16 17:04:36.000000000 +0800
+@@ -1176,6 +1176,9 @@
+ ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
+ ninfo->nr_frags = 0;
+ ninfo->frag_list = NULL;
++ ninfo->zccd = NULL; /* copied data => no user zero copy descriptor */
++ ninfo->zccd2 = NULL;
++
+
+ /* Offset between the two in bytes */
+ offset = data - skb->head;
+diff -Nur linux-2.6.12.6-orig/net/core/skbuff.c linux-2.6.12.6/net/core/skbuff.c
+--- linux-2.6.12.6-orig/net/core/skbuff.c 2005-06-18 03:48:29.000000000 +0800
++++ linux-2.6.12.6/net/core/skbuff.c 2006-03-16 17:04:41.000000000 +0800
+@@ -159,6 +159,9 @@
+ skb_shinfo(skb)->tso_size = 0;
+ skb_shinfo(skb)->tso_segs = 0;
+ skb_shinfo(skb)->frag_list = NULL;
++ skb_shinfo(skb)->zccd = NULL; /* skbuffs kick off with NO user zero copy descriptors */
++ skb_shinfo(skb)->zccd2 = NULL;
++
+ out:
+ return skb;
+ nodata:
+@@ -247,6 +250,10 @@
+ if (!skb->cloned ||
+ !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
+ &skb_shinfo(skb)->dataref)) {
++ if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */
++ zccd_put (skb_shinfo(skb)->zccd); /* release hold */
++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */
++ zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
+ if (skb_shinfo(skb)->nr_frags) {
+ int i;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+@@ -529,6 +536,14 @@
+ n->data_len = skb->data_len;
+ n->len = skb->len;
+
++ if (skb_shinfo(skb)->zccd != NULL) /* user zero copy descriptor? */
++ zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
++ skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
++
++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd user zero copy descriptor? */
++ zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
++ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
++
+ if (skb_shinfo(skb)->nr_frags) {
+ int i;
+
+@@ -571,6 +586,9 @@
+ u8 *data;
+ int size = nhead + (skb->end - skb->head) + ntail;
+ long off;
++ zccd_t *zccd = skb_shinfo(skb)->zccd; /* stash user zero copy descriptor */
++ zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */
++
+
+ if (skb_shared(skb))
+ BUG();
+@@ -592,6 +610,11 @@
+ if (skb_shinfo(skb)->frag_list)
+ skb_clone_fraglist(skb);
+
++ if (zccd != NULL) /* user zero copy descriptor? */
++ zccd_get (zccd); /* extra ref (pages are shared) */
++ if (zccd2 != NULL) /* 2nd user zero copy descriptor? */
++ zccd_get (zccd2); /* extra ref (pages are shared) */
++
+ skb_release_data(skb);
+
+ off = (data + nhead) - skb->head;
+@@ -606,6 +629,8 @@
+ skb->cloned = 0;
+ skb->nohdr = 0;
+ atomic_set(&skb_shinfo(skb)->dataref, 1);
++ skb_shinfo(skb)->zccd = zccd;
++ skb_shinfo(skb)->zccd2 = zccd2;
+ return 0;
+
+ nodata:
+diff -Nur linux-2.6.12.6-orig/net/ipv4/tcp.c linux-2.6.12.6/net/ipv4/tcp.c
+--- linux-2.6.12.6-orig/net/ipv4/tcp.c 2005-06-18 03:48:29.000000000 +0800
++++ linux-2.6.12.6/net/ipv4/tcp.c 2006-03-16 17:04:57.000000000 +0800
+@@ -630,8 +630,10 @@
+ }
+ }
+
++/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
+ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
+- size_t psize, int flags)
++ size_t psize, int flags, zccd_t *zccd)
++
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mss_now;
+@@ -678,6 +680,17 @@
+ copy = size;
+
+ i = skb_shinfo(skb)->nr_frags;
++
++ if (zccd != NULL && /* this is a zcc I/O */
++ skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */
++ skb_shinfo(skb)->zccd2 != NULL &&
++ skb_shinfo(skb)->zccd != zccd && /* not the same one */
++ skb_shinfo(skb)->zccd2 != zccd)
++ {
++ tcp_mark_push (tp, skb);
++ goto new_segment;
++ }
++
+ can_coalesce = skb_can_coalesce(skb, i, page, offset);
+ if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+ tcp_mark_push(tp, skb);
+@@ -694,6 +707,20 @@
+ skb_fill_page_desc(skb, i, page, offset, copy);
+ }
+
++ if (zccd != NULL && /* this is a zcc I/O */
++ skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */
++ skb_shinfo(skb)->zccd2 != zccd)
++ {
++ zccd_get (zccd); /* bump ref count */
++
++ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
++
++ if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
++ skb_shinfo(skb)->zccd = zccd;
++ else
++ skb_shinfo(skb)->zccd2 = zccd;
++ }
++
+ skb->len += copy;
+ skb->data_len += copy;
+ skb->truesize += copy;
+@@ -762,12 +789,37 @@
+
+ lock_sock(sk);
+ TCP_CHECK_TIMER(sk);
+- res = do_tcp_sendpages(sk, &page, offset, size, flags);
++ res = do_tcp_sendpages(sk, &page, offset, size, flags,NULL);
++ TCP_CHECK_TIMER(sk);
++ release_sock(sk);
++ return res;
++}
++
++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
++ int flags, zccd_t *zccd)
++{
++ ssize_t res;
++ struct sock *sk = sock->sk;
++
++#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
++
++ if (!(sk->sk_route_caps & NETIF_F_SG) || /* caller shouldn't waste her time */
++ !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
++ BUG ();
++
++#undef TCP_ZC_CSUM_FLAGS
++
++ lock_sock(sk);
++ TCP_CHECK_TIMER(sk);
++
++ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
++
+ TCP_CHECK_TIMER(sk);
+ release_sock(sk);
+ return res;
+ }
+
++
+ #define TCP_PAGE(sk) (sk->sk_sndmsg_page)
+ #define TCP_OFF(sk) (sk->sk_sndmsg_off)
+
+@@ -1530,6 +1582,202 @@
+ goto out;
+ }
+
++int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
++ int len, int nonblock)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ int copied;
++ long timeo;
++
++ BUG_TRAP (len > 0);
++ /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
++
++ lock_sock(sk);
++
++ TCP_CHECK_TIMER(sk);
++
++ copied = -ENOTCONN;
++ if (sk->sk_state == TCP_LISTEN)
++ goto out;
++
++ copied = 0;
++ timeo = sock_rcvtimeo(sk, nonblock);
++
++ do {
++ struct sk_buff * skb;
++ u32 offset;
++ unsigned long used;
++ int exhausted;
++ int eaten;
++
++ /* Are we at urgent data? Stop if we have read anything. */
++ if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
++ break;
++
++ /* We need to check signals first, to get correct SIGURG
++ * handling. FIXME: Need to check this doesn't impact 1003.1g
++ * and move it down to the bottom of the loop
++ */
++ if (signal_pending(current)) {
++ if (copied)
++ break;
++ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
++ break;
++ }
++
++ /* Next get a buffer. */
++
++ skb = skb_peek(&sk->sk_receive_queue);
++
++ if (skb == NULL) /* nothing ready */
++ {
++ if (copied) {
++ if (sk->sk_err ||
++ sk->sk_state == TCP_CLOSE ||
++ (sk->sk_shutdown & RCV_SHUTDOWN) ||
++ !timeo ||
++ (0))
++ break;
++ } else {
++ if (sock_flag(sk, SOCK_DONE))
++ break;
++
++ if (sk->sk_err) {
++ copied = sock_error(sk);
++ break;
++ }
++
++ if (sk->sk_shutdown & RCV_SHUTDOWN)
++ break;
++
++ if (sk->sk_state == TCP_CLOSE) {
++ if (!(sock_flag(sk, SOCK_DONE))) {
++ /* This occurs when user tries to read
++ * from never connected socket.
++ */
++ copied = -ENOTCONN;
++ break;
++ }
++ break;
++ }
++
++ if (!timeo) {
++ copied = -EAGAIN;
++ break;
++ }
++ }
++
++ cleanup_rbuf(sk, copied);
++ sk_wait_data(sk, &timeo);
++ continue;
++ }
++
++ BUG_TRAP (atomic_read (&skb->users) == 1);
++
++ exhausted = eaten = 0;
++
++ offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
++ if (skb->h.th->syn)
++ offset--;
++
++ used = skb->len - offset;
++
++ if (tp->urg_data) {
++ u32 urg_offset = tp->urg_seq - tp->copied_seq;
++ if (urg_offset < used) {
++ if (!urg_offset) { /* at urgent data */
++ if (!(sock_flag(sk, SOCK_URGINLINE))) {
++ tp->copied_seq++; /* discard the single byte of urgent data */
++ offset++;
++ used--;
++ }
++ } else /* truncate read */
++ used = urg_offset;
++ }
++ }
++
++ BUG_TRAP (used >= 0);
++ if (len < used)
++ used = len;
++
++ if (used == 0)
++ exhausted = 1;
++ else
++ {
++ if (skb_is_nonlinear (skb))
++ {
++ int rc = skb_linearize (skb, GFP_KERNEL);
++
++ printk ("tcp_recvpackets(): linearising: %d\n", rc);
++
++ if (rc)
++ {
++ if (!copied)
++ copied = rc;
++ break;
++ }
++ }
++
++ if ((offset + used) == skb->len) /* consuming the whole packet */
++ {
++ __skb_unlink (skb, &sk->sk_receive_queue);
++ dst_release (skb->dst);
++ skb_orphan (skb);
++ __skb_pull (skb, offset);
++ __skb_queue_tail (packets, skb);
++ exhausted = eaten = 1;
++ }
++ else /* consuming only part of the packet */
++ {
++ struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
++
++ if (skb2 == NULL)
++ {
++ if (!copied)
++ copied = -ENOMEM;
++ break;
++ }
++
++ dst_release (skb2->dst);
++ __skb_pull (skb2, offset);
++ __skb_trim (skb2, used);
++ __skb_queue_tail (packets, skb2);
++ }
++
++ tp->copied_seq += used;
++ copied += used;
++ len -= used;
++ }
++
++ if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
++ tp->urg_data = 0;
++ tcp_fast_path_check(sk, tp);
++ }
++
++ if (!exhausted)
++ continue;
++
++ if (skb->h.th->fin)
++ {
++ tp->copied_seq++;
++ if (!eaten)
++ sk_eat_skb (sk, skb);
++ break;
++ }
++
++ if (!eaten)
++ sk_eat_skb (sk, skb);
++
++ } while (len > 0);
++
++ out:
++ /* Clean up data we have read: This will do ACK frames. */
++ cleanup_rbuf(sk, copied);
++ TCP_CHECK_TIMER(sk);
++ release_sock(sk);
++ return copied;
++}
++
+ /*
+ * State processing on a close. This implements the state shift for
+ * sending our FIN frame. Note that we only send a FIN for some
+@@ -2380,6 +2628,8 @@
+ EXPORT_SYMBOL(tcp_recvmsg);
+ EXPORT_SYMBOL(tcp_sendmsg);
+ EXPORT_SYMBOL(tcp_sendpage);
++EXPORT_SYMBOL(tcp_sendpage_zccd);
++EXPORT_SYMBOL(tcp_recvpackets);
+ EXPORT_SYMBOL(tcp_setsockopt);
+ EXPORT_SYMBOL(tcp_shutdown);
+ EXPORT_SYMBOL(tcp_statistics);
--- /dev/null
+diff -Nur linux-2.6.5-7.244-orig/include/linux/skbuff.h linux-2.6.5-7.244/include/linux/skbuff.h
+--- linux-2.6.5-7.244-orig/include/linux/skbuff.h 2005-12-13 07:50:31.000000000 +0800
++++ linux-2.6.5-7.244/include/linux/skbuff.h 2006-03-13 16:31:30.000000000 +0800
+@@ -135,6 +135,30 @@
+ __u16 size;
+ };
+
++/* Support for callback when skb data has been released */
++typedef struct zccd /* Zero Copy Callback Descriptor */
++{ /* (embed as first member of custom struct) */
++ atomic_t zccd_count; /* reference count */
++ void (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
++} zccd_t;
++
++static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
++{
++ atomic_set (&d->zccd_count, 1);
++ d->zccd_destructor = callback;
++}
++
++static inline void zccd_get (zccd_t *d) /* take a reference */
++{
++ atomic_inc (&d->zccd_count);
++}
++
++static inline void zccd_put (zccd_t *d) /* release a reference */
++{
++ if (atomic_dec_and_test (&d->zccd_count))
++ (d->zccd_destructor)(d);
++}
++
+ /* This data is invariant across clones and lives at
+ * the end of the header data, ie. at skb->end.
+ */
+@@ -144,6 +168,12 @@
+ unsigned short tso_size;
+ unsigned short tso_segs;
+ struct sk_buff *frag_list;
++ zccd_t *zccd; /* zero copy descriptor */
++ zccd_t *zccd2; /* 2nd zero copy descriptor */
++ /* NB we expect zero-copy data to be at least 1 packet, so
++ * having 2 zccds means we don't unnecessarily split the packet
++ * where consecutive zero-copy sends abut.
++ */
+ skb_frag_t frags[MAX_SKB_FRAGS];
+ };
+
+diff -Nur linux-2.6.5-7.244-orig/include/net/sock.h linux-2.6.5-7.244/include/net/sock.h
+--- linux-2.6.5-7.244-orig/include/net/sock.h 2005-12-13 07:50:33.000000000 +0800
++++ linux-2.6.5-7.244/include/net/sock.h 2006-03-13 16:32:36.000000000 +0800
+@@ -413,6 +413,18 @@
+ (__skb)->next = NULL; \
+ } while(0)
+
++#define sk_wait_event(__sk, __timeo, __condition) \
++({ int rc; \
++ release_sock(__sk); \
++ rc = __condition; \
++ if (!rc) { \
++ *(__timeo) = schedule_timeout(*(__timeo)); \
++ rc = __condition; \
++ } \
++ lock_sock(__sk); \
++ rc; \
++})
++
+ /* IP protocol blocks we attach to sockets.
+ * socket layer -> transport layer interface
+ * transport -> network interface is defined by struct inet_proto
+@@ -1037,6 +1049,20 @@
+ sk->sk_stamp = *stamp;
+ }
+
++/**
++ * sk_eat_skb - Release a skb if it is no longer needed
++ * @sk - socket to eat this skb from
++ * @skb - socket buffer to eat
++ *
++ * This routine must be called with interrupts disabled or with the socket
++ * locked so that the sk_buff queue operation is ok.
++*/
++static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
++{
++ __skb_unlink(skb, &sk->sk_receive_queue);
++ __kfree_skb(skb);
++}
++
+ extern atomic_t netstamp_needed;
+ extern void sock_enable_timestamp(struct sock *sk);
+ extern void sock_disable_timestamp(struct sock *sk);
+diff -Nur linux-2.6.5-7.244-orig/include/net/tcp.h linux-2.6.5-7.244/include/net/tcp.h
+--- linux-2.6.5-7.244-orig/include/net/tcp.h 2005-12-13 07:50:21.000000000 +0800
++++ linux-2.6.5-7.244/include/net/tcp.h 2006-03-13 16:31:37.000000000 +0800
+@@ -764,6 +764,9 @@
+ extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk,
+ struct msghdr *msg, size_t size);
+ extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
++extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
++ int flags, zccd_t *zccd);
++
+
+ extern int tcp_ioctl(struct sock *sk,
+ int cmd,
+@@ -861,6 +864,10 @@
+ size_t len, int nonblock,
+ int flags, int *addr_len);
+
++extern int tcp_recvpackets(struct sock *sk,
++ struct sk_buff_head *packets,
++ int len, int nonblock);
++
+ extern int tcp_listen_start(struct sock *sk);
+
+ extern void tcp_parse_options(struct sk_buff *skb,
+diff -Nur linux-2.6.5-7.244-orig/net/core/dev.c linux-2.6.5-7.244/net/core/dev.c
+--- linux-2.6.5-7.244-orig/net/core/dev.c 2005-12-13 07:50:38.000000000 +0800
++++ linux-2.6.5-7.244/net/core/dev.c 2006-03-13 16:31:56.000000000 +0800
+@@ -1322,6 +1322,9 @@
+ ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
+ ninfo->nr_frags = 0;
+ ninfo->frag_list = NULL;
++ ninfo->zccd = NULL; /* copied data => no user zero copy descriptor */
++ ninfo->zccd2 = NULL;
++
+
+ /* Offset between the two in bytes */
+ offset = data - skb->head;
+diff -Nur linux-2.6.5-7.244-orig/net/core/skbuff.c linux-2.6.5-7.244/net/core/skbuff.c
+--- linux-2.6.5-7.244-orig/net/core/skbuff.c 2004-04-04 11:37:37.000000000 +0800
++++ linux-2.6.5-7.244/net/core/skbuff.c 2006-03-13 16:31:46.000000000 +0800
+@@ -152,6 +152,9 @@
+ skb_shinfo(skb)->tso_size = 0;
+ skb_shinfo(skb)->tso_segs = 0;
+ skb_shinfo(skb)->frag_list = NULL;
++ skb_shinfo(skb)->zccd = NULL; /* skbuffs kick off with NO user zero copy descriptors */
++ skb_shinfo(skb)->zccd2 = NULL;
++
+ out:
+ return skb;
+ nodata:
+@@ -186,6 +189,10 @@
+ {
+ if (!skb->cloned ||
+ atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
++ if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */
++ zccd_put (skb_shinfo(skb)->zccd); /* release hold */
++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */
++ zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
+ if (skb_shinfo(skb)->nr_frags) {
+ int i;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+@@ -449,6 +456,14 @@
+ n->data_len = skb->data_len;
+ n->len = skb->len;
+
++ if (skb_shinfo(skb)->zccd != NULL) /* user zero copy descriptor? */
++ zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
++ skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
++
++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd user zero copy descriptor? */
++ zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
++ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
++
+ if (skb_shinfo(skb)->nr_frags) {
+ int i;
+
+@@ -493,6 +508,9 @@
+ u8 *data;
+ int size = nhead + (skb->end - skb->head) + ntail;
+ long off;
++ zccd_t *zccd = skb_shinfo(skb)->zccd; /* stash user zero copy descriptor */
++ zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */
++
+
+ if (skb_shared(skb))
+ BUG();
+@@ -514,6 +532,11 @@
+ if (skb_shinfo(skb)->frag_list)
+ skb_clone_fraglist(skb);
+
++ if (zccd != NULL) /* user zero copy descriptor? */
++ zccd_get (zccd); /* extra ref (pages are shared) */
++ if (zccd2 != NULL) /* 2nd user zero copy descriptor? */
++ zccd_get (zccd2); /* extra ref (pages are shared) */
++
+ skb_release_data(skb);
+
+ off = (data + nhead) - skb->head;
+@@ -527,6 +550,9 @@
+ skb->nh.raw += off;
+ skb->cloned = 0;
+ atomic_set(&skb_shinfo(skb)->dataref, 1);
++ skb_shinfo(skb)->zccd = zccd;
++ skb_shinfo(skb)->zccd2 = zccd2;
++
+ return 0;
+
+ nodata:
+diff -Nur linux-2.6.5-7.244-orig/net/core/sock.c linux-2.6.5-7.244/net/core/sock.c
+--- linux-2.6.5-7.244-orig/net/core/sock.c 2005-12-13 07:50:10.000000000 +0800
++++ linux-2.6.5-7.244/net/core/sock.c 2006-03-13 16:32:44.000000000 +0800
+@@ -917,6 +917,31 @@
+ } while((skb = sk->sk_backlog.head) != NULL);
+ }
+
++/**
++ * sk_wait_data - wait for data to arrive at sk_receive_queue
++ * sk - sock to wait on
++ * timeo - for how long
++ *
++ * Now socket state including sk->sk_err is changed only under lock,
++ * hence we may omit checks after joining wait queue.
++ * We check receive queue before schedule() only as optimization;
++ * it is very likely that release_sock() added new data.
++ */
++int sk_wait_data(struct sock *sk, long *timeo)
++{
++ int rc;
++ DEFINE_WAIT(wait);
++
++ prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
++ set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
++ rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
++ clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
++ finish_wait(sk->sk_sleep, &wait);
++ return rc;
++}
++
++EXPORT_SYMBOL(sk_wait_data);
++
+ /*
+ * Set of default routines for initialising struct proto_ops when
+ * the protocol does not support a particular function. In certain
+diff -Nur linux-2.6.5-7.244-orig/net/ipv4/tcp.c linux-2.6.5-7.244/net/ipv4/tcp.c
+--- linux-2.6.5-7.244-orig/net/ipv4/tcp.c 2005-12-13 07:50:28.000000000 +0800
++++ linux-2.6.5-7.244/net/ipv4/tcp.c 2006-03-13 16:32:04.000000000 +0800
+@@ -799,7 +799,7 @@
+ }
+
+ ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
+- size_t psize, int flags);
++ size_t psize, int flags,zccd_t *zccd);
+
+ static inline int can_coalesce(struct sk_buff *skb, int i, struct page *page,
+ int off)
+@@ -881,8 +881,9 @@
+ return err;
+ }
+
++/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
+ ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
+- size_t psize, int flags)
++ size_t psize, int flags,zccd_t *zccd)
+ {
+ struct tcp_opt *tp = tcp_sk(sk);
+ int mss_now;
+@@ -929,6 +930,17 @@
+ copy = size;
+
+ i = skb_shinfo(skb)->nr_frags;
++
++ if (zccd != NULL && /* this is a zcc I/O */
++ skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */
++ skb_shinfo(skb)->zccd2 != NULL &&
++ skb_shinfo(skb)->zccd != zccd && /* not the same one */
++ skb_shinfo(skb)->zccd2 != zccd)
++ {
++ tcp_mark_push (tp, skb);
++ goto new_segment;
++ }
++
+ if (can_coalesce(skb, i, page, offset)) {
+ skb_shinfo(skb)->frags[i - 1].size += copy;
+ } else if (i < MAX_SKB_FRAGS) {
+@@ -939,6 +951,20 @@
+ goto new_segment;
+ }
+
++ if (zccd != NULL && /* this is a zcc I/O */
++ skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */
++ skb_shinfo(skb)->zccd2 != zccd)
++ {
++ zccd_get (zccd); /* bump ref count */
++
++ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
++
++ if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
++ skb_shinfo(skb)->zccd = zccd;
++ else
++ skb_shinfo(skb)->zccd2 = zccd;
++ }
++
+ skb->len += copy;
+ skb->data_len += copy;
+ skb->ip_summed = CHECKSUM_HW;
+@@ -1003,12 +1029,36 @@
+
+ lock_sock(sk);
+ TCP_CHECK_TIMER(sk);
+- res = do_tcp_sendpages(sk, &page, offset, size, flags);
++ res = do_tcp_sendpages(sk, &page, offset, size, flags,NULL);
+ TCP_CHECK_TIMER(sk);
+ release_sock(sk);
+ return res;
+ }
+
++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
++ int flags, zccd_t *zccd)
++{
++ ssize_t res;
++ struct sock *sk = sock->sk;
++
++#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
++
++ if (!(sk->sk_route_caps & NETIF_F_SG) || /* caller shouldn't waste her time */
++ !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
++ BUG ();
++
++#undef TCP_ZC_CSUM_FLAGS
++
++ lock_sock(sk);
++ TCP_CHECK_TIMER(sk);
++
++ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
++ TCP_CHECK_TIMER(sk);
++ release_sock(sk);
++ return res;
++}
++
++
+ #define TCP_PAGE(sk) (inet_sk(sk)->sndmsg_page)
+ #define TCP_OFF(sk) (inet_sk(sk)->sndmsg_off)
+
+@@ -1849,6 +1899,202 @@
+ err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
+ goto out;
+ }
++
++int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
++int len, int nonblock)
++{
++ struct tcp_opt *tp = tcp_sk(sk);
++ int copied;
++ long timeo;
++
++ BUG_TRAP (len > 0);
++ /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
++
++ lock_sock(sk);
++
++ TCP_CHECK_TIMER(sk);
++
++ copied = -ENOTCONN;
++ if (sk->sk_state == TCP_LISTEN)
++ goto out;
++
++ copied = 0;
++ timeo = sock_rcvtimeo(sk, nonblock);
++
++ do {
++ struct sk_buff * skb;
++ u32 offset;
++ unsigned long used;
++ int exhausted;
++ int eaten;
++
++ /* Are we at urgent data? Stop if we have read anything. */
++ if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
++ break;
++
++ /* We need to check signals first, to get correct SIGURG
++ * handling. FIXME: Need to check this doesn't impact 1003.1g
++ * and move it down to the bottom of the loop
++ */
++ if (signal_pending(current)) {
++ if (copied)
++ break;
++ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
++ break;
++ }
++
++ /* Next get a buffer. */
++
++ skb = skb_peek(&sk->sk_receive_queue);
++
++ if (skb == NULL) /* nothing ready */
++ {
++ if (copied) {
++ if (sk->sk_err ||
++ sk->sk_state == TCP_CLOSE ||
++ (sk->sk_shutdown & RCV_SHUTDOWN) ||
++ !timeo ||
++ (0))
++ break;
++ } else {
++ if (sock_flag(sk, SOCK_DONE))
++ break;
++
++ if (sk->sk_err) {
++ copied = sock_error(sk);
++ break;
++ }
++
++ if (sk->sk_shutdown & RCV_SHUTDOWN)
++ break;
++
++ if (sk->sk_state == TCP_CLOSE) {
++ if (!(sock_flag(sk, SOCK_DONE))) {
++ /* This occurs when user tries to read
++ * from never connected socket.
++ */
++ copied = -ENOTCONN;
++ break;
++ }
++ break;
++ }
++
++ if (!timeo) {
++ copied = -EAGAIN;
++ break;
++ }
++ }
++
++ cleanup_rbuf(sk, copied);
++ sk_wait_data(sk, &timeo);
++ continue;
++ }
++
++ BUG_TRAP (atomic_read (&skb->users) == 1);
++
++ exhausted = eaten = 0;
++
++ offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
++ if (skb->h.th->syn)
++ offset--;
++
++ used = skb->len - offset;
++
++ if (tp->urg_data) {
++ u32 urg_offset = tp->urg_seq - tp->copied_seq;
++ if (urg_offset < used) {
++ if (!urg_offset) { /* at urgent data */
++ if (!(sock_flag(sk, SOCK_URGINLINE))) {
++ tp->copied_seq++; /* discard the single byte of urgent data */
++ offset++;
++ used--;
++ }
++ } else /* truncate read */
++ used = urg_offset;
++ }
++ }
++
++ BUG_TRAP (used >= 0);
++ if (len < used)
++ used = len;
++
++ if (used == 0)
++ exhausted = 1;
++ else
++ {
++ if (skb_is_nonlinear (skb))
++ {
++ int rc = skb_linearize (skb, GFP_KERNEL);
++
++ printk ("tcp_recvpackets(): linearising: %d\n", rc);
++
++ if (rc)
++ {
++ if (!copied)
++ copied = rc;
++ break;
++ }
++ }
++
++ if ((offset + used) == skb->len) /* consuming the whole packet */
++ {
++ __skb_unlink (skb, &sk->sk_receive_queue);
++ dst_release (skb->dst);
++ skb_orphan (skb);
++ __skb_pull (skb, offset);
++ __skb_queue_tail (packets, skb);
++ exhausted = eaten = 1;
++ }
++ else /* consuming only part of the packet */
++ {
++ struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
++
++ if (skb2 == NULL)
++ {
++ if (!copied)
++ copied = -ENOMEM;
++ break;
++ }
++
++ dst_release (skb2->dst);
++ __skb_pull (skb2, offset);
++ __skb_trim (skb2, used);
++ __skb_queue_tail (packets, skb2);
++ }
++
++ tp->copied_seq += used;
++ copied += used;
++ len -= used;
++ }
++
++ if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
++ tp->urg_data = 0;
++ tcp_fast_path_check(sk, tp);
++ }
++
++ if (!exhausted)
++ continue;
++
++ if (skb->h.th->fin)
++ {
++ tp->copied_seq++;
++ if (!eaten)
++ sk_eat_skb (sk, skb);
++ break;
++ }
++
++ if (!eaten)
++ sk_eat_skb (sk, skb);
++
++ } while (len > 0);
++
++ out:
++ /* Clean up data we have read: This will do ACK frames. */
++ cleanup_rbuf(sk, copied);
++ TCP_CHECK_TIMER(sk);
++ release_sock(sk);
++ return copied;
++}
+
+ /*
+ * State processing on a close. This implements the state shift for
+@@ -2872,6 +3118,8 @@
+ EXPORT_SYMBOL(tcp_recvmsg);
+ EXPORT_SYMBOL(tcp_sendmsg);
+ EXPORT_SYMBOL(tcp_sendpage);
++EXPORT_SYMBOL(tcp_sendpage_zccd);
++EXPORT_SYMBOL(tcp_recvpackets);
+ EXPORT_SYMBOL(tcp_setsockopt);
+ EXPORT_SYMBOL(tcp_shutdown);
+ EXPORT_SYMBOL(tcp_sockets_allocated);
+
+#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
+
-+ if (!(sk->sk_route_caps & NETIF_F_SG) || /* caller shouldn't waste her time */
-+ !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
++ if (!(sk->sk_route_caps & NETIF_F_SG) || /* caller shouldn't waste */
++ !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))/* time on double mapping */
+ BUG ();
+
+#undef TCP_ZC_CSUM_FLAGS
--- /dev/null
+Index: linux-2.6.10/fs/exec.c
+===================================================================
+--- linux-2.6.10.orig/fs/exec.c
++++ linux-2.6.10/fs/exec.c
+@@ -124,9 +124,10 @@ asmlinkage long sys_uselib(const char __
+ struct file * file;
+ struct nameidata nd;
+ int error;
++ intent_init(&nd.intent, IT_OPEN);
+
+- nd.intent.open.flags = FMODE_READ;
+- error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
++ nd.intent.it_flags = FMODE_READ|FMODE_EXEC;
++ error = __user_walk_it(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
+ if (error)
+ goto out;
+
+@@ -138,7 +139,7 @@ asmlinkage long sys_uselib(const char __
+ if (error)
+ goto exit;
+
+- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent);
+ error = PTR_ERR(file);
+ if (IS_ERR(file))
+ goto out;
+@@ -485,8 +486,9 @@ struct file *open_exec(const char *name)
+ int err;
+ struct file *file;
+
+- nd.intent.open.flags = FMODE_READ;
+- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
++ intent_init(&nd.intent, IT_OPEN);
++ nd.intent.it_flags = FMODE_READ|FMODE_EXEC;
++ err = path_lookup(name, LOOKUP_FOLLOW, &nd);
+ file = ERR_PTR(err);
+
+ if (!err) {
+@@ -499,7 +501,7 @@ struct file *open_exec(const char *name)
+ err = -EACCES;
+ file = ERR_PTR(err);
+ if (!err) {
+- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent);
+ if (!IS_ERR(file)) {
+ err = deny_write_access(file);
+ if (err) {
+Index: linux-2.6.10/fs/inode.c
+===================================================================
+--- linux-2.6.10.orig/fs/inode.c
++++ linux-2.6.10/fs/inode.c
+@@ -233,6 +233,7 @@ void __iget(struct inode * inode)
+ inodes_stat.nr_unused--;
+ }
+
++EXPORT_SYMBOL(__iget);
+ /**
+ * clear_inode - clear an inode
+ * @inode: inode to clear
+Index: linux-2.6.10/fs/namei.c
+===================================================================
+--- linux-2.6.10.orig/fs/namei.c
++++ linux-2.6.10/fs/namei.c
+@@ -288,8 +288,19 @@ int deny_write_access(struct file * file
+ return 0;
+ }
+
++void intent_release(struct lookup_intent *it)
++{
++ if (!it)
++ return;
++ if (it->it_magic != INTENT_MAGIC)
++ return;
++ if (it->it_op_release)
++ it->it_op_release(it);
++}
++
+ void path_release(struct nameidata *nd)
+ {
++ intent_release(&nd->intent);
+ dput(nd->dentry);
+ mntput(nd->mnt);
+ }
+@@ -379,7 +390,10 @@ static struct dentry * real_lookup(struc
+ {
+ struct dentry * result;
+ struct inode *dir = parent->d_inode;
++ int counter = 0;
+
++again:
++ counter++;
+ down(&dir->i_sem);
+ /*
+ * First re-do the cached lookup just in case it was created
+@@ -418,7 +432,10 @@ static struct dentry * real_lookup(struc
+ if (result->d_op && result->d_op->d_revalidate) {
+ if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) {
+ dput(result);
+- result = ERR_PTR(-ENOENT);
++ if (counter > 10)
++ result = ERR_PTR(-ESTALE);
++ if (!IS_ERR(result))
++ goto again;
+ }
+ }
+ return result;
+@@ -448,7 +465,9 @@ walk_init_root(const char *name, struct
+ static inline int __vfs_follow_link(struct nameidata *nd, const char *link)
+ {
+ int res = 0;
++ struct lookup_intent it = nd->intent;
+ char *name;
++
+ if (IS_ERR(link))
+ goto fail;
+
+@@ -458,6 +477,9 @@ static inline int __vfs_follow_link(stru
+ /* weird __emul_prefix() stuff did it */
+ goto out;
+ }
++ intent_init(&nd->intent, it.it_op);
++ nd->intent.it_flags = it.it_flags;
++ nd->intent.it_create_mode = it.it_create_mode;
+ res = link_path_walk(link, nd);
+ out:
+ if (nd->depth || res || nd->last_type!=LAST_NORM)
+@@ -666,6 +688,33 @@ fail:
+ return PTR_ERR(dentry);
+ }
+
++static int revalidate_special(struct nameidata *nd)
++{
++ struct dentry *dentry = nd->dentry;
++ int err, counter = 0;
++
++ revalidate_again:
++ if (!dentry->d_op || !dentry->d_op->d_revalidate)
++ return 0;
++ if (!dentry->d_op->d_revalidate(dentry, nd)) {
++ struct dentry *new;
++ if ((err = permission(dentry->d_parent->d_inode, MAY_EXEC, nd)))
++ return err;
++ new = real_lookup(dentry->d_parent, &dentry->d_name, nd);
++ if (IS_ERR(new))
++ return PTR_ERR(new);
++ d_invalidate(dentry);
++ dput(dentry);
++ nd->dentry = dentry = new;
++ counter++;
++ if (counter < 10)
++ goto revalidate_again;
++ printk("excessive revalidate_it loops\n");
++ return -ESTALE;
++ }
++ return 0;
++}
++
+ /*
+ * Name resolution.
+ *
+@@ -767,8 +816,12 @@ int fastcall link_path_walk(const char *
+ goto out_dput;
+
+ if (inode->i_op->follow_link) {
++ int save_flags = nd->flags;
+ mntget(next.mnt);
++ nd->flags |= LOOKUP_LINK_NOTLAST;
+ err = do_follow_link(next.dentry, nd);
++ if (!(save_flags & LOOKUP_LINK_NOTLAST))
++ nd->flags &= ~LOOKUP_LINK_NOTLAST;
+ dput(next.dentry);
+ mntput(next.mnt);
+ if (err)
+@@ -807,14 +860,34 @@ last_component:
+ inode = nd->dentry->d_inode;
+ /* fallthrough */
+ case 1:
++ nd->flags |= LOOKUP_LAST;
++ err = revalidate_special(nd);
++ nd->flags &= ~LOOKUP_LAST;
++ if (!nd->dentry->d_inode)
++ err = -ENOENT;
++ if (err) {
++ path_release(nd);
++ goto return_err;
++ }
++ if (lookup_flags & LOOKUP_DIRECTORY) {
++ err = -ENOTDIR;
++ if (!nd->dentry->d_inode->i_op ||
++ !nd->dentry->d_inode->i_op->lookup){
++ path_release(nd);
++ goto return_err;
++ }
++ }
+ goto return_reval;
+ }
++
+ if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
+ err = nd->dentry->d_op->d_hash(nd->dentry, &this);
+ if (err < 0)
+ break;
+ }
++ nd->flags |= LOOKUP_LAST;
+ err = do_lookup(nd, &this, &next, atomic);
++ nd->flags &= ~LOOKUP_LAST;
+ if (err)
+ break;
+ follow_mount(&next.mnt, &next.dentry);
+@@ -1032,7 +1105,7 @@ struct dentry * lookup_hash(struct qstr
+ }
+
+ /* SMP-safe */
+-struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
++struct dentry * lookup_one_len_it(const char * name, struct dentry * base, int len, struct nameidata *nd)
+ {
+ unsigned long hash;
+ struct qstr this;
+@@ -1052,11 +1125,16 @@ struct dentry * lookup_one_len(const cha
+ }
+ this.hash = end_name_hash(hash);
+
+- return lookup_hash(&this, base);
++ return __lookup_hash(&this, base, nd);
+ access:
+ return ERR_PTR(-EACCES);
+ }
+
++struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
++{
++ return lookup_one_len_it(name, base, len, NULL);
++}
++
+ /*
+ * namei()
+ *
+@@ -1068,7 +1146,7 @@ access:
+ * that namei follows links, while lnamei does not.
+ * SMP-safe
+ */
+-int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
++int fastcall __user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd)
+ {
+ char *tmp = getname(name);
+ int err = PTR_ERR(tmp);
+@@ -1080,6 +1158,12 @@ int fastcall __user_walk(const char __us
+ return err;
+ }
+
++int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
++{
++ intent_init(&nd->intent, IT_LOOKUP);
++ return __user_walk_it(name, flags, nd);
++}
++
+ /*
+ * It's inline, so penalty for filesystems that don't use sticky bit is
+ * minimal.
+@@ -1363,8 +1447,8 @@ int open_namei(const char * pathname, in
+ acc_mode |= MAY_APPEND;
+
+ /* Fill in the open() intent data */
+- nd->intent.open.flags = flag;
+- nd->intent.open.create_mode = mode;
++ nd->intent.it_flags = flag;
++ nd->intent.it_create_mode = mode;
+
+ /*
+ * The simplest case - just a plain lookup.
+@@ -1379,6 +1463,7 @@ int open_namei(const char * pathname, in
+ /*
+ * Create - we need to know the parent.
+ */
++ nd->intent.it_op |= IT_CREAT;
+ error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd);
+ if (error)
+ return error;
+@@ -1395,7 +1480,9 @@ int open_namei(const char * pathname, in
+ dir = nd->dentry;
+ nd->flags &= ~LOOKUP_PARENT;
+ down(&dir->d_inode->i_sem);
++ nd->flags |= LOOKUP_LAST;
+ dentry = __lookup_hash(&nd->last, nd->dentry, nd);
++ nd->flags &= ~LOOKUP_LAST;
+
+ do_last:
+ error = PTR_ERR(dentry);
+@@ -1508,7 +1595,9 @@ do_link:
+ }
+ dir = nd->dentry;
+ down(&dir->d_inode->i_sem);
++ nd->flags |= LOOKUP_LAST;
+ dentry = __lookup_hash(&nd->last, nd->dentry, nd);
++ nd->flags &= ~LOOKUP_LAST;
+ putname(nd->last.name);
+ goto do_last;
+ }
+Index: linux-2.6.10/fs/namespace.c
+===================================================================
+--- linux-2.6.10.orig/fs/namespace.c
++++ linux-2.6.10/fs/namespace.c
+@@ -62,6 +62,7 @@ struct vfsmount *alloc_vfsmnt(const char
+ INIT_LIST_HEAD(&mnt->mnt_mounts);
+ INIT_LIST_HEAD(&mnt->mnt_list);
+ INIT_LIST_HEAD(&mnt->mnt_fslink);
++ INIT_LIST_HEAD(&mnt->mnt_lustre_list);
+ if (name) {
+ int size = strlen(name)+1;
+ char *newname = kmalloc(size, GFP_KERNEL);
+@@ -113,6 +114,7 @@ static inline int check_mnt(struct vfsmo
+
+ static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd)
+ {
++ memset(old_nd, 0, sizeof(*old_nd));
+ old_nd->dentry = mnt->mnt_mountpoint;
+ old_nd->mnt = mnt->mnt_parent;
+ mnt->mnt_parent = mnt;
+@@ -176,6 +178,9 @@ void __mntput(struct vfsmount *mnt)
+ {
+ struct super_block *sb = mnt->mnt_sb;
+ dput(mnt->mnt_root);
++ spin_lock(&dcache_lock);
++ list_del(&mnt->mnt_lustre_list);
++ spin_unlock(&dcache_lock);
+ free_vfsmnt(mnt);
+ deactivate_super(sb);
+ }
+@@ -402,6 +407,8 @@ static int do_umount(struct vfsmount *mn
+ */
+
+ lock_kernel();
++ if (sb->s_op->umount_lustre)
++ sb->s_op->umount_lustre(sb);
+ if( (flags&MNT_FORCE) && sb->s_op->umount_begin)
+ sb->s_op->umount_begin(sb);
+ unlock_kernel();
+@@ -627,6 +634,7 @@ static int do_loopback(struct nameidata
+ return err;
+ if (!old_name || !*old_name)
+ return -EINVAL;
++ intent_init(&old_nd.intent, IT_LOOKUP);
+ err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
+ if (err)
+ return err;
+@@ -701,6 +709,7 @@ static int do_move_mount(struct nameidat
+ return -EPERM;
+ if (!old_name || !*old_name)
+ return -EINVAL;
++ intent_init(&old_nd.intent, IT_LOOKUP);
+ err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
+ if (err)
+ return err;
+@@ -1012,6 +1021,7 @@ long do_mount(char * dev_name, char * di
+ int retval = 0;
+ int mnt_flags = 0;
+
++ intent_init(&nd.intent, IT_LOOKUP);
+ /* Discard magic */
+ if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
+ flags &= ~MS_MGC_MSK;
+Index: linux-2.6.10/fs/open.c
+===================================================================
+--- linux-2.6.10.orig/fs/open.c
++++ linux-2.6.10/fs/open.c
+@@ -216,12 +216,12 @@ static inline long do_sys_truncate(const
+ struct nameidata nd;
+ struct inode * inode;
+ int error;
+-
++ intent_init(&nd.intent, IT_GETATTR);
+ error = -EINVAL;
+ if (length < 0) /* sorry, but loff_t says... */
+ goto out;
+
+- error = user_path_walk(path, &nd);
++ error = user_path_walk_it(path, &nd);
+ if (error)
+ goto out;
+ inode = nd.dentry->d_inode;
+@@ -475,6 +475,7 @@ asmlinkage long sys_access(const char __
+ int old_fsuid, old_fsgid;
+ kernel_cap_t old_cap;
+ int res;
++ intent_init(&nd.intent, IT_GETATTR);
+
+ if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
+ return -EINVAL;
+@@ -499,13 +500,14 @@ asmlinkage long sys_access(const char __
+ else
+ current->cap_effective = current->cap_permitted;
+
+- res = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
++ res = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
+ if (!res) {
+ res = permission(nd.dentry->d_inode, mode, &nd);
+ /* SuS v2 requires we report a read only fs too */
+ if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode)
+ && !special_file(nd.dentry->d_inode->i_mode))
+ res = -EROFS;
++
+ path_release(&nd);
+ }
+
+@@ -520,8 +522,9 @@ asmlinkage long sys_chdir(const char __u
+ {
+ struct nameidata nd;
+ int error;
++ intent_init(&nd.intent, IT_GETATTR);
+
+- error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd);
++ error = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd);
+ if (error)
+ goto out;
+
+@@ -573,8 +576,9 @@ asmlinkage long sys_chroot(const char __
+ {
+ struct nameidata nd;
+ int error;
++ intent_init(&nd.intent, IT_GETATTR);
+
+- error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
++ error = __user_walk_it(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
+ if (error)
+ goto out;
+
+@@ -758,8 +762,10 @@ asmlinkage long sys_fchown(unsigned int
+ struct file *filp_open(const char * filename, int flags, int mode)
+ {
+ int namei_flags, error;
++ struct file * temp_filp;
+ struct nameidata nd;
+
++ intent_init(&nd.intent, IT_OPEN);
+ namei_flags = flags;
+ if ((namei_flags+1) & O_ACCMODE)
+ namei_flags++;
+@@ -767,15 +773,26 @@ struct file *filp_open(const char * file
+ namei_flags |= 2;
+
+ error = open_namei(filename, namei_flags, mode, &nd);
+- if (!error)
+- return dentry_open(nd.dentry, nd.mnt, flags);
+-
++ if (!error) {
++ temp_filp = dentry_open_it(nd.dentry, nd.mnt, flags, &nd.intent);
++ return temp_filp;
++ }
+ return ERR_PTR(error);
+ }
+
+-EXPORT_SYMBOL(filp_open);
+
+ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
++ {
++
++ struct lookup_intent it;
++ intent_init(&it, IT_LOOKUP);
++
++ return dentry_open_it(dentry, mnt, flags, &it);
++}
++
++EXPORT_SYMBOL(dentry_open);
++
++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, int flags,struct lookup_intent *it)
+ {
+ struct file * f;
+ struct inode *inode;
+@@ -787,6 +805,7 @@ struct file *dentry_open(struct dentry *
+ goto cleanup_dentry;
+ f->f_flags = flags;
+ f->f_mode = ((flags+1) & O_ACCMODE) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
++ f->f_it = it;
+ inode = dentry->d_inode;
+ if (f->f_mode & FMODE_WRITE) {
+ error = get_write_access(inode);
+@@ -805,6 +824,7 @@ struct file *dentry_open(struct dentry *
+ error = f->f_op->open(inode,f);
+ if (error)
+ goto cleanup_all;
++ intent_release(it);
+ }
+ f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+
+@@ -830,13 +850,12 @@ cleanup_all:
+ cleanup_file:
+ put_filp(f);
+ cleanup_dentry:
++ intent_release(it);
+ dput(dentry);
+ mntput(mnt);
+ return ERR_PTR(error);
+ }
+
+-EXPORT_SYMBOL(dentry_open);
+-
+ /*
+ * Find an empty file descriptor entry, and mark it busy.
+ */
+Index: linux-2.6.10/fs/stat.c
+===================================================================
+--- linux-2.6.10.orig/fs/stat.c
++++ linux-2.6.10/fs/stat.c
+@@ -38,7 +38,7 @@ void generic_fillattr(struct inode *inod
+
+ EXPORT_SYMBOL(generic_fillattr);
+
+-int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
++int vfs_getattr_it(struct vfsmount *mnt, struct dentry *dentry, struct lookup_intent *it, struct kstat *stat)
+ {
+ struct inode *inode = dentry->d_inode;
+ int retval;
+@@ -47,6 +47,8 @@ int vfs_getattr(struct vfsmount *mnt, st
+ if (retval)
+ return retval;
+
++ if (inode->i_op->getattr_it)
++ return inode->i_op->getattr_it(mnt, dentry, it, stat);
+ if (inode->i_op->getattr)
+ return inode->i_op->getattr(mnt, dentry, stat);
+
+@@ -63,14 +65,20 @@ int vfs_getattr(struct vfsmount *mnt, st
+
+ EXPORT_SYMBOL(vfs_getattr);
+
++int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
++{
++ return vfs_getattr_it(mnt, dentry, NULL, stat);
++}
++
+ int vfs_stat(char __user *name, struct kstat *stat)
+ {
+ struct nameidata nd;
+ int error;
++ intent_init(&nd.intent, IT_GETATTR);
+
+- error = user_path_walk(name, &nd);
++ error = user_path_walk_it(name, &nd);
+ if (!error) {
+- error = vfs_getattr(nd.mnt, nd.dentry, stat);
++ error = vfs_getattr_it(nd.mnt, nd.dentry, &nd.intent, stat);
+ path_release(&nd);
+ }
+ return error;
+@@ -82,10 +90,11 @@ int vfs_lstat(char __user *name, struct
+ {
+ struct nameidata nd;
+ int error;
++ intent_init(&nd.intent, IT_GETATTR);
+
+- error = user_path_walk_link(name, &nd);
++ error = user_path_walk_link_it(name, &nd);
+ if (!error) {
+- error = vfs_getattr(nd.mnt, nd.dentry, stat);
++ error = vfs_getattr_it(nd.mnt, nd.dentry, &nd.intent, stat);
+ path_release(&nd);
+ }
+ return error;
+@@ -97,9 +106,12 @@ int vfs_fstat(unsigned int fd, struct ks
+ {
+ struct file *f = fget(fd);
+ int error = -EBADF;
++ struct nameidata nd;
++ intent_init(&nd.intent, IT_GETATTR);
+
+ if (f) {
+- error = vfs_getattr(f->f_vfsmnt, f->f_dentry, stat);
++ error = vfs_getattr_it(f->f_vfsmnt, f->f_dentry, &nd.intent, stat);
++ intent_release(&nd.intent);
+ fput(f);
+ }
+ return error;
+Index: linux-2.6.10/include/linux/dcache.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/dcache.h
++++ linux-2.6.10/include/linux/dcache.h
+@@ -4,6 +4,7 @@
+ #ifdef __KERNEL__
+
+ #include <asm/atomic.h>
++#include <linux/string.h>
+ #include <linux/list.h>
+ #include <linux/spinlock.h>
+ #include <linux/cache.h>
+@@ -37,6 +38,8 @@ struct qstr {
+ const unsigned char *name;
+ };
+
++#include <linux/namei.h>
++
+ struct dentry_stat_t {
+ int nr_dentry;
+ int nr_unused;
+Index: linux-2.6.10/include/linux/fs.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/fs.h
++++ linux-2.6.10/include/linux/fs.h
+@@ -78,6 +78,7 @@ extern int dir_notify_enable;
+
+ #define FMODE_READ 1
+ #define FMODE_WRITE 2
++#define FMODE_EXEC 4
+
+ /* Internal kernel extensions */
+ #define FMODE_LSEEK 4
+@@ -262,6 +263,8 @@ typedef void (dio_iodone_t)(struct inode
+ #define ATTR_ATTR_FLAG 1024
+ #define ATTR_KILL_SUID 2048
+ #define ATTR_KILL_SGID 4096
++#define ATTR_RAW 8192 /* file system, not vfs will massage attrs */
++#define ATTR_FROM_OPEN 16384 /* called from open path, ie O_TRUNC */
+
+ /*
+ * This is the Inode Attributes structure, used for notify_change(). It
+@@ -465,6 +468,7 @@ struct inode {
+ struct block_device *i_bdev;
+ struct cdev *i_cdev;
+ int i_cindex;
++ void *i_filterdata;
+
+ __u32 i_generation;
+
+@@ -600,6 +604,7 @@ struct file {
+ spinlock_t f_ep_lock;
+ #endif /* #ifdef CONFIG_EPOLL */
+ struct address_space *f_mapping;
++ struct lookup_intent *f_it;
+ };
+ extern spinlock_t files_lock;
+ #define file_list_lock() spin_lock(&files_lock);
+@@ -950,7 +955,9 @@ struct inode_operations {
+ void (*truncate) (struct inode *);
+ int (*permission) (struct inode *, int, struct nameidata *);
+ int (*setattr) (struct dentry *, struct iattr *);
++ int (*setattr_raw) (struct inode *, struct iattr *);
+ int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
++ int (*getattr_it) (struct vfsmount *, struct dentry *, struct lookup_intent *, struct kstat *);
+ int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
+ ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
+ ssize_t (*listxattr) (struct dentry *, char *, size_t);
+@@ -990,6 +997,7 @@ struct super_operations {
+ int (*remount_fs) (struct super_block *, int *, char *);
+ void (*clear_inode) (struct inode *);
+ void (*umount_begin) (struct super_block *);
++ void (*umount_lustre) (struct super_block *);
+
+ int (*show_options)(struct seq_file *, struct vfsmount *);
+ };
+@@ -1181,6 +1189,7 @@ extern int unregister_filesystem(struct
+ extern struct vfsmount *kern_mount(struct file_system_type *);
+ extern int may_umount_tree(struct vfsmount *);
+ extern int may_umount(struct vfsmount *);
++struct vfsmount *do_kern_mount(const char *type, int flags, const char *name, void *data);
+ extern long do_mount(char *, char *, char *, unsigned long, void *);
+
+ extern int vfs_statfs(struct super_block *, struct kstatfs *);
+@@ -1245,6 +1254,7 @@ static inline int break_lease(struct ino
+ extern int do_truncate(struct dentry *, loff_t start);
+ extern struct file *filp_open(const char *, int, int);
+ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
++extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct lookup_intent *);
+ extern int filp_close(struct file *, fl_owner_t id);
+ extern char * getname(const char __user *);
+
+Index: linux-2.6.10/include/linux/mount.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/mount.h
++++ linux-2.6.10/include/linux/mount.h
+@@ -36,6 +36,8 @@ struct vfsmount
+ struct list_head mnt_list;
+ struct list_head mnt_fslink; /* link in fs-specific expiry list */
+ struct namespace *mnt_namespace; /* containing namespace */
++ struct list_head mnt_lustre_list; /* GNS mount list */
++ unsigned long mnt_last_used; /* for GNS auto-umount (jiffies) */
+ };
+
+ static inline struct vfsmount *mntget(struct vfsmount *mnt)
+Index: linux-2.6.10/include/linux/namei.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/namei.h
++++ linux-2.6.10/include/linux/namei.h
+@@ -2,14 +2,48 @@
+ #define _LINUX_NAMEI_H
+
+ #include <linux/linkage.h>
++#include <linux/string.h>
+
+ struct vfsmount;
++struct nameidata;
+
+-struct open_intent {
+- int flags;
+- int create_mode;
++/* intent opcodes */
++#define IT_OPEN (1)
++#define IT_CREAT (1<<1)
++#define IT_READDIR (1<<2)
++#define IT_GETATTR (1<<3)
++#define IT_LOOKUP (1<<4)
++#define IT_UNLINK (1<<5)
++#define IT_TRUNC (1<<6)
++#define IT_GETXATTR (1<<7)
++
++struct lustre_intent_data {
++ int it_disposition;
++ int it_status;
++ __u64 it_lock_handle;
++ void *it_data;
++ int it_lock_mode;
+ };
+
++#define INTENT_MAGIC 0x19620323
++struct lookup_intent {
++ int it_magic;
++ void (*it_op_release)(struct lookup_intent *);
++ int it_op;
++ int it_flags;
++ int it_create_mode;
++ union {
++ struct lustre_intent_data lustre;
++ } d;
++};
++
++static inline void intent_init(struct lookup_intent *it, int op)
++{
++ memset(it, 0, sizeof(*it));
++ it->it_magic = INTENT_MAGIC;
++ it->it_op = op;
++}
++
+ enum { MAX_NESTED_LINKS = 8 };
+
+ struct nameidata {
+@@ -21,10 +55,7 @@ struct nameidata {
+ unsigned depth;
+ char *saved_names[MAX_NESTED_LINKS + 1];
+
+- /* Intent data */
+- union {
+- struct open_intent open;
+- } intent;
++ struct lookup_intent intent;
+ };
+
+ /*
+@@ -47,6 +78,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LA
+ #define LOOKUP_NOALT 32
+ #define LOOKUP_ATOMIC 64
+ #define LOOKUP_REVAL 128
++#define LOOKUP_LAST (0x1000)
++#define LOOKUP_LINK_NOTLAST (0x2000)
+
+ /*
+ * Intent data
+@@ -56,6 +89,12 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LA
+ #define LOOKUP_ACCESS (0x0400)
+
+ extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *));
++extern int FASTCALL(__user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd));
++#define user_path_walk_it(name,nd) \
++ __user_walk_it(name, LOOKUP_FOLLOW, nd)
++#define user_path_walk_link_it(name,nd) \
++ __user_walk_it(name, 0, nd)
++extern void intent_release(struct lookup_intent *);
+ #define user_path_walk(name,nd) \
+ __user_walk(name, LOOKUP_FOLLOW, nd)
+ #define user_path_walk_link(name,nd) \
+@@ -68,7 +107,6 @@ extern void path_release_on_umount(struc
+
+ extern struct dentry * lookup_one_len(const char *, struct dentry *, int);
+ extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
+-
+ extern int follow_down(struct vfsmount **, struct dentry **);
+ extern int follow_up(struct vfsmount **, struct dentry **);
+
fput(f);
}
-Index: linux-2.6.5-12.1/fs/nfs/dir.c
-===================================================================
---- linux-2.6.5-12.1.orig/fs/nfs/dir.c 2004-05-10 12:21:53.000000000 -0400
-+++ linux-2.6.5-12.1/fs/nfs/dir.c 2004-06-03 18:31:28.000000000 -0400
-@@ -709,7 +709,7 @@
- return 0;
- if (!nd || (nd->flags & LOOKUP_CONTINUE) || !(nd->flags & LOOKUP_CREATE))
- return 0;
-- return (nd->intent.open.flags & O_EXCL) != 0;
-+ return (nd->intent.it_flags & O_EXCL) != 0;
- }
-
- static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
-@@ -1026,7 +1026,7 @@
- attr.ia_valid = ATTR_MODE;
-
- if (nd && (nd->flags & LOOKUP_CREATE))
-- open_flags = nd->intent.open.flags;
-+ open_flags = nd->intent.it_flags;
-
- /*
- * The 0 argument passed into the create function should one day
Index: linux-2.6.5-12.1/fs/inode.c
===================================================================
--- linux-2.6.5-12.1.orig/fs/inode.c 2004-05-10 12:21:56.000000000 -0400
fput(f);
}
return error;
-Index: linux-2.6.12.5/fs/nfs/dir.c
-===================================================================
---- linux-2.6.12.5.orig/fs/nfs/dir.c 2005-08-17 17:51:28.000000000 +0200
-+++ linux-2.6.12.5/fs/nfs/dir.c 2005-08-17 17:51:44.000000000 +0200
-@@ -727,7 +727,7 @@
- return 0;
- if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0)
- return 0;
-- return (nd->intent.open.flags & O_EXCL) != 0;
-+ return (nd->intent.it_flags & O_EXCL) != 0;
- }
-
- static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
-@@ -1028,7 +1028,7 @@
- attr.ia_valid = ATTR_MODE;
-
- if (nd && (nd->flags & LOOKUP_CREATE))
-- open_flags = nd->intent.open.flags;
-+ open_flags = nd->intent.it_flags;
-
- lock_kernel();
- nfs_begin_data_update(dir);
Index: linux-2.6.12.5/fs/inode.c
===================================================================
--- linux-2.6.12.5.orig/fs/inode.c 2005-08-17 17:51:28.000000000 +0200
uml-2.6.10-fc3.patch
lustre_version.patch
fc3_to_rhel4_updates.patch
-vfs_intent-2.6-rhel4.patch
+vfs_intent-2.6-fc3.patch
vfs_nointent-2.6-rhel4.patch
vfs_races-2.6-fc3.patch
ext3-wantedi-misc-2.6-suse.patch
compile-fixes-2.6.9-rhel4-22.patch
vm-tunables-rhel4.patch
tcp-zero-copy-2.6.9-rhel4.patch
+iallocsem_consistency.patch
qsnet-suse-2.6.patch
fsprivate-2.6.patch
dcache-qstr-api-fix-2.6-suse.patch
+iallocsem_consistency.patch
+tcp-zero-copy-2.6.5-7.244.patch
md_path_lookup-2.6-suse.patch
ext3-super-ntohl.patch
export-show_task-2.6-vanilla.patch
-export-filemap_populate.patch
sd_iostats-2.6-rhel4.patch
fsprivate-2.6.patch
export_symbol_numa.patch
+tcp-zero-copy-2.6.12.6.patch
int namelen = strlen(name);
/* remove the stale test quotafile */
- down(&parent_inode->i_sem);
+ LOCK_INODE_MUTEX(parent_inode);
de = lookup_one_len(name, tgt->obd_lvfs_ctxt.pwd, namelen);
if (!IS_ERR(de) && de->d_inode)
vfs_unlink(parent_inode, de);
if (!IS_ERR(de))
dput(de);
- up(&parent_inode->i_sem);
+ UNLOCK_INODE_MUTEX(parent_inode);
/* create quota file */
fp = filp_open(name, O_CREAT | O_EXCL, 0644);
filp_close(lqi->qi_files[i], 0);
/* unlink quota file */
- down(&parent_inode->i_sem);
+ LOCK_INODE_MUTEX(parent_inode);
de = lookup_one_len(name, tgt->obd_lvfs_ctxt.pwd, namelen);
if (IS_ERR(de) || de->d_inode == NULL) {
dput:
if (!IS_ERR(de))
dput(de);
- up(&parent_inode->i_sem);
+ UNLOCK_INODE_MUTEX(parent_inode);
}
pop_ctxt(saved, &tgt->obd_lvfs_ctxt, NULL);
ptlrpc_init_client(rq_portal, rp_portal, name,
&obddev->obd_ldlm_client);
- imp = class_new_import();
+ imp = class_new_import(obddev);
if (imp == NULL)
GOTO(err_ldlm, rc = -ENOENT);
imp->imp_client = &obddev->obd_ldlm_client;
- imp->imp_obd = obddev;
imp->imp_connect_op = connect_op;
- imp->imp_generation = 0;
imp->imp_initial_recov = 1;
CFS_INIT_LIST_HEAD(&imp->imp_pinger_chain);
- memcpy(imp->imp_target_uuid.uuid, lustre_cfg_buf(lcfg, 1),
+ memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1),
LUSTRE_CFG_BUFLEN(lcfg, 1));
class_import_put(imp);
if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) {
CDEBUG(D_HA, "marking %s %s->%s as inactive\n",
name, obddev->obd_name,
- imp->imp_target_uuid.uuid);
+ cli->cl_target_uuid.uuid);
imp->imp_invalid = 1;
}
}
int client_obd_cleanup(struct obd_device *obddev)
{
- struct client_obd *cli = &obddev->u.cli;
-
ENTRY;
- if (!cli->cl_import)
- RETURN(-EINVAL);
- class_destroy_import(cli->cl_import);
- cli->cl_import = NULL;
- client_obd_list_lock_done(&cli->cl_loi_list_lock);
-
ldlm_put_ref(obddev->obd_force);
RETURN(0);
}
/* Yeah, obd_no_recov also (mainly) means "forced shutdown". */
- if (obd->obd_no_recov)
- ptlrpc_invalidate_import(imp);
- else
+ if (!obd->obd_no_recov)
rc = ptlrpc_disconnect_import(imp);
+ ptlrpc_invalidate_import(imp);
+ imp->imp_deactive = 1;
+ ptlrpc_free_rq_pool(imp->imp_rq_pool);
+ class_destroy_import(imp);
+ cli->cl_import = NULL;
+
EXIT;
out_no_disconnect:
err = class_disconnect(exp);
if (export->exp_imp_reverse != NULL)
class_destroy_import(export->exp_imp_reverse);
- revimp = export->exp_imp_reverse = class_new_import();
+ revimp = export->exp_imp_reverse = class_new_import(target);
revimp->imp_connection = ptlrpc_connection_addref(export->exp_connection);
revimp->imp_client = &export->exp_obd->obd_ldlm_client;
revimp->imp_remote_handle = conn;
- revimp->imp_obd = target;
revimp->imp_dlm_fake = 1;
revimp->imp_state = LUSTRE_IMP_FULL;
class_import_put(revimp);
struct l_wait_info lwi = { 0 };
ENTRY;
- lock_kernel();
cfs_daemonize("ldlm_elt");
- cfs_block_allsigs();
-
- unlock_kernel();
expired_lock_thread.elt_state = ELT_READY;
cfs_waitq_signal(&expired_lock_thread.elt_waitq);
{
struct ldlm_lock *lock, *last = NULL;
- if (obd_dump_on_timeout)
- libcfs_debug_dumplog();
-
spin_lock_bh(&waiting_locks_spinlock);
while (!list_empty(&waiting_locks_list)) {
lock = list_entry(waiting_locks_list.next, struct ldlm_lock,
CFS_INIT_LIST_HEAD(&waiting_locks_list); /* HACK */
expired_lock_thread.elt_dump = __LINE__;
- spin_unlock_bh(&waiting_locks_spinlock);
/* LBUG(); */
CEMERG("would be an LBUG, but isn't (bug 5653)\n");
list_del(&lock->l_pending_chain);
list_add(&lock->l_pending_chain,
&expired_lock_thread.elt_expired_locks);
+ }
+
+ if (!list_empty(&expired_lock_thread.elt_expired_locks)) {
+ if (obd_dump_on_timeout)
+ expired_lock_thread.elt_dump = __LINE__;
cfs_waitq_signal(&expired_lock_thread.elt_waitq);
}
LASSERT(lock != NULL);
do_gettimeofday(&granted_time);
- total_enqueue_wait = cfs_timeval_sub(&granted_time,&lock->l_enqueued_time, NULL);
+ total_enqueue_wait = cfs_timeval_sub(&granted_time,
+ &lock->l_enqueued_time, NULL);
if (total_enqueue_wait / 1000000 > obd_timeout)
LDLM_ERROR(lock, "enqueue wait took %luus from %lu",
struct ldlm_bl_pool *blp = bltd->bltd_blp;
ENTRY;
- /* XXX boiler-plate */
{
char name[CFS_CURPROC_COMM_MAX];
snprintf(name, sizeof(name) - 1, "ldlm_bl_%02d",
bltd->bltd_num);
cfs_daemonize(name);
}
- cfs_block_allsigs();
atomic_inc(&blp->blp_num_threads);
complete(&blp->blp_comp);
struct lock_wait_data {
struct ldlm_lock *lwd_lock;
- int lwd_generation;
+ __u32 lwd_conn_cnt;
};
int ldlm_expired_completion_wait(void *data)
obd = lock->l_conn_export->exp_obd;
imp = obd->u.cli.cl_import;
- ptlrpc_fail_import(imp, lwd->lwd_generation);
+ ptlrpc_fail_import(imp, lwd->lwd_conn_cnt);
LDLM_ERROR(lock, "lock timed out (enqueued %lus ago), entering "
"recovery for %s@%s", lock->l_enqueued_time.tv_sec,
- imp->imp_target_uuid.uuid,
- imp->imp_connection->c_remote_uuid.uuid);
+ obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid);
RETURN(0);
}
lwd.lwd_lock = lock;
if (unlikely(flags & LDLM_FL_NO_TIMEOUT)) {
- LDLM_DEBUG(lock, "waiting indefinitely because CW lock was"
- " met\n");
+ LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT");
lwi = LWI_INTR(interrupted_completion_wait, &lwd);
} else {
lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout),
if (imp != NULL) {
spin_lock_irqsave(&imp->imp_lock, irqflags);
- lwd.lwd_generation = imp->imp_generation;
+ lwd.lwd_conn_cnt = imp->imp_conn_cnt;
spin_unlock_irqrestore(&imp->imp_lock, irqflags);
}
/* lock enqueued on the server */
cleanup_phase = 1;
+ l_lock(&ns->ns_lock);
lock->l_remote_handle = reply->lock_handle;
*flags = reply->lock_flags;
lock->l_flags |= reply->lock_flags & LDLM_INHERIT_FLAGS;
+ l_unlock(&ns->ns_lock);
CDEBUG(D_INFO, "local: %p, remote cookie: "LPX64", flags: 0x%x\n",
lock, reply->lock_handle.cookie, *flags);
/* we use l_pending_chain here, because it's unused on clients. */
LASSERTF(list_empty(&lock->l_pending_chain),"lock %p next %p prev %p\n",
lock, &lock->l_pending_chain.next,&lock->l_pending_chain.prev);
- list_add(&lock->l_pending_chain, list);
+ /* bug 9573: don't replay locks left after eviction */
+ if (!(lock->l_flags & LDLM_FL_FAILED))
+ list_add(&lock->l_pending_chain, list);
return LDLM_ITER_CONTINUE;
}
ocd->ocd_version = LUSTRE_VERSION_CODE;
/* Disable initial recovery on this import */
- rc = obd_set_info(obd->obd_self_export,
- strlen("initial_recov"), "initial_recov",
- sizeof(allow_recov), &allow_recov);
+ rc = obd_set_info_async(obd->obd_self_export,
+ strlen("initial_recov"), "initial_recov",
+ sizeof(allow_recov), &allow_recov, NULL);
rc = obd_connect(&mdc_conn, obd, &mdc_uuid, ocd);
if (rc) {
{
int err;
char *timeout = NULL;
- char *debug_mask = NULL;
- char *debug_subsys = NULL;
#ifndef INIT_SYSIO
extern void __liblustre_cleanup_(void);
#endif
-#if 0
- libcfs_debug = -1;
- libcfs_subsystem_debug = -1;
-#endif
-
liblustre_init_random();
err = lllib_init();
obd_timeout);
}
- /* debug masks */
- debug_mask = getenv("LIBLUSTRE_DEBUG_MASK");
- if (debug_mask)
- libcfs_debug = (unsigned int) strtol(debug_mask, NULL, 0);
-
- debug_subsys = getenv("LIBLUSTRE_DEBUG_SUBSYS");
- if (debug_subsys)
- libcfs_subsystem_debug =
- (unsigned int) strtol(debug_subsys, NULL, 0);
-
#ifndef INIT_SYSIO
(void)atexit(__liblustre_cleanup_);
#endif
struct intnl_stat *st;
ENTRY;
+ if (it_disposition(it, DISP_OPEN_CREATE))
+ ptlrpc_req_finished(request);
+
rc = mdc_req2lustre_md(request, offset, sbi->ll_osc_exp, &md);
if (rc)
RETURN(rc);
}
if (mask & SETATTR_MTIME) {
iattr.ia_mtime = stbuf->st_mtime;
- iattr.ia_valid |= ATTR_MTIME;
+ iattr.ia_valid |= ATTR_MTIME | ATTR_MTIME_SET;
}
if (mask & SETATTR_ATIME) {
iattr.ia_atime = stbuf->st_atime;
- iattr.ia_valid |= ATTR_ATIME;
+ iattr.ia_valid |= ATTR_ATIME | ATTR_ATIME_SET;
}
if (mask & SETATTR_UID) {
iattr.ia_uid = stbuf->st_uid;
CERROR("MDC %s: not setup or attached\n", mdc);
GOTO(out_free, err = -EINVAL);
}
- obd_set_info(obd->obd_self_export, strlen("async"), "async",
- sizeof(async), &async);
+ obd_set_info_async(obd->obd_self_export, strlen("async"), "async",
+ sizeof(async), &async, NULL);
ocd.ocd_connect_flags = OBD_CONNECT_IBITS|OBD_CONNECT_VERSION;
ocd.ocd_ibits_known = MDS_INODELOCK_FULL;
CERROR("OSC %s: not setup or attached\n", osc);
GOTO(out_mdc, err = -EINVAL);
}
- obd_set_info(obd->obd_self_export, strlen("async"), "async",
- sizeof(async), &async);
+ obd_set_info_async(obd->obd_self_export, strlen("async"), "async",
+ sizeof(async), &async, NULL);
obd->obd_upcall.onu_owner = &sbi->ll_lco;
obd->obd_upcall.onu_upcall = ll_ocd_update;
void *buf_alloc;
int buf_size;
int opt_verbose;
+struct timeval start;
extern char *lustre_path;
buf[80] = 0; \
} \
printf("%s", buf); \
+ gettimeofday(&start, NULL); \
} while (0)
#define LEAVE() \
do { \
- char buf[100]; \
- int len; \
- sprintf(buf, "===== END TEST %s: successfully ", \
- __FUNCTION__); \
- len = strlen(buf); \
+ struct timeval stop; \
+ char buf[100] = { '\0' }; \
+ int len = sizeof(buf) - 1; \
+ long usec; \
+ gettimeofday(&stop, NULL); \
+ usec = (stop.tv_sec - start.tv_sec) * 1000000 + \
+ (stop.tv_usec - start.tv_usec); \
+ len = snprintf(buf, len, \
+ "===== END TEST %s: successfully (%gs)", \
+ __FUNCTION__, (double)usec / 1000000); \
if (len < 79) { \
- memset(buf+len, '=', 100-len); \
+ memset(buf+len, '=', sizeof(buf) - len); \
buf[79] = '\n'; \
buf[80] = 0; \
} \
LEAVE();
}
+#define NEW_TIME 10000
+int t53(char *name)
+{
+ char file[MAX_PATH_LENGTH] = "";
+ struct utimbuf times; /* struct. buffer for utime() */
+ struct stat stat_buf; /* struct buffer to hold file info. */
+ time_t mtime, atime;
+
+ ENTRY("mtime/atime should be updated by utime() call");
+ snprintf(file, MAX_PATH_LENGTH, "%s/test_t53_file", lustre_path);
+
+ t_echo_create(file, "check mtime/atime update by utime() call");
+
+ /* Initialize the modification and access time in the times arg */
+ times.actime = NEW_TIME+10;
+ times.modtime = NEW_TIME;
+
+ /* file modification/access time */
+ utime(file, ×);
+
+ if (stat(file, &stat_buf) < 0) {
+ printf("stat(2) of %s failed, error:%d %s\n",
+ file, errno, strerror(errno));
+ }
+ mtime = stat_buf.st_mtime;
+ atime = stat_buf.st_atime;
+
+ if ((mtime == NEW_TIME) && (atime == NEW_TIME + 10)) {
+ t_unlink(file);
+ LEAVE();
+ }
+
+ printf("mod time %ld, expected %ld\n", mtime, (long)NEW_TIME);
+ printf("acc time %ld, expected %ld\n", atime, (long)NEW_TIME + 10);
+
+ t_unlink(file);
+ return (-1);
+}
+
+int t54(char *name)
+{
+ char file[MAX_PATH_LENGTH] = "";
+ struct flock lock;
+ int fd, err;
+
+ ENTRY("fcntl should return 0 when succeed in getting flock");
+ snprintf(file, MAX_PATH_LENGTH, "%s/test_t54_file", lustre_path);
+
+ t_echo_create(file, "fcntl should return 0 when succeed");
+
+ fd = open(file, O_RDWR);
+ if (fd < 0) {
+ printf("\nerror open file: %s\n", strerror(errno));
+ return(-1);
+ }
+ lock.l_type = F_WRLCK;
+ lock.l_start = 0;
+ lock.l_whence = 0;
+ lock.l_len = 1;
+ if ((err = t_fcntl(fd, F_SETLKW, &lock)) != 0) {
+ fprintf(stderr, "fcntl returned: %d (%s)\n",
+ err, strerror(err));
+ close(fd);
+ t_unlink(file);
+ return (-1);
+ }
+
+ lock.l_type = F_UNLCK;
+ t_fcntl(fd, F_SETLKW, &lock);
+ close(fd);
+ t_unlink(file);
+ LEAVE();
+}
+
extern void __liblustre_setup_(void);
extern void __liblustre_cleanup_(void);
void usage(char *cmd)
{
- printf("\n");
- printf("Usage: \t%s --target mdsnid:/mdsname/profile\n", cmd);
- printf(" \t%s --dumpfile dumpfile\n", cmd);
+ printf("\n"
+ "usage: %s [--only {test}] --target mdsnid:/mdsname/profile\n",
+ cmd);
+ printf(" %s --dumpfile dumpfile\n", cmd);
exit(-1);
}
{ t50, "50" },
{ t50b, "50b" },
{ t51, "51" },
+ { t53, "53" },
+ { t54, "54" },
{ NULL, NULL }
};
run = 0;
len = strlen(test->name);
for (i = 0; i < numonly; i++) {
- if (len < strlen(only[i]))
+ int olen = strlen(only[i]);
+
+ if (len < olen)
continue;
- if (strncmp(only[i], test->name,
- strlen(only[i])) == 0) {
- run = 1;
- break;
+
+ if (strncmp(only[i], test->name, olen) == 0) {
+ switch(test->name[olen]) {
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ case '8': case '9':
+ break;
+ default:
+ run = 1;
+ break;
+ }
}
}
}
MODULES := llite
-llite-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o rw.o lproc_llite.o namei.o special.o symlink.o llite_mmap.o xattr.o
+llite-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o rw.o lproc_llite.o namei.o symlink.o llite_mmap.o xattr.o
ifeq ($(PATCHLEVEL),4)
llite-objs += rw24.o super.o
void ll_unhash_aliases(struct inode *inode)
{
struct list_head *tmp, *head;
- struct ll_sb_info *sbi;
ENTRY;
if (inode == NULL) {
CDEBUG(D_INODE, "marking dentries for ino %lu/%u(%p) invalid\n",
inode->i_ino, inode->i_generation, inode);
- sbi = ll_i2sbi(inode);
head = &inode->i_dentry;
restart:
spin_lock(&dcache_lock);
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
__d_drop(dentry);
hlist_add_head(&dentry->d_hash,
- &sbi->ll_orphan_dentry_list);
+ &ll_i2sbi(inode)->ll_orphan_dentry_list);
#endif
}
unlock_dentry(dentry);
struct lookup_intent *it,
struct dentry *de)
{
- struct ll_sb_info *sbi;
int rc = 0;
ENTRY;
if (it_disposition(it, DISP_LOOKUP_NEG))
RETURN(-ENOENT);
- sbi = ll_i2sbi(de->d_inode);
- rc = ll_prep_inode(sbi->ll_osc_exp, &de->d_inode, request, offset,NULL);
+ rc = ll_prep_inode(ll_i2sbi(de->d_inode)->ll_osc_exp, &de->d_inode,
+ request, offset,NULL);
RETURN(rc);
}
ll_intent_release(it);
GOTO(out, rc = 0);
}
+ if ((it->it_op & IT_OPEN) && de->d_inode &&
+ !S_ISREG(de->d_inode->i_mode) &&
+ !S_ISDIR(de->d_inode->i_mode)) {
+ ll_release_openhandle(de, it);
+ }
rc = 1;
/* unfortunately ll_intent_lock may cause a callback and revoke our
}
-static void ext2_check_page(struct page *page)
+static void ext2_check_page(struct inode *dir, struct page *page)
{
- struct inode *dir = page->mapping->host;
unsigned chunk_size = ext2_chunk_size(dir);
char *kaddr = page_address(page);
// u32 max_inumber = le32_to_cpu(sb->u.ext2_sb.s_es->s_inodes_count);
/* Too bad, we had an error */
Ebadsize:
- CERROR("ext2_check_page"
- "size of directory #%lu is not a multiple of chunk size\n",
- dir->i_ino
- );
+ CERROR("%s: directory %lu/%u size %llu is not a multiple of %u\n",
+ ll_i2mdcexp(dir)->exp_obd->obd_name, dir->i_ino,
+ dir->i_generation, dir->i_size, chunk_size);
goto fail;
Eshort:
error = "rec_len is smaller than minimal";
//Einumber:
// error = "inode out of bounds";
bad_entry:
- CERROR("ext2_check_page: bad entry in directory #%lu: %s - "
+ CERROR("%s: bad entry in directory %lu/%u: %s - "
"offset=%lu+%u, inode=%lu, rec_len=%d, name_len=%d",
- dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT), offs,
- (unsigned long) le32_to_cpu(p->inode),
+ ll_i2mdcexp(dir)->exp_obd->obd_name, dir->i_ino,
+ dir->i_generation, error, (page->index<<PAGE_CACHE_SHIFT), offs,
+ (unsigned long)le32_to_cpu(p->inode),
rec_len, p->name_len);
goto fail;
Eend:
page = read_cache_page(mapping, n,
(filler_t*)mapping->a_ops->readpage, NULL);
- if (!IS_ERR(page)) {
- wait_on_page(page);
- (void)kmap(page);
- if (!PageUptodate(page))
- goto fail;
- if (!PageChecked(page))
- ext2_check_page(page);
- if (PageError(page))
- goto fail;
- }
+ if (IS_ERR(page))
+ GOTO(out_unlock, page);
+
+ wait_on_page(page);
+ (void)kmap(page);
+ if (!PageUptodate(page))
+ goto fail;
+ if (!PageChecked(page))
+ ext2_check_page(dir, page);
+ if (PageError(page))
+ goto fail;
out_unlock:
ldlm_lock_decref(&lockh, LCK_CR);
};
-int ll_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int ll_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
struct inode *inode = filp->f_dentry->d_inode;
loff_t pos = filp->f_pos;
kaddr = page_address(page);
if (need_revalidate) {
+ /* page already checked from ll_get_dir_page() */
offset = ext2_validate_entry(kaddr, offset, chunk_mask);
need_revalidate = 0;
}
done:
filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset;
filp->f_version = inode->i_version;
- update_atime(inode);
+ touch_atime(filp->f_vfsmnt, filp->f_dentry);
+
RETURN(rc);
}
/* XXX: dqb_valid is borrowed as a flag to mark that
* only mds quota is wanted */
if (qctl->qc_dqblk.dqb_valid)
- qctl->obd_uuid =
- sbi->ll_mdc_exp->exp_obd->u.cli.
- cl_import->imp_target_uuid;
+ qctl->obd_uuid = sbi->ll_mdc_exp->exp_obd->
+ u.cli.cl_target_uuid;
break;
case Q_GETINFO:
break;
OBD_SLAB_FREE(fd, ll_file_data_slab, sizeof *fd);
}
-int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
- struct file *file)
+static int ll_close_inode_openhandle(struct inode *inode,
+ struct obd_client_handle *och)
{
- struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
struct ptlrpc_request *req = NULL;
- struct obd_client_handle *och = &fd->fd_mds_och;
- struct obdo obdo;
+ struct obdo *oa;
int rc;
- ENTRY;
- /* clear group lock, if present */
- if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
- struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
- fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
- rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
- &fd->fd_cwlockh);
- }
-
- obdo.o_id = inode->i_ino;
- obdo.o_valid = OBD_MD_FLID;
- obdo_from_inode(&obdo, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE |
- OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
- OBD_MD_FLATIME | OBD_MD_FLMTIME |
- OBD_MD_FLCTIME);
+ oa = obdo_alloc();
+ if (!oa)
+ RETURN(-ENOMEM); // XXX We leak openhandle and request here.
+
+ oa->o_id = inode->i_ino;
+ oa->o_valid = OBD_MD_FLID;
+ obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE |
+ OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
+ OBD_MD_FLATIME | OBD_MD_FLMTIME |
+ OBD_MD_FLCTIME);
if (0 /* ll_is_inode_dirty(inode) */) {
- obdo.o_flags = MDS_BFLAG_UNCOMMITTED_WRITES;
- obdo.o_valid |= OBD_MD_FLFLAGS;
+ oa->o_flags = MDS_BFLAG_UNCOMMITTED_WRITES;
+ oa->o_valid |= OBD_MD_FLFLAGS;
}
- rc = mdc_close(mdc_exp, &obdo, och, &req);
+
+ rc = mdc_close(ll_i2mdcexp(inode), oa, och, &req);
if (rc == EAGAIN) {
/* We are the last writer, so the MDS has instructed us to get
* the file size and any write cookies, then close again. */
CERROR("inode %lu mdc close failed: rc = %d\n",
inode->i_ino, rc);
}
+
+ obdo_free(oa);
+
if (rc == 0) {
- rc = ll_objects_destroy(req, file->f_dentry->d_inode);
+ rc = ll_objects_destroy(req, inode);
if (rc)
CERROR("inode %lu ll_objects destroy: rc = %d\n",
inode->i_ino, rc);
}
mdc_clear_open_replay_data(och);
- ptlrpc_req_finished(req);
+ ptlrpc_req_finished(req); /* This is close request */
+
+ RETURN(rc);
+}
+
+int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
+ struct file *file)
+{
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+ struct obd_client_handle *och = &fd->fd_mds_och;
+ int rc;
+ ENTRY;
+
+ /* clear group lock, if present */
+ if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+ struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
+ fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
+ rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
+ &fd->fd_cwlockh);
+ }
+
+ rc = ll_close_inode_openhandle(inode, och);
och->och_fh.cookie = DEAD_HANDLE_MAGIC;
LUSTRE_FPRIVATE(file) = NULL;
ll_file_data_put(fd);
rc = mdc_enqueue(sbi->ll_mdc_exp, LDLM_IBITS, itp, LCK_PW, &data,
&lockh, lmm, lmmsize, ldlm_completion_ast,
ll_mdc_blocking_ast, NULL, 0);
- if (rc < 0)
+ if (rc < 0) {
CERROR("lock enqueue: err: %d\n", rc);
+ GOTO(out, rc);
+ }
+
+ rc = ll_prep_inode(sbi->ll_osc_exp, &file->f_dentry->d_inode,
+ (struct ptlrpc_request *)itp->d.lustre.it_data, 1,
+ NULL);
+out:
RETURN(rc);
}
-int ll_local_open(struct file *file, struct lookup_intent *it,
- struct ll_file_data *fd)
+static void ll_och_fill(struct ll_inode_info *lli, struct lookup_intent *it,
+ struct obd_client_handle *och)
{
struct ptlrpc_request *req = it->d.lustre.it_data;
- struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
struct mds_body *body;
- ENTRY;
- body = lustre_msg_buf (req->rq_repmsg, 1, sizeof (*body));
- LASSERT (body != NULL); /* reply already checked out */
- LASSERT_REPSWABBED (req, 1); /* and swabbed down */
+ LASSERT(och);
+
+ body = lustre_msg_buf(req->rq_repmsg, 1, sizeof(*body));
+ LASSERT(body != NULL); /* reply already checked out */
+ LASSERT_REPSWABBED(req, 1); /* and swabbed in mdc_enqueue */
+
+ memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
+ och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
+ lli->lli_io_epoch = body->io_epoch;
+
+ mdc_set_open_replay_data(och, it->d.lustre.it_data);
+}
+
+int ll_local_open(struct file *file, struct lookup_intent *it,
+ struct ll_file_data *fd)
+{
+ ENTRY;
LASSERT(!LUSTRE_FPRIVATE(file));
LASSERT(fd != NULL);
- memcpy(&fd->fd_mds_och.och_fh, &body->handle, sizeof(body->handle));
- fd->fd_mds_och.och_magic = OBD_CLIENT_HANDLE_MAGIC;
+ ll_och_fill(ll_i2info(file->f_dentry->d_inode), it, &fd->fd_mds_och);
LUSTRE_FPRIVATE(file) = fd;
ll_readahead_init(file->f_dentry->d_inode, &fd->fd_ras);
- lli->lli_io_epoch = body->io_epoch;
-
- mdc_set_open_replay_data(&fd->fd_mds_och, it->d.lustre.it_data);
-
RETURN(0);
}
RETURN(-ENOMEM);
if (!it || !it->d.lustre.it_disposition) {
+ /* Convert f_flags into access mode. We cannot use file->f_mode,
+ * because everything but O_ACCMODE mask was stripped from
+ * there */
+ if ((oit.it_flags + 1) & O_ACCMODE)
+ oit.it_flags++;
+ if (oit.it_flags & O_TRUNC)
+ oit.it_flags |= FMODE_WRITE;
+
+ if (oit.it_flags & O_CREAT)
+ oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
+
+ /* We do not want O_EXCL here, presumably we opened the file
+ * already? XXX - NFS implications? */
+ oit.it_flags &= ~O_EXCL;
+
it = &oit;
rc = ll_intent_file_open(file, NULL, 0, it);
if (rc) {
lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
l_lock(&lock->l_resource->lr_namespace->ns_lock);
- down(&inode->i_sem);
+ LOCK_INODE_MUTEX(inode);
kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
kms = ldlm_extent_shift_kms(NULL, kms);
if (lsm->lsm_oinfo[stripe].loi_kms != kms)
LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
lsm->lsm_oinfo[stripe].loi_kms, kms);
lsm->lsm_oinfo[stripe].loi_kms = kms;
- up(&inode->i_sem);
+ UNLOCK_INODE_MUTEX(inode);
l_unlock(&lock->l_resource->lr_namespace->ns_lock);
}
/* A glimpse is necessary to determine whether we return a
* short read (B) or some zeroes at the end of the buffer (C) */
ll_inode_size_unlock(inode, 1);
- retval = ll_glimpse_size(inode, 0);
+ retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
if (retval)
goto out;
} else {
if (rc != 0)
RETURN(rc);
- /* this is ok, g_f_w will overwrite this under i_sem if it races
+ /* this is ok, g_f_w will overwrite this under i_mutex if it races
* with a local truncate, it just makes our maxbyte checking easier */
if (file->f_flags & O_APPEND)
*ppos = inode->i_size;
CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
inode->i_ino, count, *ppos);
- /* generic_file_write handles O_APPEND after getting i_sem */
+ /* generic_file_write handles O_APPEND after getting i_mutex */
retval = generic_file_write(file, buf, count, ppos);
out:
RETURN(retval);
}
+/*
+ * Send file content (through pagecache) somewhere with helper
+ */
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
+ read_actor_t actor, void *target)
+{
+ struct inode *inode = in_file->f_dentry->d_inode;
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct lov_stripe_md *lsm = lli->lli_smd;
+ struct ll_lock_tree tree;
+ struct ll_lock_tree_node *node;
+ struct ost_lvb lvb;
+ struct ll_ra_read bead;
+ int rc;
+ ssize_t retval;
+ __u64 kms;
+ ENTRY;
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
+ inode->i_ino, inode->i_generation, inode, count, *ppos);
+
+ /* "If nbyte is 0, read() will return 0 and have no other results."
+ * -- Single Unix Spec */
+ if (count == 0)
+ RETURN(0);
+
+ lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_READ_BYTES,
+ count);
+
+ /* File with no objects, nothing to lock */
+ if (!lsm)
+ RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
+
+ node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
+ tree.lt_fd = LUSTRE_FPRIVATE(in_file);
+ rc = ll_tree_lock(&tree, node, NULL, count,
+ in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
+ if (rc != 0)
+ RETURN(rc);
+
+ ll_inode_size_lock(inode, 1);
+ /*
+ * Consistency guarantees: following possibilities exist for the
+ * relation between region being read and real file size at this
+ * moment:
+ *
+ * (A): the region is completely inside of the file;
+ *
+ * (B-x): x bytes of region are inside of the file, the rest is
+ * outside;
+ *
+ * (C): the region is completely outside of the file.
+ *
+ * This classification is stable under DLM lock acquired by
+ * ll_tree_lock() above, because to change class, other client has to
+ * take DLM lock conflicting with our lock. Also, any updates to
+ * ->i_size by other threads on this client are serialized by
+ * ll_inode_size_lock(). This guarantees that short reads are handled
+ * correctly in the face of concurrent writes and truncates.
+ */
+ inode_init_lvb(inode, &lvb);
+ obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
+ kms = lvb.lvb_size;
+ if (*ppos + count - 1 > kms) {
+ /* A glimpse is necessary to determine whether we return a
+ * short read (B) or some zeroes at the end of the buffer (C) */
+ ll_inode_size_unlock(inode, 1);
+ retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
+ if (retval)
+ goto out;
+ } else {
+ /* region is within kms and, hence, within real file size (A) */
+ inode->i_size = kms;
+ ll_inode_size_unlock(inode, 1);
+ }
+
+ CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
+ inode->i_ino, count, *ppos, inode->i_size);
+
+ /* turn off the kernel's read-ahead */
+ in_file->f_ra.ra_pages = 0;
+
+ bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
+ bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
+ ll_ra_read_in(in_file, &bead);
+ /* BUG: 5972 */
+ file_accessed(in_file);
+ retval = generic_file_sendfile(in_file, ppos, count, actor, target);
+ ll_ra_read_ex(in_file, &bead);
+
+ out:
+ ll_tree_unlock(&tree);
+ RETURN(retval);
+}
+#endif
+
static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
unsigned long arg)
{
if (!f)
GOTO(out, -ENOMEM);
- f->f_dentry = file->f_dentry;
- f->f_vfsmnt = file->f_vfsmnt;
+ f->f_dentry = dget(file->f_dentry);
+ f->f_vfsmnt = mntget(file->f_vfsmnt);
rc = ll_intent_file_open(f, lum, lum_size, &oit);
if (rc)
out:
if (f)
- put_filp(f);
+ fput(f);
ll_file_data_put(fd);
up(&lli->lli_open_sem);
if (req != NULL)
if (f == NULL)
GOTO(out, rc = -ENOMEM);
- f->f_dentry = head_filp->f_dentry;
- f->f_vfsmnt = head_filp->f_vfsmnt;
+ f->f_dentry = dget(head_filp->f_dentry);
+ f->f_vfsmnt = mntget(head_filp->f_vfsmnt);
ll_prepare_mdc_op_data(op_data, head_inode, tail_parent,
tail_dentry->d_name.name,
if (op_data)
OBD_FREE_PTR(op_data);
if (f)
- put_filp(f);
+ fput(f);
ll_file_data_put(fd);
ptlrpc_req_finished(req);
RETURN(rc);
static int ll_file_join(struct inode *head, struct file *filp,
char *filename_tail)
{
- struct inode *tail = NULL, *first, *second;
+ struct inode *tail = NULL, *first = NULL, *second = NULL;
struct dentry *tail_dentry;
struct file *tail_filp, *first_filp, *second_filp;
struct ll_lock_tree first_tree, second_tree;
RETURN(rc);
}
+int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
+{
+ struct inode *inode = dentry->d_inode;
+ struct obd_client_handle *och;
+ int rc;
+ ENTRY;
+
+ LASSERT(inode);
+
+ /* Root ? Do nothing. */
+ if (dentry->d_inode->i_sb->s_root == dentry)
+ RETURN(0);
+
+ /* No open handle to close? Move away */
+ if (!it_disposition(it, DISP_OPEN_OPEN))
+ RETURN(0);
+
+ OBD_ALLOC(och, sizeof(*och));
+ if (!och)
+ GOTO(out, rc = -ENOMEM);
+
+ ll_och_fill(ll_i2info(inode), it, och);
+
+ rc = ll_close_inode_openhandle(inode, och);
+
+ OBD_FREE(och, sizeof(*och));
+ out:
+ /* this one is in place of ll_file_open */
+ ptlrpc_req_finished(it->d.lustre.it_data);
+ RETURN(rc);
+}
+
int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
unsigned long arg)
{
if (get_user(flags, (int *) arg))
RETURN(-EFAULT);
- if (cmd == LL_IOC_SETFLAGS)
+ if (cmd == LL_IOC_SETFLAGS) {
+ if ((flags & LL_FILE_IGNORE_LOCK) &&
+ !(file->f_flags & O_DIRECT)) {
+ CERROR("%s: unable to disable locking on "
+ "non-O_DIRECT file\n", current->comm);
+ RETURN(-EINVAL);
+ }
+
fd->fd_flags |= flags;
- else
+ } else {
fd->fd_flags &= ~flags;
+ }
RETURN(0);
case LL_IOC_LOV_SETSTRIPE:
RETURN(ll_lov_setstripe(inode, file, arg));
}
ll_inode2fid(&fid, inode);
rc = mdc_getattr(sbi->ll_mdc_exp, &fid, valid, ealen, &req);
+ if (rc == -ENOENT) { /* Already unlinked. Just update nlink
+ * and return success */
+ inode->i_nlink = 0;
+ /* This path cannot be hit for regular files unless in
+ * case of obscure races, so no need to validate
+ * size. */
+ if (!S_ISREG(inode->i_mode) &&
+ !S_ISDIR(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode))
+ RETURN(0);
+ }
+
if (rc) {
CERROR("failure %d inode %lu\n", rc, inode->i_ino);
RETURN(-abs(rc));
}
#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-int ll_getattr(struct vfsmount *mnt, struct dentry *de,
- struct lookup_intent *it, struct kstat *stat)
+int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
+ struct lookup_intent *it, struct kstat *stat)
{
struct inode *inode = de->d_inode;
int res = 0;
return 0;
}
+int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
+{
+ struct lookup_intent it = { .it_op = IT_GETATTR };
+
+ return ll_getattr_it(mnt, de, &it, stat);
+}
#endif
static
.mmap = ll_file_mmap,
.llseek = ll_file_seek,
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
- .sendfile = generic_file_sendfile,
+ .sendfile = ll_file_sendfile,
#endif
.fsync = ll_fsync,
/* .lock = ll_file_flock */
.mmap = ll_file_mmap,
.llseek = ll_file_seek,
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
- .sendfile = generic_file_sendfile,
+ .sendfile = ll_file_sendfile,
#endif
.fsync = ll_fsync,
.lock = ll_file_flock
.setattr = ll_setattr,
.truncate = ll_truncate,
#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
- .getattr_it = ll_getattr,
+ .getattr_it = ll_getattr_it,
#else
.revalidate_it = ll_inode_revalidate_it,
#endif
struct ll_close_queue *lcq = arg;
ENTRY;
- /* XXX boiler-plate */
{
- char name[sizeof(current->comm)];
- unsigned long flags;
+ char name[CFS_CURPROC_COMM_MAX];
snprintf(name, sizeof(name) - 1, "ll_close");
cfs_daemonize(name);
- SIGNAL_MASK_LOCK(current, flags);
- sigfillset(¤t->blocked);
- RECALC_SIGPENDING;
- SIGNAL_MASK_UNLOCK(current, flags);
}
-
+
complete(&lcq->lcq_comp);
while (1) {
/* for writepage() only to communicate to fsync */
int lli_async_rc;
- struct file_operations *ll_save_ifop;
- struct file_operations *ll_save_ffop;
- struct file_operations *ll_save_wfop;
- struct file_operations *ll_save_wrfop;
-
struct posix_acl *lli_posix_acl;
struct list_head lli_dead_list;
/* default to about 40meg of readahead on a given system. That much tied
* up in 512k readahead requests serviced at 40ms each is about 1GB/s. */
#define SBI_DEFAULT_READAHEAD_MAX (40UL << (20 - PAGE_CACHE_SHIFT))
+
+/* default to read-ahead full files smaller than 2MB on the second read */
+#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_CACHE_SHIFT))
+
enum ra_stat {
RA_STAT_HIT = 0,
RA_STAT_MISS,
RA_STAT_ZERO_WINDOW,
RA_STAT_EOF,
RA_STAT_MAX_IN_FLIGHT,
+ RA_STAT_WRONG_GRAB_PAGE,
_NR_RA_STAT,
};
struct ll_ra_info {
unsigned long ra_cur_pages;
unsigned long ra_max_pages;
+ unsigned long ra_max_read_ahead_whole_pages;
unsigned long ra_stats[_NR_RA_STAT];
};
* case, it probably doesn't make sense to expand window to
* PTLRPC_MAX_BRW_PAGES on the third access.
*/
- unsigned long ras_consecutive;
+ unsigned long ras_consecutive_pages;
+ /*
+ * number of read requests after the last read-ahead window reset
+ * As window is reset on each seek, this is effectively the number
+ * on consecutive read request and is used to trigger read-ahead.
+ */
+ unsigned long ras_consecutive_requests;
/*
* Parameters of current read-ahead window. Handled by
* ras_update(). On the initial access to the file or after a seek,
*/
unsigned long ras_next_readahead;
/*
+ * Total number of ll_file_read requests issued, reads originating
+ * due to mmap are not counted in this total. This value is used to
+ * trigger full file read-ahead after multiple reads to a small file.
+ */
+ unsigned long ras_requests;
+ /*
+ * Page index with respect to the current request, these value
+ * will not be accurate when dealing with reads issued via mmap.
+ */
+ unsigned long ras_request_index;
+ /*
* list of struct ll_ra_read's one per read(2) call current in
* progress against this file descriptor. Used by read-ahead code,
* protected by ->ras_lock.
int ll_glimpse_size(struct inode *inode, int ast_flags);
int ll_local_open(struct file *file,
struct lookup_intent *it, struct ll_file_data *fd);
+int ll_release_openhandle(struct dentry *, struct lookup_intent *);
int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
struct file *file);
#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-int ll_getattr(struct vfsmount *mnt, struct dentry *de,
+int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
struct lookup_intent *it, struct kstat *stat);
+int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat);
#endif
struct ll_file_data *ll_file_data_get(void);
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize);
/* llite/llite_nfs.c */
+extern struct export_operations lustre_export_operations;
__u32 get_uuid2int(const char *name, int len);
struct dentry *ll_fh_to_dentry(struct super_block *sb, __u32 *data, int len,
int fhtype, int parent);
#define LL_MAX_BLKSIZE (4UL * 1024 * 1024)
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-#define ll_s2sbi(sb) ((struct ll_sb_info *)((sb)->s_fs_info))
#define ll_s2sbi_nocast(sb) ((sb)->s_fs_info)
void __d_rehash(struct dentry * entry, int lock);
static inline __u64 ll_ts2u64(struct timespec *time)
return t;
}
#else /* 2.4 here */
-#define ll_s2sbi(sb) ((struct ll_sb_info *)((sb)->u.generic_sbp))
#define ll_s2sbi_nocast(sb) ((sb)->u.generic_sbp)
static inline __u64 ll_ts2u64(time_t *time)
{
return *time;
}
#endif
+#define ll_s2sbi(sb) ((struct ll_sb_info *)ll_s2sbi_nocast(sb))
/* don't need an addref as the sb_info should be holding one */
static inline struct obd_export *ll_s2obdexp(struct super_block *sb)
sbi->ll_async_page_max = (num_physpages / 4) * 3;
sbi->ll_ra_info.ra_max_pages = min(num_physpages / 8,
SBI_DEFAULT_READAHEAD_MAX);
+ sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
+ SBI_DEFAULT_READAHEAD_WHOLE_MAX;
INIT_LIST_HEAD(&sbi->ll_conn_chain);
INIT_HLIST_HEAD(&sbi->ll_orphan_dentry_list);
}
/* indicate that inodebits locking is supported by this client */
- data->ocd_connect_flags |= OBD_CONNECT_IBITS;
+ data->ocd_connect_flags |= OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH;
data->ocd_ibits_known = MDS_INODELOCK_FULL;
if (sb->s_flags & MS_RDONLY)
* on all clients. */
/* s_dev is also used in lt_compare() to compare two fs, but that is
* only a node-local comparison. */
- sb->s_dev = get_uuid2int(sbi2mdc(sbi)->cl_import->imp_target_uuid.uuid,
- strlen(sbi2mdc(sbi)->cl_import->imp_target_uuid.uuid));
+ sb->s_dev = get_uuid2int(sbi2mdc(sbi)->cl_target_uuid.uuid,
+ strlen(sbi2mdc(sbi)->cl_target_uuid.uuid));
#endif
obd = class_name2obd(osc);
if (!obd) {
CERROR("OSC %s: not setup or attached\n", osc);
- GOTO(out_mdc, err);
+ GOTO(out_mdc, err = -ENODEV);
}
data->ocd_connect_flags =
- OBD_CONNECT_GRANT|OBD_CONNECT_VERSION|OBD_CONNECT_REQPORTAL;
+ OBD_CONNECT_GRANT | OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL;
CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d "
"ocd_grant: %d\n", data->ocd_connect_flags,
sbi->ll_rootino = rootfid.id;
sb->s_op = &lustre_super_operations;
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+ sb->s_export_op = &lustre_export_operations;
+#endif
/* make root inode
* XXX: move this to after cbd setup? */
}
/* Try all connections, but only once. */
- rc = obd_set_info(obd->obd_self_export,
- strlen("init_recov_bk"), "init_recov_bk",
- sizeof(recov_bk), &recov_bk);
+ rc = obd_set_info_async(obd->obd_self_export,
+ strlen("init_recov_bk"), "init_recov_bk",
+ sizeof(recov_bk), &recov_bk, NULL);
if (rc)
GOTO(out_cleanup, rc);
break;
case -EINVAL:
LCONSOLE_ERROR("%s: The configuration '%s' could not be read "
- "from the MDS. Make sure this client and the "
- "MDS are running compatible versions of "
+ "from the MDS '%s'. Make sure this client and "
+ "the MDS are running compatible versions of "
"Lustre.\n",
- obd->obd_name, profile);
+ obd->obd_name, profile, lmd->lmd_mds);
/* fall through */
default:
- CERROR("class_config_parse_llog failed: rc = %d\n", rc);
+ LCONSOLE_ERROR("%s: The configuration '%s' could not be read "
+ "from the MDS '%s'. This may be the result of "
+ "communication errors between the client and "
+ "the MDS, or if the MDS is not running.\n",
+ obd->obd_name, profile, lmd->lmd_mds);
break;
}
LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime),
CURRENT_SECONDS);
-
/* NB: ATTR_SIZE will only be set after this point if the size
* resides on the MDS, ie, this file has no objects. */
if (lsm)
if (rc) {
ptlrpc_req_finished(request);
- if (rc != -EPERM && rc != -EACCES)
+ if (rc == -ENOENT) {
+ inode->i_nlink = 0;
+ /* Unlinked special device node? Or just a race?
+ * Pretend we have done everything. */
+ if (!S_ISREG(inode->i_mode) &&
+ !S_ISDIR(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode))
+ rc = inode_setattr(inode, attr);
+ } else if (rc != -EPERM && rc != -EACCES) {
CERROR("mdc_setattr fails: rc = %d\n", rc);
+ }
RETURN(rc);
}
if (attr->ia_size == 0)
ast_flags = LDLM_AST_DISCARD_DATA;
- up(&inode->i_sem);
+ UNLOCK_INODE_MUTEX(inode);
UP_WRITE_I_ALLOC_SEM(inode);
rc = ll_extent_lock(NULL, inode, lsm, LCK_PW, &policy, &lockh,
ast_flags);
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
DOWN_WRITE_I_ALLOC_SEM(inode);
- down(&inode->i_sem);
+ LOCK_INODE_MUTEX(inode);
#else
- down(&inode->i_sem);
+ LOCK_INODE_MUTEX(inode);
DOWN_WRITE_I_ALLOC_SEM(inode);
#endif
if (rc != 0)
int ll_setattr(struct dentry *de, struct iattr *attr)
{
- LBUG(); /* code is unused, but leave this in case of VFS changes */
- RETURN(-ENOSYS);
+ return ll_setattr_raw(de->d_inode, attr);
}
int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
#else
init_special_inode(inode, inode->i_mode, inode->i_rdev);
#endif
- lli->ll_save_ifop = inode->i_fop;
-
- if (S_ISCHR(inode->i_mode))
- inode->i_fop = &ll_special_chr_inode_fops;
- else if (S_ISBLK(inode->i_mode))
- inode->i_fop = &ll_special_blk_inode_fops;
- else if (S_ISFIFO(inode->i_mode))
- inode->i_fop = &ll_special_fifo_inode_fops;
- else if (S_ISSOCK(inode->i_mode))
- inode->i_fop = &ll_special_sock_inode_fops;
EXIT;
}
}
}
case EXT3_IOC_SETFLAGS: {
struct mdc_op_data op_data;
- struct iattr attr;
+ struct ll_iattr_struct attr;
struct obdo *oa;
struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
memset(&attr, 0x0, sizeof(attr));
attr.ia_attr_flags = flags;
- attr.ia_valid |= ATTR_ATTR_FLAG;
+ ((struct iattr *)&attr)->ia_valid |= ATTR_ATTR_FLAG;
rc = mdc_setattr(sbi->ll_mdc_exp, &op_data,
- &attr, NULL, 0, NULL, 0, &req);
+ (struct iattr *)&attr, NULL, 0, NULL, 0, &req);
if (rc || lsm == NULL) {
ptlrpc_req_finished(req);
obdo_free(oa);
if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
read_only = *flags & MS_RDONLY;
- err = obd_set_info(sbi->ll_mdc_exp, strlen("read-only"),
- "read-only", sizeof(read_only), &read_only);
+ err = obd_set_info_async(sbi->ll_mdc_exp, strlen("read-only"),
+ "read-only", sizeof(read_only),
+ &read_only, NULL);
if (err) {
CERROR("Failed to change the read-only flag during "
"remount: %d\n", err);
struct ll_sb_info *sbi = NULL;
struct obd_device *client_obd = NULL, *lov_obd = NULL;
struct lov_obd *lov = NULL;
- struct obd_import *client_imp = NULL;
struct obd_statfs stat_buf = {0};
char *buf = NULL;
struct obd_ioctl_data *data = NULL;
if (index > 0)
GOTO(out_statfs, rc = -ENODEV);
client_obd = class_exp2obd(sbi->ll_mdc_exp);
- client_imp = class_exp2cliimp(sbi->ll_mdc_exp);
} else if (type == LL_STATFS_LOV) {
lov_obd = class_exp2obd(sbi->ll_osc_exp);
lov = &lov_obd->u.lov;
GOTO(out_statfs, rc = -ENODEV);
client_obd = class_exp2obd(lov->tgts[index].ltd_exp);
- client_imp = class_exp2cliimp(lov->tgts[index].ltd_exp);
if (!lov->tgts[index].active)
GOTO(out_uuid, rc = -ENODATA);
}
- if (!client_obd || !client_imp)
+ if (!client_obd)
GOTO(out_statfs, rc = -EINVAL);
rc = obd_statfs(client_obd, &stat_buf, jiffies - 1);
GOTO(out_statfs, rc = -EFAULT);
out_uuid:
- if (copy_to_user(data->ioc_pbuf2, &client_imp->imp_target_uuid,
+ if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(client_obd),
data->ioc_plen2))
rc = -EFAULT;
if (pgoff >= size) {
lov_stripe_unlock(lsm);
- ll_glimpse_size(inode, 0);
+ ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
} else {
/* XXX change inode size without ll_inode_size_lock() held!
* there is a race condition with truncate path. (see
}
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#ifndef HAVE_FILEMAP_POPULATE
+static int (*filemap_populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
+#endif
static int ll_populate(struct vm_area_struct *area, unsigned long address,
unsigned long len, pgprot_t prot, unsigned long pgoff,
int nonblock)
rc = generic_file_mmap(file, vma);
if (rc == 0) {
+#if !defined(HAVE_FILEMAP_POPULATE) && \
+ (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+ if (!filemap_populate)
+ filemap_populate = vma->vm_ops->populate;
+#endif
vma->vm_ops = &ll_file_vm_ops;
vma->vm_ops->open(vma);
/* update the inode's size and mtime */
{
struct inode *inode;
struct dentry *result;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
struct list_head *lp;
+#endif
if (ino == 0)
return ERR_PTR(-ESTALE);
return ERR_PTR(-ESTALE);
}
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+ result = d_alloc_anon(inode);
+ if (!result) {
+ iput(inode);
+ return ERR_PTR(-ENOMEM);
+ }
+#else
/* now to find a dentry.
* If possible, get a well-connected one
*/
}
result->d_flags |= DCACHE_DISCONNECTED;
+#endif
ll_set_dd(result);
result->d_op = &ll_d_ops;
return result;
*lenp = 3;
return 1;
}
+
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+struct dentry *ll_get_dentry(struct super_block *sb, void *data)
+{
+ __u32 *inump = (__u32*)data;
+ return ll_iget_for_nfs(sb, inump[0], inump[1], S_IFREG);
+}
+
+struct dentry *ll_get_parent(struct dentry *dchild)
+{
+ struct ptlrpc_request *req = NULL;
+ struct inode *dir = dchild->d_inode;
+ struct ll_sb_info *sbi;
+ struct dentry *result = NULL;
+ struct ll_fid fid;
+ struct mds_body *body;
+ char dotdot[] = "..";
+ int rc = 0;
+ ENTRY;
+
+ LASSERT(dir && S_ISDIR(dir->i_mode));
+
+ sbi = ll_s2sbi(dir->i_sb);
+
+ fid.id = (__u64)dir->i_ino;
+ fid.generation = dir->i_generation;
+ fid.f_type = S_IFDIR;
+
+ rc = mdc_getattr_name(sbi->ll_mdc_exp, &fid, dotdot, strlen(dotdot) + 1,
+ 0, 0, &req);
+ if (rc) {
+ CERROR("failure %d inode %lu get parent\n", rc, dir->i_ino);
+ return ERR_PTR(rc);
+ }
+ body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body));
+
+ LASSERT((body->valid & OBD_MD_FLGENER) && (body->valid & OBD_MD_FLID));
+
+ result = ll_iget_for_nfs(dir->i_sb, body->ino, body->generation, S_IFDIR);
+
+ if (IS_ERR(result))
+ rc = PTR_ERR(result);
+
+ ptlrpc_req_finished(req);
+ if (rc)
+ return ERR_PTR(rc);
+ RETURN(result);
+}
+
+struct export_operations lustre_export_operations = {
+ .get_parent = ll_get_parent,
+ .get_dentry = ll_get_dentry,
+};
+#endif
return rc;
if (val < 0 || val > (num_physpages >> (20 - PAGE_CACHE_SHIFT - 1))) {
- CERROR("can't set readahead more than %lu MB\n",
+ CERROR("can't set file readahead more than %lu MB\n",
num_physpages >> (20 - PAGE_CACHE_SHIFT - 1));
return -ERANGE;
}
return count;
}
+static int ll_rd_max_read_ahead_whole_mb(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct super_block *sb = data;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ unsigned val;
+
+ spin_lock(&sbi->ll_lock);
+ val = sbi->ll_ra_info.ra_max_read_ahead_whole_pages >>
+ (20 - PAGE_CACHE_SHIFT);
+ spin_unlock(&sbi->ll_lock);
+
+ return snprintf(page, count, "%u\n", val);
+}
+
+static int ll_wr_max_read_ahead_whole_mb(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ struct super_block *sb = data;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ int val, rc;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ /* Cap this at the current max readahead window size, the readahead
+ * algorithm does this anyway so it's pointless to set it larger. */
+ if (val < 0 ||
+ val > (sbi->ll_ra_info.ra_max_pages >> (20 - PAGE_CACHE_SHIFT))) {
+ CERROR("can't set max_read_ahead_whole_mb more than "
+ "max_read_ahead_mb: %lu\n",
+ sbi->ll_ra_info.ra_max_pages >> (20 - PAGE_CACHE_SHIFT));
+ return -ERANGE;
+ }
+
+ spin_lock(&sbi->ll_lock);
+ sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
+ val << (20 - PAGE_CACHE_SHIFT);
+ spin_unlock(&sbi->ll_lock);
+
+ return count;
+}
+
static int ll_rd_max_cached_mb(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
else
sbi->ll_flags &= ~LL_SBI_CHECKSUM;
- rc = obd_set_info(sbi->ll_osc_exp, strlen("checksum"), "checksum",
- sizeof(val), &val);
+ rc = obd_set_info_async(sbi->ll_osc_exp, strlen("checksum"), "checksum",
+ sizeof(val), &val, NULL);
if (rc)
CWARN("Failed to set OSC checksum flags: %d\n", rc);
//{ "filegroups", lprocfs_rd_filegroups, 0, 0 },
{ "max_read_ahead_mb", ll_rd_max_readahead_mb,
ll_wr_max_readahead_mb, 0 },
+ { "max_read_ahead_whole_mb", ll_rd_max_read_ahead_whole_mb,
+ ll_wr_max_read_ahead_whole_mb, 0 },
{ "max_cached_mb", ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 },
{ "checksum_pages", ll_rd_checksum, ll_wr_checksum, 0 },
{ 0 }
[RA_STAT_ZERO_WINDOW] = "zero size window",
[RA_STAT_EOF] = "read-ahead to EOF",
[RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue",
+ [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page",
};
do_gettimeofday(&now);
__d_rehash(de, 0);
}
+/* 2.6.15 and prior versions have buggy d_instantiate_unique that leaks an inode
+ * if suitable alias is found. But we are not going to fix it by just freeing
+ * such inode, because if some vendor's kernel contains this bugfix already,
+ * we will break everything then. We will use our own reimplementation
+ * instead. */
+#if !defined(HAVE_D_ADD_UNIQUE) || (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16))
/* Search "inode"'s alias list for a dentry that has the same name and parent as
* de. If found, return it. If not found, return de. */
struct dentry *ll_find_alias(struct inode *inode, struct dentry *de)
return de;
}
+#else
+struct dentry *ll_find_alias(struct inode *inode, struct dentry *de)
+{
+ struct dentry *dentry;
+
+ dentry = d_add_unique(de, inode);
+ if (dentry) {
+ lock_dentry(dentry);
+ dentry->d_flags &= ~DCACHE_LUSTRE_INVALID;
+ unlock_dentry(dentry);
+ }
+
+ return dentry?dentry:de;
+}
+#endif
static int lookup_it_finish(struct ptlrpc_request *request, int offset,
struct lookup_intent *it, void *data)
GOTO(out, retval = ERR_PTR(rc));
}
+ if ((it->it_op & IT_OPEN) && dentry->d_inode &&
+ !S_ISREG(dentry->d_inode->i_mode) &&
+ !S_ISDIR(dentry->d_inode->i_mode)) {
+ ll_release_openhandle(dentry, it);
+ }
ll_lookup_finish_locks(it, dentry);
if (dentry == save)
RETURN(0);
}
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-static int ll_create_nd(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
-{
- return ll_create_it(dir, dentry, mode, &nd->intent);
-}
-#endif
-
static void ll_update_times(struct ptlrpc_request *request, int offset,
struct inode *inode)
{
LTIME_S(inode->i_ctime) = body->ctime;
}
-static int ll_mknod_raw(struct nameidata *nd, int mode, dev_t rdev)
+static int ll_mknod_generic(struct inode *dir, struct qstr *name, int mode,
+ unsigned rdev, struct dentry *dchild)
{
struct ptlrpc_request *request = NULL;
- struct inode *dir = nd->dentry->d_inode;
+ struct inode *inode = NULL;
struct ll_sb_info *sbi = ll_i2sbi(dir);
struct mdc_op_data op_data;
int err;
ENTRY;
CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p) mode %o dev %x\n",
- nd->last.len, nd->last.name, dir->i_ino, dir->i_generation, dir,
+ name->len, name->name, dir->i_ino, dir->i_generation, dir,
mode, rdev);
mode &= ~current->fs->umask;
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
- ll_prepare_mdc_op_data(&op_data, dir, NULL, nd->last.name,
- nd->last.len, 0);
+ ll_prepare_mdc_op_data(&op_data, dir, NULL, name->name,
+ name->len, 0);
err = mdc_create(sbi->ll_mdc_exp, &op_data, NULL, 0, mode,
current->fsuid, current->fsgid,
current->cap_effective, rdev, &request);
- if (err == 0)
- ll_update_times(request, 0, dir);
- ptlrpc_req_finished(request);
+ if (err)
+ break;
+ ll_update_times(request, 0, dir);
+
+ if (dchild) {
+ err = ll_prep_inode(sbi->ll_osc_exp, &inode, request, 0,
+ dchild->d_sb);
+ if (err)
+ break;
+
+ d_instantiate(dchild, inode);
+ }
break;
case S_IFDIR:
err = -EPERM;
default:
err = -EINVAL;
}
+ ptlrpc_req_finished(request);
RETURN(err);
}
-static int ll_mknod(struct inode *dir, struct dentry *dchild, int mode,
- ll_dev_t rdev)
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+static int ll_create_nd(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
{
- struct ptlrpc_request *request = NULL;
- struct inode *inode = NULL;
- struct ll_sb_info *sbi = ll_i2sbi(dir);
- struct mdc_op_data op_data;
- int err;
- ENTRY;
-
- CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
- dchild->d_name.len, dchild->d_name.name,
- dir->i_ino, dir->i_generation, dir);
- mode &= ~current->fs->umask;
-
- switch (mode & S_IFMT) {
- case 0:
- case S_IFREG:
- mode |= S_IFREG; /* for mode = 0 case, fallthrough */
- case S_IFCHR:
- case S_IFBLK:
- case S_IFIFO:
- case S_IFSOCK:
- ll_prepare_mdc_op_data(&op_data, dir, NULL, dchild->d_name.name,
- dchild->d_name.len, 0);
- err = mdc_create(sbi->ll_mdc_exp, &op_data, NULL, 0, mode,
- current->fsuid, current->fsgid,
- current->cap_effective, rdev, &request);
- if (err)
- GOTO(out_err, err);
-
- ll_update_times(request, 0, dir);
-
- err = ll_prep_inode(sbi->ll_osc_exp, &inode, request, 0,
- dchild->d_sb);
- if (err)
- GOTO(out_err, err);
- break;
- case S_IFDIR:
- RETURN(-EPERM);
- break;
- default:
- RETURN(-EINVAL);
+ if (!nd || !nd->intent.d.lustre.it_disposition) {
+ /* No saved request? Just mknod the file */
+ return ll_mknod_generic(dir, &dentry->d_name, mode, 0, dentry);
}
- d_instantiate(dchild, inode);
- out_err:
- ptlrpc_req_finished(request);
- RETURN(err);
+ return ll_create_it(dir, dentry, mode, &nd->intent);
}
+#endif
-static int ll_symlink_raw(struct nameidata *nd, const char *tgt)
+static int ll_symlink_generic(struct inode *dir, struct qstr *name,
+ const char *tgt)
{
- struct inode *dir = nd->dentry->d_inode;
struct ptlrpc_request *request = NULL;
struct ll_sb_info *sbi = ll_i2sbi(dir);
struct mdc_op_data op_data;
ENTRY;
CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),target=%s\n",
- nd->last.len, nd->last.name, dir->i_ino, dir->i_generation,
+ name->len, name->name, dir->i_ino, dir->i_generation,
dir, tgt);
- ll_prepare_mdc_op_data(&op_data, dir, NULL, nd->last.name,
- nd->last.len, 0);
+ ll_prepare_mdc_op_data(&op_data, dir, NULL, name->name,
+ name->len, 0);
err = mdc_create(sbi->ll_mdc_exp, &op_data,
tgt, strlen(tgt) + 1, S_IFLNK | S_IRWXUGO,
current->fsuid, current->fsgid, current->cap_effective,
RETURN(err);
}
-static int ll_link_raw(struct nameidata *srcnd, struct nameidata *tgtnd)
+static int ll_link_generic(struct inode *src, struct inode *dir,
+ struct qstr *name)
{
- struct inode *src = srcnd->dentry->d_inode;
- struct inode *dir = tgtnd->dentry->d_inode;
struct ptlrpc_request *request = NULL;
struct mdc_op_data op_data;
int err;
CDEBUG(D_VFSTRACE,
"VFS Op: inode=%lu/%u(%p), dir=%lu/%u(%p), target=%.*s\n",
src->i_ino, src->i_generation, src, dir->i_ino,
- dir->i_generation, dir, tgtnd->last.len, tgtnd->last.name);
+ dir->i_generation, dir, name->len, name->name);
- ll_prepare_mdc_op_data(&op_data, src, dir, tgtnd->last.name,
- tgtnd->last.len, 0);
+ ll_prepare_mdc_op_data(&op_data, src, dir, name->name,
+ name->len, 0);
err = mdc_link(sbi->ll_mdc_exp, &op_data, &request);
if (err == 0)
ll_update_times(request, 0, dir);
RETURN(err);
}
+static int ll_mkdir_generic(struct inode *dir, struct qstr *name, int mode,
+ struct dentry *dchild)
-static int ll_mkdir_raw(struct nameidata *nd, int mode)
{
- struct inode *dir = nd->dentry->d_inode;
struct ptlrpc_request *request = NULL;
struct ll_sb_info *sbi = ll_i2sbi(dir);
struct mdc_op_data op_data;
+ struct inode *inode = NULL;
int err;
ENTRY;
CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
- nd->last.len, nd->last.name, dir->i_ino, dir->i_generation, dir);
+ name->len, name->name, dir->i_ino, dir->i_generation, dir);
mode = (mode & (S_IRWXUGO|S_ISVTX) & ~current->fs->umask) | S_IFDIR;
- ll_prepare_mdc_op_data(&op_data, dir, NULL, nd->last.name,
- nd->last.len, 0);
+ ll_prepare_mdc_op_data(&op_data, dir, NULL, name->name,
+ name->len, 0);
err = mdc_create(sbi->ll_mdc_exp, &op_data, NULL, 0, mode,
current->fsuid, current->fsgid, current->cap_effective,
0, &request);
- if (err == 0)
- ll_update_times(request, 0, dir);
+ if (err)
+ GOTO(out, err);
+ ll_update_times(request, 0, dir);
+ if (dchild) {
+ err = ll_prep_inode(sbi->ll_osc_exp, &inode, request, 0,
+ dchild->d_sb);
+ if (err)
+ GOTO(out, err);
+ d_instantiate(dchild, inode);
+ }
+ EXIT;
+out:
ptlrpc_req_finished(request);
- RETURN(err);
+ return err;
}
-static int ll_rmdir_raw(struct nameidata *nd)
+static int ll_rmdir_generic(struct inode *dir, struct dentry *dparent,
+ struct qstr *name)
{
- struct inode *dir = nd->dentry->d_inode;
struct ptlrpc_request *request = NULL;
struct mdc_op_data op_data;
struct dentry *dentry;
int rc;
ENTRY;
CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
- nd->last.len, nd->last.name, dir->i_ino, dir->i_generation, dir);
+ name->len, name->name, dir->i_ino, dir->i_generation, dir);
/* Check if we have something mounted at the dir we are going to delete
* In such a case there would always be dentry present. */
- dentry = d_lookup(nd->dentry, &nd->last);
- if (dentry) {
- int mounted = d_mountpoint(dentry);
- dput(dentry);
- if (mounted)
- RETURN(-EBUSY);
+ if (dparent) {
+ dentry = d_lookup(dparent, name);
+ if (dentry) {
+ int mounted = d_mountpoint(dentry);
+ dput(dentry);
+ if (mounted)
+ RETURN(-EBUSY);
+ }
}
- ll_prepare_mdc_op_data(&op_data, dir, NULL, nd->last.name,
- nd->last.len, S_IFDIR);
+ ll_prepare_mdc_op_data(&op_data, dir, NULL, name->name,
+ name->len, S_IFDIR);
rc = mdc_unlink(ll_i2sbi(dir)->ll_mdc_exp, &op_data, &request);
if (rc == 0)
ll_update_times(request, 0, dir);
return rc;
}
-static int ll_unlink_raw(struct nameidata *nd)
+static int ll_unlink_generic(struct inode *dir, struct qstr *name)
{
- struct inode *dir = nd->dentry->d_inode;
struct ptlrpc_request *request = NULL;
struct mdc_op_data op_data;
int rc;
ENTRY;
CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
- nd->last.len, nd->last.name, dir->i_ino, dir->i_generation, dir);
+ name->len, name->name, dir->i_ino, dir->i_generation, dir);
- ll_prepare_mdc_op_data(&op_data, dir, NULL, nd->last.name,
- nd->last.len, 0);
+ ll_prepare_mdc_op_data(&op_data, dir, NULL, name->name,
+ name->len, 0);
rc = mdc_unlink(ll_i2sbi(dir)->ll_mdc_exp, &op_data, &request);
if (rc)
GOTO(out, rc);
RETURN(rc);
}
-static int ll_rename_raw(struct nameidata *srcnd, struct nameidata *tgtnd)
+static int ll_rename_generic(struct inode *src, struct qstr *src_name,
+ struct inode *tgt, struct qstr *tgt_name)
{
- struct inode *src = srcnd->dentry->d_inode;
- struct inode *tgt = tgtnd->dentry->d_inode;
struct ptlrpc_request *request = NULL;
struct ll_sb_info *sbi = ll_i2sbi(src);
struct mdc_op_data op_data;
int err;
ENTRY;
CDEBUG(D_VFSTRACE,"VFS Op:oldname=%.*s,src_dir=%lu/%u(%p),newname=%.*s,"
- "tgt_dir=%lu/%u(%p)\n", srcnd->last.len, srcnd->last.name,
- src->i_ino, src->i_generation, src, tgtnd->last.len,
- tgtnd->last.name, tgt->i_ino, tgt->i_generation, tgt);
+ "tgt_dir=%lu/%u(%p)\n", src_name->len, src_name->name,
+ src->i_ino, src->i_generation, src, tgt_name->len,
+ tgt_name->name, tgt->i_ino, tgt->i_generation, tgt);
ll_prepare_mdc_op_data(&op_data, src, tgt, NULL, 0, 0);
err = mdc_rename(sbi->ll_mdc_exp, &op_data,
- srcnd->last.name, srcnd->last.len,
- tgtnd->last.name, tgtnd->last.len, &request);
+ src_name->name, src_name->len,
+ tgt_name->name, tgt_name->len, &request);
if (!err) {
ll_update_times(request, 0, src);
ll_update_times(request, 0, tgt);
RETURN(err);
}
+static int ll_mknod_raw(struct nameidata *nd, int mode, dev_t rdev)
+{
+ return ll_mknod_generic(nd->dentry->d_inode, &nd->last, mode,rdev,NULL);
+}
+static int ll_rename_raw(struct nameidata *srcnd, struct nameidata *tgtnd)
+{
+ return ll_rename_generic(srcnd->dentry->d_inode, &srcnd->last,
+ tgtnd->dentry->d_inode, &tgtnd->last);
+}
+static int ll_link_raw(struct nameidata *srcnd, struct nameidata *tgtnd)
+{
+ return ll_link_generic(srcnd->dentry->d_inode, tgtnd->dentry->d_inode,
+ &tgtnd->last);
+}
+static int ll_symlink_raw(struct nameidata *nd, const char *tgt)
+{
+ return ll_symlink_generic(nd->dentry->d_inode, &nd->last, tgt);
+}
+static int ll_rmdir_raw(struct nameidata *nd)
+{
+ return ll_rmdir_generic(nd->dentry->d_inode, nd->dentry, &nd->last);
+}
+static int ll_mkdir_raw(struct nameidata *nd, int mode)
+{
+ return ll_mkdir_generic(nd->dentry->d_inode, &nd->last, mode, NULL);
+}
+static int ll_unlink_raw(struct nameidata *nd)
+{
+ return ll_unlink_generic(nd->dentry->d_inode, &nd->last);
+}
+
+static int ll_mknod(struct inode *dir, struct dentry *dchild, int mode,
+ ll_dev_t rdev)
+{
+ return ll_mknod_generic(dir, &dchild->d_name, mode,
+ old_encode_dev(rdev), dchild);
+}
+
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+static int ll_unlink(struct inode *dir, struct dentry *dentry)
+{
+ return ll_unlink_generic(dir, &dentry->d_name);
+}
+static int ll_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+ return ll_mkdir_generic(dir, &dentry->d_name, mode, dentry);
+}
+static int ll_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ return ll_rmdir_generic(dir, NULL, &dentry->d_name);
+}
+static int ll_symlink(struct inode *dir, struct dentry *dentry,
+ const char *oldname)
+{
+ return ll_symlink_generic(dir, &dentry->d_name, oldname);
+}
+static int ll_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry)
+{
+ return ll_link_generic(old_dentry->d_inode, dir, &new_dentry->d_name);
+}
+static int ll_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ return ll_rename_generic(old_dir, &old_dentry->d_name, new_dir,
+ &new_dentry->d_name);
+}
+#endif
+
struct inode_operations ll_dir_inode_operations = {
.link_raw = ll_link_raw,
.unlink_raw = ll_unlink_raw,
#else
.lookup = ll_lookup_nd,
.create = ll_create_nd,
- .getattr_it = ll_getattr,
+ .getattr_it = ll_getattr_it,
+ /* Plain (non-raw) methods are provided so NFSD works without patching it. */
+ .unlink = ll_unlink,
+ .mkdir = ll_mkdir,
+ .rmdir = ll_rmdir,
+ .symlink = ll_symlink,
+ .link = ll_link,
+ .rename = ll_rename,
+ .setattr = ll_setattr,
+ .getattr = ll_getattr,
#endif
.permission = ll_inode_permission,
.setxattr = ll_setxattr,
.listxattr = ll_listxattr,
.removexattr = ll_removexattr,
};
+
+struct inode_operations ll_special_inode_operations = {
+ .setattr_raw = ll_setattr_raw,
+ .setattr = ll_setattr,
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+ .getattr_it = ll_getattr_it,
+#else
+ .revalidate_it = ll_inode_revalidate_it,
+#endif
+ .permission = ll_inode_permission,
+ .setxattr = ll_setxattr,
+ .getxattr = ll_getxattr,
+ .listxattr = ll_listxattr,
+ .removexattr = ll_removexattr,
+};
/* this isn't where truncate starts. roughly:
* sys_truncate->ll_setattr_raw->vmtruncate->ll_truncate. setattr_raw grabs
- * DLM lock on [size, EOF], i_sem, ->lli_size_sem, and WRITE_I_ALLOC_SEM to
+ * DLM lock on [size, EOF], i_mutex, ->lli_size_sem, and WRITE_I_ALLOC_SEM to
* avoid races.
*
* must be called under ->lli_size_sem */
struct ll_async_page *llap_cast_private(struct page *page)
{
- struct ll_async_page *llap = (struct ll_async_page *)page->private;
+ struct ll_async_page *llap = (struct ll_async_page *)page_private(page);
LASSERTF(llap == NULL || llap->llap_magic == LLAP_MAGIC,
"page %p private %lu gave magic %d which != %d\n",
- page, page->private, llap->llap_magic, LLAP_MAGIC);
+ page, page_private(page), llap->llap_magic, LLAP_MAGIC);
return llap;
}
struct ll_async_page *llap;
struct obd_export *exp;
struct inode *inode = page->mapping->host;
- struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ll_sb_info *sbi;
int rc;
ENTRY;
+ if (!inode) {
+ static int triggered;
+
+ if (!triggered) {
+ LL_CDEBUG_PAGE(D_ERROR, page, "Bug 10047. Wrong anon "
+ "page received\n");
+ libcfs_debug_dumpstack(NULL);
+ triggered = 1;
+ }
+ RETURN(ERR_PTR(-EINVAL));
+ }
+ sbi = ll_i2sbi(inode);
LASSERT(ll_async_page_slab);
LASSERTF(origin < LLAP__ORIGIN_MAX, "%u\n", origin);
/* sync pages or failed read pages can leave pages in the page
* cache that don't have our data associated with them anymore */
- if (page->private == 0) {
+ if (page_private(page) == 0) {
EXIT;
return;
}
}
#define RAS_CDEBUG(ras) \
- CDEBUG(D_READA, "lrp %lu c %lu ws %lu wl %lu nra %lu\n", \
- ras->ras_last_readpage, ras->ras_consecutive, \
- ras->ras_window_start, ras->ras_window_len, \
- ras->ras_next_readahead);
+ CDEBUG(D_READA, \
+ "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu\n", \
+ ras->ras_last_readpage, ras->ras_consecutive_requests, \
+ ras->ras_consecutive_pages, ras->ras_window_start, \
+ ras->ras_window_len, ras->ras_next_readahead, \
+ ras->ras_requests, ras->ras_request_index);
static int index_in_window(unsigned long index, unsigned long point,
unsigned long before, unsigned long after)
struct ll_readahead_state *ras;
ras = ll_ras_get(f);
- rar->lrr_reader = current;
spin_lock(&ras->ras_lock);
+ ras->ras_requests++;
+ ras->ras_request_index = 0;
+ ras->ras_consecutive_requests++;
+ rar->lrr_reader = current;
+
list_add(&rar->lrr_linkage, &ras->ras_read_beads);
spin_unlock(&ras->ras_lock);
}
spin_lock(&ras->ras_lock);
bead = ll_ra_read_get_locked(ras);
- /* reserve a part of the read-ahead window that we'll be issuing */
+ /* Enlarge the RA window to encompass the full read */
+ if (bead != NULL && ras->ras_window_start + ras->ras_window_len <
+ bead->lrr_start + bead->lrr_count) {
+ ras->ras_window_len = bead->lrr_start + bead->lrr_count -
+ ras->ras_window_start;
+ }
+ /* Reserve a part of the read-ahead window that we'll be issuing */
if (ras->ras_window_len) {
start = ras->ras_next_readahead;
end = ras->ras_window_start + ras->ras_window_len - 1;
}
- if (bead != NULL) {
- pgoff_t read_end;
-
- start = max(start, bead->lrr_start);
- read_end = bead->lrr_start + bead->lrr_count - 1;
- if (ras->ras_consecutive > start - bead->lrr_start + 1)
- /*
- * if current read(2) is a part of larger sequential
- * read, make sure read-ahead is at least to the end
- * of the read region.
- *
- * XXX nikita: This doesn't work when some pages in
- * [lrr_start, start] were cached (and, as a result,
- * weren't counted in ->ras_consecutive).
- */
- end = max(end, read_end);
- else
- /*
- * otherwise, clip read-ahead at the read boundary.
- */
- end = read_end;
- }
if (end != 0) {
+ /* Truncate RA window to end of file */
end = min(end, (unsigned long)((kms - 1) >> PAGE_CACHE_SHIFT));
ras->ras_next_readahead = max(end, end + 1);
RAS_CDEBUG(ras);
continue;
}
+ /* Check if page was truncated or reclaimed */
+ if (page->mapping != mapping) {
+ ll_ra_stats_inc(mapping, RA_STAT_WRONG_GRAB_PAGE);
+ CDEBUG(D_READA, "g_c_p_n returned invalid page\n");
+ goto next_page;
+ }
+
/* we do this first so that we can see the page in the /proc
* accounting */
llap = llap_from_page(page, LLAP_ORIGIN_READAHEAD);
static void ras_reset(struct ll_readahead_state *ras, unsigned long index)
{
ras->ras_last_readpage = index;
- ras->ras_consecutive = 1;
+ ras->ras_consecutive_requests = 0;
+ ras->ras_consecutive_pages = 0;
ras->ras_window_len = 0;
ras_set_start(ras, index);
- ras->ras_next_readahead = ras->ras_window_start;
+ ras->ras_next_readahead = max(ras->ras_window_start, index);
RAS_CDEBUG(ras);
}
{
spin_lock_init(&ras->ras_lock);
ras_reset(ras, 0);
+ ras->ras_requests = 0;
INIT_LIST_HEAD(&ras->ras_read_beads);
}
-static void ras_update(struct ll_sb_info *sbi, struct ll_readahead_state *ras,
- unsigned long index, unsigned hit)
+static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
+ struct ll_readahead_state *ras, unsigned long index,
+ unsigned hit)
{
struct ll_ra_info *ra = &sbi->ll_ra_info;
int zero = 0;
ll_ra_stats_inc_unlocked(ra, RA_STAT_MISS_IN_WINDOW);
}
+ /* On the second access to a file smaller than the tunable
+ * ra_max_read_ahead_whole_pages, read ahead the whole file up to
+ * ra_max_pages. This is a best-effort heuristic that fires at most
+ * once per open file; normal read-ahead behavior resumes for
+ * subsequent IO. The mmap path does not increment ras_requests and
+ * thus can never trigger this behavior. */
+ if (ras->ras_requests == 2 && !ras->ras_request_index) {
+ __u64 kms_pages;
+
+ kms_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+ CDEBUG(D_READA, "kmsp %llu mwp %lu mp %lu\n", kms_pages,
+ ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages);
+
+ if (kms_pages &&
+ kms_pages <= ra->ra_max_read_ahead_whole_pages) {
+ ras->ras_window_start = 0;
+ ras->ras_last_readpage = 0;
+ ras->ras_next_readahead = 0;
+ ras->ras_window_len = min(ra->ra_max_pages,
+ ra->ra_max_read_ahead_whole_pages);
+ GOTO(out_unlock, 0);
+ }
+ }
+
if (zero) {
ras_reset(ras, index);
GOTO(out_unlock, 0);
}
ras->ras_last_readpage = index;
- ras->ras_consecutive++;
+ ras->ras_consecutive_pages++;
ras_set_start(ras, index);
ras->ras_next_readahead = max(ras->ras_window_start,
ras->ras_next_readahead);
- /* wait for a few pages to arrive before issuing readahead to avoid
- * the worst overutilization */
- if (ras->ras_consecutive == 3) {
+ /* Trigger RA in the mmap case where ras_consecutive_requests
+ * is not incremented and thus can't be used to trigger RA */
+ if (!ras->ras_window_len && ras->ras_consecutive_pages == 3) {
ras->ras_window_len = PTLRPC_MAX_BRW_PAGES;
GOTO(out_unlock, 0);
}
- /* we need to increase the window sometimes. we'll arbitrarily
- * do it half-way through the pages in an rpc */
- if ((index & (PTLRPC_MAX_BRW_PAGES - 1)) ==
- (PTLRPC_MAX_BRW_PAGES >> 1)) {
- ras->ras_window_len += PTLRPC_MAX_BRW_PAGES;
- ras->ras_window_len = min(ras->ras_window_len,
+ /* The initial ras_window_len is set to the request size. To avoid
+ * uselessly reading and discarding pages for random IO the window is
+ * only increased once per consecutive request received. */
+ if (ras->ras_consecutive_requests > 1 && !ras->ras_request_index) {
+ ras->ras_window_len = min(ras->ras_window_len +
+ PTLRPC_MAX_BRW_PAGES,
ra->ra_max_pages);
}
EXIT;
out_unlock:
RAS_CDEBUG(ras);
+ ras->ras_request_index++;
spin_unlock(&ras->ras_lock);
spin_unlock(&sbi->ll_lock);
return;
(((loff_t)page->index) << PAGE_SHIFT));
LASSERT(atomic_read(&filp->f_dentry->d_inode->i_count) > 0);
+ if (!ll_i2info(inode)->lli_smd) {
+ /* File with no objects - one big hole */
+ /* Used only because remove_from_page_cache() is not exported;
+ * the page is brought back up to date below. */
+ ll_truncate_complete_page(page);
+ clear_page(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ RETURN(0);
+ }
+
rc = oig_init(&oig);
if (rc < 0)
GOTO(out, rc);
GOTO(out, rc = PTR_ERR(llap));
if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages)
- ras_update(ll_i2sbi(inode), &fd->fd_ras, page->index,
+ ras_update(ll_i2sbi(inode), inode, &fd->fd_ras, page->index,
llap->llap_defer_uptodate);
if (llap->llap_defer_uptodate) {
GOTO(out_oig, rc = 0);
}
- rc = ll_page_matches(page, fd->fd_flags);
- if (rc < 0) {
- LL_CDEBUG_PAGE(D_ERROR, page, "lock match failed: rc %d\n", rc);
- GOTO(out, rc);
- }
+ if (likely((fd->fd_flags & LL_FILE_IGNORE_LOCK) == 0)) {
+ rc = ll_page_matches(page, fd->fd_flags);
+ if (rc < 0) {
+ LL_CDEBUG_PAGE(D_ERROR, page, "lock match failed: rc %d\n", rc);
+ GOTO(out, rc);
+ }
- if (rc == 0) {
- CWARN("ino %lu page %lu (%llu) not covered by "
- "a lock (mmap?). check debug logs.\n",
- inode->i_ino, page->index,
- (long long)page->index << PAGE_CACHE_SHIFT);
+ if (rc == 0) {
+ CWARN("ino %lu page %lu (%llu) not covered by "
+ "a lock (mmap?). check debug logs.\n",
+ inode->i_ino, page->index,
+ (long long)page->index << PAGE_CACHE_SHIFT);
+ }
}
rc = ll_issue_page_read(exp, llap, oig, 0);
return 1;
}
-static int ll_releasepage(struct page *page, int gfp_mask)
+static int ll_releasepage(struct page *page, gfp_t gfp_mask)
{
if (PagePrivate(page))
ll_removepage(page);
+++ /dev/null
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * Special file handling for Lustre.
- *
- * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
- * Author: Wang Di <wangdi@clusterfs.com>
- * Author: Andreas Dilger <adilger@clusterfs.com>
- *
- * This file is part of Lustre, http://www.lustre.org.
- *
- * Lustre is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * Lustre is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Lustre; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#define DEBUG_SUBSYSTEM S_LLITE
-#include <lustre_dlm.h>
-#include <lustre_lite.h>
-#include <linux/pagemap.h>
-#include <linux/file.h>
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-#include <linux/lustre_compat25.h>
-#endif
-#include <asm/poll.h>
-#include "llite_internal.h"
-
-#define INODE_OPS 1
-#define FILE_OPS 2
-
-static struct file_operations **get_save_fops(struct file* filp, int mode)
-{
- struct inode *inode = filp->f_dentry->d_inode;
- struct ll_inode_info *lli = ll_i2info(inode);
-
- if (mode == INODE_OPS) {
- return &(lli->ll_save_ifop);
- } else if (mode == FILE_OPS) {
- if (S_ISFIFO(inode->i_mode)) {
- switch (filp->f_mode) {
- case 1: /*O_RDONLY*/
- return &(lli->ll_save_ffop);
- case 2: /*O_WRONLY*/
- return &(lli->ll_save_wfop);
- case 3: /* O_RDWR */
- return &(lli->ll_save_wrfop);
- default:
- return NULL;
- }
- }
- return &(lli->ll_save_ffop);
- } else {
- CERROR("invalid special file ops %d\n", mode);
- LBUG();
- return NULL;
- }
-}
-
-static void save_fops(struct file *filp, struct inode *inode,
- struct file_operations *sfops)
-{
- if (sfops != filp->f_op) {
- struct file_operations **pfop = get_save_fops(filp, FILE_OPS);
-
- *pfop = filp->f_op;
- if (S_ISCHR(inode->i_mode))
- filp->f_op = &ll_special_chr_file_fops;
- else if (S_ISFIFO(inode->i_mode))
- filp->f_op = &ll_special_fifo_file_fops;
- }
-}
-
-static ssize_t ll_special_file_read(struct file *filp, char *buf,
- size_t count, loff_t *ppos)
-{
- struct file_operations **pfop = get_save_fops(filp, FILE_OPS);
- int rc = -EINVAL;
-
- if (pfop && *pfop && (*pfop)->read)
- rc = (*pfop)->read(filp, buf, count, ppos);
-
- RETURN(rc);
-}
-
-static ssize_t ll_special_file_write(struct file *filp, const char *buf,
- size_t count, loff_t *ppos)
-{
- struct file_operations **pfop = get_save_fops(filp, FILE_OPS);
- int rc = -EINVAL;
-
- if (pfop && *pfop && (*pfop)->write)
- rc = (*pfop)->write(filp, buf, count, ppos);
-
- RETURN(rc);
-}
-
-static int ll_special_file_ioctl(struct inode *inode, struct file *filp,
- unsigned int cmd, unsigned long arg)
-{
- struct file_operations **pfop = get_save_fops(filp, FILE_OPS);
- int rc = -ENOTTY;
-
- if (pfop && *pfop && (*pfop)->ioctl) {
- struct file_operations *sfops = filp->f_op;
-
- rc = (*pfop)->ioctl(inode, filp, cmd, arg);
- save_fops(filp, inode, sfops);
- }
- RETURN(rc);
-}
-
-static loff_t ll_special_file_seek(struct file *filp, loff_t offset, int origin)
-{
- struct file_operations **pfop = get_save_fops(filp, FILE_OPS);
- int rc = 0;
-
- if (pfop && *pfop && (*pfop)->llseek)
- rc = (*pfop)->llseek(filp, offset, origin);
- else
- rc = default_llseek(filp, offset, origin);
-
- RETURN(rc);
-}
-
-
-#define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
-
-static unsigned int ll_special_file_poll(struct file *filp,
- struct poll_table_struct *poll_table)
-{
- struct file_operations **pfop = get_save_fops(filp, FILE_OPS);
- int rc = DEFAULT_POLLMASK;
-
- if (pfop && *pfop && (*pfop)->poll)
- rc = (*pfop)->poll(filp, poll_table);
-
- RETURN(rc);
-}
-
-static int ll_special_file_open(struct inode *inode, struct file *filp)
-{
- struct file_operations **pfop = get_save_fops(filp, FILE_OPS);
- int rc = -EINVAL;
-
- if (pfop && *pfop && (*pfop)->open)
- rc = (*pfop)->open(inode, filp);
-
- RETURN(rc);
-}
-
-static ssize_t ll_special_read(struct file *filp, char *buf, size_t count,
- loff_t *ppos)
-{
- struct file_operations **pfop = get_save_fops(filp, INODE_OPS);
- int rc = -EINVAL;
-
- if (pfop && *pfop && (*pfop)->read)
- rc = (*pfop)->read(filp, buf, count, ppos);
-
- RETURN(rc);
-}
-
-static ssize_t ll_special_write(struct file *filp, const char *buf,
- size_t count, loff_t *ppos)
-{
- struct file_operations **pfop = get_save_fops(filp, INODE_OPS);
- int rc = -EINVAL;
-
- if (pfop && *pfop && (*pfop)->write)
- rc = (*pfop)->write(filp, buf, count, ppos);
-
- RETURN(rc);
-}
-
-static int ll_special_ioctl(struct inode *inode, struct file *filp,
- unsigned int cmd, unsigned long arg)
-{
- struct file_operations **pfop = get_save_fops(filp, INODE_OPS);
- int rc = -ENOTTY;
-
- if (pfop && *pfop && (*pfop)->ioctl) {
- struct file_operations *sfops = filp->f_op;
-
- rc = (*pfop)->ioctl(inode, filp, cmd, arg);
-
- /* sometimes, file_operations will be changed in ioctl */
- save_fops(filp, inode, sfops);
- }
-
- RETURN(rc);
-}
-
-static int ll_special_mmap(struct file * filp, struct vm_area_struct * vma)
-{
- struct file_operations **pfop = get_save_fops(filp, INODE_OPS);
- int rc = -ENODEV;
-
- if (pfop && *pfop && (*pfop)->mmap)
- rc = (*pfop)->mmap(filp, vma);
-
- RETURN(rc);
-}
-
-static loff_t ll_special_seek(struct file *filp, loff_t offset, int origin)
-{
- struct file_operations** pfop = get_save_fops (filp, INODE_OPS);
- int rc = 0;
-
- if (pfop && *pfop && (*pfop)->llseek)
- rc = (*pfop)->llseek(filp, offset, origin);
- else
- rc = default_llseek(filp, offset, origin);
-
- RETURN(rc);
-}
-
-static int ll_special_fsync(struct file *filp, struct dentry *dentry, int data)
-{
- struct file_operations **pfop = get_save_fops(filp, INODE_OPS);
- int rc = -EINVAL;
-
- if (pfop && *pfop && (*pfop)->fsync)
- rc = (*pfop)->fsync(filp, dentry, data);
-
- RETURN(rc);
-}
-
-static int ll_special_file_fasync(int fd, struct file *filp, int on)
-{
- struct file_operations **pfop = get_save_fops(filp, FILE_OPS);
- int rc = -EINVAL;
-
- if (pfop && *pfop && (*pfop)->fasync)
- rc = (*pfop)->fasync(fd, filp, on);
-
- RETURN(rc);
-}
-
-static int ll_special_release_internal(struct inode *inode, struct file *filp,
- int mode)
-{
- struct file_operations **pfop = get_save_fops(filp, mode);
- struct ll_sb_info *sbi = ll_i2sbi(inode);
- int rc = 0, err;
- ENTRY;
-
- if (pfop && *pfop) {
- if ((*pfop)->release)
- rc = (*pfop)->release(inode, filp);
- /* FIXME fops_put */
- }
-
- lprocfs_counter_incr(sbi->ll_stats, LPROC_LL_RELEASE);
-
- err = ll_mdc_close(sbi->ll_mdc_exp, inode, filp);
- if (err && rc == 0)
- rc = err;
-
- RETURN(rc);
-}
-
-static int ll_special_open(struct inode *inode, struct file *filp)
-{
- struct file_operations **pfop = get_save_fops(filp, INODE_OPS);
- struct file_operations *sfops = filp->f_op;
- struct ptlrpc_request *req;
- struct lookup_intent *it;
- struct ll_file_data *fd;
- int rc = -EINVAL, err;
- ENTRY;
-
- fd = ll_file_data_get();
- if (fd == NULL)
- RETURN(-ENOMEM);
-
- if (pfop && *pfop) {
- /* FIXME fops_get */
- if ((*pfop)->open) {
- rc = (*pfop)->open(inode, filp);
-
- /* sometimes file_operations will be changed in open */
- save_fops(filp, inode, sfops);
- }
- }
-
- lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_OPEN);
-
- it = filp->f_it;
-
- err = ll_local_open(filp, it, fd);
- if (rc != 0) {
- CERROR("error opening special file: rc %d\n", rc);
- ll_mdc_close(ll_i2sbi(inode)->ll_mdc_exp, inode, filp);
- } else if (err) {
- if (pfop && *pfop && (*pfop)->release)
- (*pfop)->release(inode, filp);
- /* FIXME fops_put */
- rc = err;
- }
-
- req = it->d.lustre.it_data;
- if (req)
- ptlrpc_req_finished(req);
-
- RETURN(rc);
-}
-
-static int ll_special_release(struct inode *inode, struct file *filp)
-{
- return ll_special_release_internal(inode, filp, INODE_OPS);
-}
-
-static int ll_special_file_release(struct inode *inode, struct file *filp)
-{
- return ll_special_release_internal(inode, filp, FILE_OPS);
-}
-
-struct inode_operations ll_special_inode_operations = {
- .setattr_raw = ll_setattr_raw,
- .setattr = ll_setattr,
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
- .getattr_it = ll_getattr,
-#else
- .revalidate_it = ll_inode_revalidate_it,
-#endif
- .permission = ll_inode_permission,
- .setxattr = ll_setxattr,
- .getxattr = ll_getxattr,
- .listxattr = ll_listxattr,
- .removexattr = ll_removexattr,
-};
-
-struct file_operations ll_special_chr_inode_fops = {
- .owner = THIS_MODULE,
- .open = ll_special_open,
-};
-
-struct file_operations ll_special_blk_inode_fops = {
- .owner = THIS_MODULE,
- .read = ll_special_read,
- .write = ll_special_write,
- .ioctl = ll_special_ioctl,
- .open = ll_special_open,
- .release = ll_special_release,
- .mmap = ll_special_mmap,
- .llseek = ll_special_seek,
- .fsync = ll_special_fsync,
-};
-
-struct file_operations ll_special_fifo_inode_fops = {
- .owner = THIS_MODULE,
- .open = ll_special_open,
-};
-
-struct file_operations ll_special_sock_inode_fops = {
- .owner = THIS_MODULE,
- .open = ll_special_open
-};
-
-struct file_operations ll_special_chr_file_fops = {
- .owner = THIS_MODULE,
- .llseek = ll_special_file_seek,
- .read = ll_special_file_read,
- .write = ll_special_file_write,
- .poll = ll_special_file_poll,
- .ioctl = ll_special_file_ioctl,
- .open = ll_special_file_open,
- .release = ll_special_file_release,
- .fasync = ll_special_file_fasync,
-};
-
-struct file_operations ll_special_fifo_file_fops = {
- .owner = THIS_MODULE,
- .llseek = ll_special_file_seek,
- .read = ll_special_file_read,
- .write = ll_special_file_write,
- .poll = ll_special_file_poll,
- .ioctl = ll_special_file_ioctl,
- .open = ll_special_file_open,
- .release = ll_special_file_release,
-};
-
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
.revalidate_it = ll_inode_revalidate_it,
#else
- .getattr_it = ll_getattr,
+ .getattr_it = ll_getattr_it,
#endif
.permission = ll_inode_permission,
.setxattr = ll_setxattr,
#define XATTR_USER_T (1)
#define XATTR_TRUSTED_T (2)
#define XATTR_SECURITY_T (3)
-#define XATTR_ACL_T (4)
-#define XATTR_OTHER_T (5)
+#define XATTR_ACL_ACCESS_T (4)
+#define XATTR_ACL_DEFAULT_T (5)
+#define XATTR_OTHER_T (6)
static
int get_xattr_type(const char *name)
{
- if (!strcmp(name, XATTR_NAME_ACL_ACCESS) ||
- !strcmp(name, XATTR_NAME_ACL_DEFAULT))
- return XATTR_ACL_T;
+ if (!strcmp(name, XATTR_NAME_ACL_ACCESS))
+ return XATTR_ACL_ACCESS_T;
+
+ if (!strcmp(name, XATTR_NAME_ACL_DEFAULT))
+ return XATTR_ACL_DEFAULT_T;
if (!strncmp(name, XATTR_USER_PREFIX,
sizeof(XATTR_USER_PREFIX) - 1))
static
int xattr_type_filter(struct ll_sb_info *sbi, int xattr_type)
{
- if (xattr_type == XATTR_ACL_T && !(sbi->ll_flags & LL_SBI_ACL))
+ if ((xattr_type == XATTR_ACL_ACCESS_T ||
+ xattr_type == XATTR_ACL_DEFAULT_T) &&
+ !(sbi->ll_flags & LL_SBI_ACL))
return -EOPNOTSUPP;
+
if (xattr_type == XATTR_USER_T && !(sbi->ll_flags & LL_SBI_USER_XATTR))
return -EOPNOTSUPP;
if (xattr_type == XATTR_TRUSTED_T && !capable(CAP_SYS_ADMIN))
if (rc)
RETURN(rc);
+ /* The POSIX ACL is protected by the LOOKUP lock. By the time we get
+ * here, path resolution to the target inode has just completed, so
+ * the cached ACL is very likely up to date.
+ */
+ if (xattr_type == XATTR_ACL_ACCESS_T) {
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct posix_acl *acl;
+
+ spin_lock(&lli->lli_lock);
+ acl = posix_acl_dup(lli->lli_posix_acl);
+ spin_unlock(&lli->lli_lock);
+
+ if (!acl)
+ RETURN(-ENODATA);
+
+ rc = posix_acl_to_xattr(acl, buffer, size);
+ posix_acl_release(acl);
+ RETURN(rc);
+ }
+
do_getxattr:
ll_inode2fid(&fid, inode);
rc = mdc_getxattr(sbi->ll_mdc_exp, &fid, valid, name, NULL, 0,
for (loi = lsm->lsm_oinfo; stripe < lsm->lsm_stripe_count;
stripe++, loi++) {
kms = lov_size_to_stripe(lsm, size, stripe);
- loi->loi_kms = loi->loi_lvb.lvb_size = kms;
CDEBUG(D_INODE,
"stripe %d KMS %sing "LPU64"->"LPU64"\n",
stripe, kms > loi->loi_kms ? "increas":"shrink",
loi->loi_kms, kms);
+ loi->loi_kms = loi->loi_lvb.lvb_size = kms;
}
RETURN(0);
}
watched->obd_name);
RETURN(-EINVAL);
}
- uuid = &watched->u.cli.cl_import->imp_target_uuid;
+ uuid = &watched->u.cli.cl_target_uuid;
if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) {
/* Set OSC as active before notifying the observer, so the
if (rc)
GOTO(out, rc);
- rc = obd_set_info(obd->obd_observer->obd_self_export,
- strlen("next_id"),"next_id", 2, params);
+ rc = obd_set_info_async(obd->obd_observer->obd_self_export,
+ strlen("next_id"),"next_id", 2, params, NULL);
if (rc)
GOTO(out, rc);
RETURN(0);
}
-static int lov_precleanup(struct obd_device *obd, int stage)
+static int lov_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
{
int rc = 0;
ENTRY;
}
break;
}
+ case OBD_CLEANUP_EXPORTS:
+ break;
case OBD_CLEANUP_SELF_EXP:
rc = obd_llog_finish(obd, 0);
if (rc != 0)
CERROR("failed to cleanup llogging subsystems\n");
+ break;
+ case OBD_CLEANUP_OBD:
+ break;
}
RETURN(rc);
}
RETURN(rc);
}
-static int lov_set_info(struct obd_export *exp, obd_count keylen,
- void *key, obd_count vallen, void *val)
+static int lov_set_info_async(struct obd_export *exp, obd_count keylen,
+ void *key, obd_count vallen, void *val,
+ struct ptlrpc_request_set *set)
{
struct obd_device *obddev = class_exp2obd(exp);
struct lov_obd *lov = &obddev->u.lov;
int i, rc = 0, err;
+ int no_set = !set;
ENTRY;
+ if (no_set) {
+ set = ptlrpc_prep_set();
+ if (!set)
+ RETURN(-ENOMEM);
+ }
+
if (KEY_IS("next_id")) {
if (vallen != lov->desc.ld_tgt_count)
RETURN(-EINVAL);
continue;
/* hit all OSCs, even inactive ones */
- err = obd_set_info(lov->tgts[i].ltd_exp, keylen, key,
- vallen, ((obd_id*)val) + i);
+ err = obd_set_info_async(lov->tgts[i].ltd_exp, keylen,
+ key, vallen,
+ ((obd_id*)val) + i, set);
if (!rc)
rc = err;
}
if (!lov->tgts[i].ltd_exp || !lov->tgts[i].active)
continue;
- err = obd_set_info(lov->tgts[i].ltd_exp, keylen, key,
- vallen, val);
+ err = obd_set_info_async(lov->tgts[i].ltd_exp, keylen,
+ key, vallen, val, set);
if (!rc)
rc = err;
}
if (!val && !lov->tgts[i].active)
continue;
- err = obd_set_info(lov->tgts[i].ltd_exp,
- keylen, key, vallen, val);
+ err = obd_set_info_async(lov->tgts[i].ltd_exp,
+ keylen, key, vallen, val, set);
if (!rc)
rc = err;
}
out:
lov_putref(obddev);
+ if (no_set) {
+ err = ptlrpc_set_wait(set);
+ if (!rc)
+ rc = err;
+ ptlrpc_set_destroy(set);
+ }
RETURN(rc);
}
.o_join_lru = lov_join_lru,
.o_iocontrol = lov_iocontrol,
.o_get_info = lov_get_info,
- .o_set_info = lov_set_info,
+ .o_set_info_async = lov_set_info_async,
.o_llog_init = lov_llog_init,
.o_llog_finish = lov_llog_finish,
.o_notify = lov_notify,
{
int rc;
- LASSERT_SEM_LOCKED(&inode->i_sem);
-
- if (EXT3_I(inode)->i_file_acl /* || large inode EA flag */)
- CWARN("setting EA on %lu/%u again... interesting\n",
- inode->i_ino, inode->i_generation);
+ LASSERT(TRYLOCK_INODE_MUTEX(inode) == 0);
lock_24kernel();
rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_TRUSTED,
return rc;
}
-/* Must be called with i_sem held */
+/* Must be called with i_mutex held */
static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size,
const char *name)
{
int rc;
- LASSERT_SEM_LOCKED(&inode->i_sem);
+ LASSERT(TRYLOCK_INODE_MUTEX(inode) == 0);
lock_24kernel();
rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED,
return EXT_CONTINUE;
}
- tgen = EXT_GENERATION(tree);
+ tgen = EXT_GENERATION(EXT_ROOT_HDR(tree));
count = ext3_ext_calc_credits_for_insert(tree, path);
ext3_up_truncate_sem(inode);
}
ext3_down_truncate_sem(inode);
- if (tgen != EXT_GENERATION(tree)) {
+ if (tgen != EXT_GENERATION(EXT_ROOT_HDR(tree))) {
/* the tree has changed. so path can be invalid at moment */
lock_24kernel();
journal_stop(handle);
CDEBUG(leaked ? D_ERROR : D_INFO,
"obd mem max: %d leaked: %d\n", obd_memmax, leaked);
+ EXIT;
return;
}
up(&lck->rpcl_sem);
}
}
-
-/* Quota stuff */
-extern quota_interface_t *quota_interface;
-
static __u32 mds_pack_open_flags(__u32 flags)
{
return
- (flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC |
+ (flags & (FMODE_READ | FMODE_WRITE |
MDS_OPEN_DELAY_CREATE | MDS_OPEN_HAS_EA |
MDS_OPEN_HAS_OBJS | MDS_OPEN_OWNEROVERRIDE)) |
((flags & O_CREAT) ? MDS_OPEN_CREAT : 0) |
((flags & O_SYNC) ? MDS_OPEN_SYNC : 0) |
((flags & O_DIRECTORY) ? MDS_OPEN_DIRECTORY : 0) |
((flags & O_JOIN_FILE) ? MDS_OPEN_JOIN_FILE : 0) |
+#ifdef FMODE_EXEC
+ ((flags & FMODE_EXEC) ? MDS_FMODE_EXEC : 0) |
+#endif
0;
}
rec->sa_atime = LTIME_S(iattr->ia_atime);
rec->sa_mtime = LTIME_S(iattr->ia_mtime);
rec->sa_ctime = LTIME_S(iattr->ia_ctime);
- rec->sa_attr_flags = iattr->ia_attr_flags;
+ rec->sa_attr_flags =
+ ((struct ll_iattr_struct *)iattr)->ia_attr_flags;
+
if ((iattr->ia_valid & ATTR_GID) && in_group_p(iattr->ia_gid))
rec->sa_suppgid = iattr->ia_gid;
else
repsize[repbufcnt++] = obddev->u.cli.cl_max_mds_cookiesize;
} else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) {
obd_valid valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE |
- OBD_MD_FLACL | OBD_MD_FLMODEASIZE;
+ OBD_MD_FLACL | OBD_MD_FLMODEASIZE |
+ OBD_MD_FLDIREA;
size[req_buffers++] = sizeof(struct mds_body);
size[req_buffers++] = data->namelen + 1;
#include <lprocfs_status.h>
#include "mdc_internal.h"
+static quota_interface_t *quota_interface;
+
#define REQUEST_MINOR 244
static int mdc_cleanup(struct obd_device *obd);
EXIT;
*request = req;
out:
+ if (rc != 0 && req && req->rq_commit_cb)
+ req->rq_commit_cb(req);
+
return rc;
}
return rc;
}
-int mdc_set_info(struct obd_export *exp, obd_count keylen,
- void *key, obd_count vallen, void *val)
+int mdc_set_info_async(struct obd_export *exp, obd_count keylen,
+ void *key, obd_count vallen, void *val,
+ struct ptlrpc_request_set *set)
{
struct obd_import *imp = class_exp2cliimp(exp);
int rc = -EINVAL;
RETURN(-ENOMEM);
req->rq_replen = lustre_msg_size(0, NULL);
- rc = ptlrpc_queue_wait(req);
- ptlrpc_req_finished(req);
+ if (set) {
+ rc = 0;
+ ptlrpc_set_add_req(set, req);
+ ptlrpc_check_set(set);
+ } else {
+ rc = ptlrpc_queue_wait(req);
+ ptlrpc_req_finished(req);
+ }
RETURN(rc);
}
RETURN(0);
}
-static int mdc_precleanup(struct obd_device *obd, int stage)
+static int mdc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
{
int rc = 0;
ENTRY;
.o_connect = client_connect_import,
.o_disconnect = client_disconnect_export,
.o_iocontrol = mdc_iocontrol,
- .o_set_info = mdc_set_info,
+ .o_set_info_async = mdc_set_info_async,
.o_get_info = mdc_get_info,
.o_statfs = mdc_statfs,
.o_pin = mdc_pin,
.o_llog_finish = mdc_llog_finish,
};
-static quota_interface_t *quota_interface;
extern quota_interface_t mdc_quota_interface;
int __init mdc_init(void)
#include "mds_internal.h"
+int mds_num_threads;
+CFS_MODULE_PARM(mds_num_threads, "i", int, 0444,
+ "number of MDS service threads to start");
+
static int mds_intent_policy(struct ldlm_namespace *ns,
struct ldlm_lock **lockp, void *req_cookie,
ldlm_mode_t mode, int flags, void *data);
if (inode->i_generation == 0 || inode->i_nlink == 0) {
LCONSOLE_WARN("Found inode with zero generation or link -- this"
- " may indicate disk corruption (inode: %lu, link:"
- " %lu, count: %d)\n", inode->i_ino,
- (unsigned long)inode->i_nlink,
+ " may indicate disk corruption (inode: %lu/%u, "
+ "link %lu, count %d)\n", inode->i_ino,
+ inode->i_generation,(unsigned long)inode->i_nlink,
atomic_read(&inode->i_count));
dput(result);
RETURN(ERR_PTR(-ENOENT));
target_destroy_export(export);
if (obd_uuid_equals(&export->exp_client_uuid, &obd->obd_uuid))
- GOTO(out, 0);
+ RETURN(0);
push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
/* Close any open files (which may also cause orphan unlinking). */
}
spin_unlock(&med->med_open_lock);
pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-out:
mds_client_free(export);
RETURN(rc);
RETURN(0);
}
+/* get the LOV EA from @inode and store it into @md. It can be at most
+ * @size bytes, and @size is updated with the actual EA size.
+ * The EA size is also returned on success, and -ve errno on failure.
+ * If there is no EA then 0 is returned. */
int mds_get_md(struct obd_device *obd, struct inode *inode, void *md,
int *size, int lock)
{
int lmm_size;
if (lock)
- down(&inode->i_sem);
+ LOCK_INODE_MUTEX(inode);
rc = fsfilt_get_md(obd, inode, md, *size, "lov");
if (rc < 0) {
*size = 0;
}
if (lock)
- up(&inode->i_sem);
+ UNLOCK_INODE_MUTEX(inode);
RETURN (rc);
}
-/* Call with lock=1 if you want mds_pack_md to take the i_sem.
- * Call with lock=0 if the caller has already taken the i_sem. */
+/* Call with lock=1 if you want mds_pack_md to take the i_mutex.
+ * Call with lock=0 if the caller has already taken the i_mutex. */
int mds_pack_md(struct obd_device *obd, struct lustre_msg *msg, int offset,
struct mds_body *body, struct inode *inode, int lock)
{
{
struct mds_obd *mds = mds_req2mds(req);
struct mds_body *body;
- int rc, size[2] = {sizeof(*body)}, bufcount = 1;
+ int rc, size[3] = {sizeof(*body)}, bufcount = 1;
ENTRY;
body = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*body));
if ((S_ISREG(inode->i_mode) && (body->valid & OBD_MD_FLEASIZE)) ||
(S_ISDIR(inode->i_mode) && (body->valid & OBD_MD_FLDIREA))) {
- down(&inode->i_sem);
+ LOCK_INODE_MUTEX(inode);
rc = fsfilt_get_md(req->rq_export->exp_obd, inode, NULL, 0,
"lov");
- up(&inode->i_sem);
+ UNLOCK_INODE_MUTEX(inode);
CDEBUG(D_INODE, "got %d bytes MD data for inode %lu\n",
rc, inode->i_ino);
if (rc < 0) {
strncpy(mds->mds_profile, lustre_cfg_string(lcfg, 3),
LUSTRE_CFG_BUFLEN(lcfg, 3));
-
}
ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
obd->obd_name, lustre_cfg_string(lcfg, 1),
label ?: "", label ? "/" : "", str,
obd->obd_recoverable_clients,
- (obd->obd_recoverable_clients == 1)
- ? "client" : "clients",
+ (obd->obd_recoverable_clients == 1) ?
+ "client" : "clients",
(int)(OBD_RECOVERY_TIMEOUT) / 60,
(int)(OBD_RECOVERY_TIMEOUT) % 60,
obd->obd_name);
}
ldlm_timeout = 2;
- ping_evictor_start();
RETURN(0);
/* There better be a lov */
if (!osc)
RETURN(0);
+
+ if (IS_ERR(osc))
+ RETURN(PTR_ERR(osc));
obd_register_observer(osc, NULL);
return (obd_precleanup(osc, OBD_CLEANUP_EARLY));
}
-static int mds_precleanup(struct obd_device *obd, int stage)
+static int mds_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
{
int rc = 0;
ENTRY;
switch (stage) {
+ case OBD_CLEANUP_EARLY:
+ break;
case OBD_CLEANUP_EXPORTS:
target_cleanup_recovery(obd);
mds_lov_early_clean(obd);
llog_cleanup(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT));
llog_cleanup(llog_get_context(obd, LLOG_LOVEA_ORIG_CTXT));
rc = obd_llog_finish(obd, 0);
+ break;
+ case OBD_CLEANUP_OBD:
+ break;
}
RETURN(rc);
}
int must_relock = 0;
ENTRY;
- ping_evictor_stop();
-
if (obd->u.obt.obt_sb == NULL)
RETURN(0);
save_dev = lvfs_sbdev(obd->u.obt.obt_sb);
break;
default:
CERROR("Unhandled intent "LPD64"\n", it->opc);
- LBUG();
+ RETURN(-EFAULT);
}
/* By this point, whatever function we called above must have either
sema_init(&mds->mds_health_sem, 1);
+ if (mds_num_threads < 2)
+ mds_num_threads = MDS_DEF_THREADS;
+ if (mds_num_threads > MDS_MAX_THREADS)
+ mds_num_threads = MDS_MAX_THREADS;
+
mds->mds_service =
ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE,
MDS_MAXREPSIZE, MDS_REQUEST_PORTAL,
MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT,
mds_handle, LUSTRE_MDS_NAME,
- obd->obd_proc_entry, NULL, MDT_NUM_THREADS);
+ obd->obd_proc_entry, NULL, mds_num_threads);
if (!mds->mds_service) {
CERROR("failed to start service\n");
MDS_MAXREPSIZE, MDS_SETATTR_PORTAL,
MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT,
mds_handle, "mds_setattr",
- obd->obd_proc_entry, NULL, MDT_NUM_THREADS);
+ obd->obd_proc_entry, NULL, mds_num_threads);
if (!mds->mds_setattr_service) {
CERROR("failed to start getattr service\n");
GOTO(err_thread, rc = -ENOMEM);
MDS_MAXREPSIZE, MDS_READPAGE_PORTAL,
MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT,
mds_handle, "mds_readpage",
- obd->obd_proc_entry, NULL, MDT_NUM_THREADS);
+ obd->obd_proc_entry, NULL, mds_num_threads);
if (!mds->mds_readpage_service) {
CERROR("failed to start readpage service\n");
GOTO(err_thread2, rc = -ENOMEM);
if (rc)
GOTO(err_thread3, rc);
+ ping_evictor_start();
+
RETURN(0);
err_thread3:
struct mds_obd *mds = &obd->u.mds;
ENTRY;
+ ping_evictor_stop();
+
down(&mds->mds_health_sem);
ptlrpc_unregister_service(mds->mds_readpage_service);
ptlrpc_unregister_service(mds->mds_setattr_service);
struct obd_device *obd = data;
struct mds_obd *mds = &obd->u.mds;
char tmpbuf[sizeof(struct obd_uuid)];
+ struct ptlrpc_request_set *set;
int rc;
sscanf(buffer, "%40s", tmpbuf);
if (strncmp(tmpbuf, "nid:", 4) != 0)
return lprocfs_wr_evict_client(file, buffer, count, data);
- obd_export_evict_by_nid(obd, tmpbuf+4);
+ set = ptlrpc_prep_set();
+ if (!set)
+ return -ENOMEM;
- rc = obd_set_info(mds->mds_osc_exp, strlen("evict_by_nid"),
- "evict_by_nid", strlen(tmpbuf + 4) + 1, tmpbuf + 4);
+ rc = obd_set_info_async(mds->mds_osc_exp, strlen("evict_by_nid"),
+ "evict_by_nid", strlen(tmpbuf + 4) + 1,
+ tmpbuf + 4, set);
if (rc)
CERROR("Failed to evict nid %s from OSTs: rc %d\n", tmpbuf + 4,
rc);
+ ptlrpc_check_set(set);
+
+ obd_export_evict_by_nid(obd, tmpbuf+4);
+ rc = ptlrpc_set_wait(set);
+ if (rc)
+ CERROR("Failed to evict nid %s from OSTs: rc %d\n", tmpbuf + 4,
+ rc);
+ ptlrpc_set_destroy(set);
return count;
}
GOTO(err_msd, rc);
}
if (strcmp(msd->msd_uuid, obd->obd_uuid.uuid) != 0) {
- CERROR("OBD UUID %s does not match last_rcvd UUID %s\n",
- obd->obd_uuid.uuid, msd->msd_uuid);
+ LCONSOLE_ERROR("Trying to start OBD %s using the wrong"
+ " disk %s. Were the /dev/ assignments "
+ "rearranged?\n",
+ obd->obd_uuid.uuid, msd->msd_uuid);
GOTO(err_msd, rc = -EINVAL);
}
mount_count = le64_to_cpu(msd->msd_mount_count);
oa->o_generation = filp->f_dentry->d_inode->i_generation;
namelen = ll_fid2str(fidname, oa->o_id, oa->o_generation);
- down(&parent_inode->i_sem);
+ LOCK_INODE_MUTEX(parent_inode);
new_child = lookup_one_len(fidname, mds->mds_objects_dir, namelen);
if (IS_ERR(new_child)) {
out_dput:
dput(new_child);
out_close:
- up(&parent_inode->i_sem);
+ UNLOCK_INODE_MUTEX(parent_inode);
err = filp_close(filp, 0);
if (err) {
CERROR("closing tmpfile %u: rc %d\n", tmpname, rc);
namelen = ll_fid2str(fidname, oa->o_id, oa->o_generation);
- down(&parent_inode->i_sem);
+ LOCK_INODE_MUTEX(parent_inode);
de = lookup_one_len(fidname, mds->mds_objects_dir, namelen);
if (IS_ERR(de)) {
rc = IS_ERR(de);
out_dput:
if (de != NULL)
l_dput(de);
- up(&parent_inode->i_sem);
+ UNLOCK_INODE_MUTEX(parent_inode);
pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &ucred);
RETURN(rc);
GOTO(cleanup, rc);
}
- down(&head_inode->i_sem);
+ LOCK_INODE_MUTEX(head_inode);
cleanup_phase = 1;
rc = mds_get_md(obd, head_inode, head_lmm, &size, 0);
if (rc < 0)
pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
case 1:
- up(&head_inode->i_sem);
+ UNLOCK_INODE_MUTEX(head_inode);
case 0:
if (tail_lmm != NULL)
OBD_FREE(tail_lmm, lmm_size);
LASSERT(mds->mds_lov_objids != NULL);
- rc = obd_set_info(mds->mds_osc_exp, strlen("next_id"), "next_id",
- mds->mds_lov_desc.ld_tgt_count, mds->mds_lov_objids);
+ rc = obd_set_info_async(mds->mds_osc_exp, strlen("next_id"), "next_id",
+ mds->mds_lov_desc.ld_tgt_count,
+ mds->mds_lov_objids, NULL);
RETURN(rc);
}
OBD_ALLOC(data, sizeof(*data));
if (data == NULL)
RETURN(-ENOMEM);
- data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX;
+ data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX |
+ OBD_CONNECT_REQPORTAL;
data->ocd_version = LUSTRE_VERSION_CODE;
/* NB: lov_connect() needs to fill in .ocd_index for each OST */
rc = obd_connect(&conn, mds->mds_osc_obd, &obd->obd_uuid, data);
rc = llog_ioctl(ctxt, cmd, data);
pop_ctxt(&saved, &ctxt->loc_exp->exp_obd->obd_lvfs_ctxt, NULL);
llog_cat_initialize(obd, mds->mds_lov_desc.ld_tgt_count);
- rc2 = obd_set_info(mds->mds_osc_exp, strlen("mds_conn"),
- "mds_conn", 0, NULL);
+ rc2 = obd_set_info_async(mds->mds_osc_exp, strlen("mds_conn"),
+ "mds_conn", 0, NULL, NULL);
if (!rc)
rc = rc2;
RETURN(rc);
LASSERT(obd != NULL);
- rc = obd_set_info(obd->u.mds.mds_osc_exp, strlen("mds_conn"),
- "mds_conn", 0, uuid);
+ rc = obd_set_info_async(obd->u.mds.mds_osc_exp, strlen("mds_conn"),
+ "mds_conn", 0, uuid, NULL);
if (rc != 0)
GOTO(out, rc);
GOTO(out, rc);
}
- CWARN("MDS %s: %s now active, resetting orphans\n",
- obd->obd_name, uuid ? (char *)uuid->uuid : "All OSC's");
+ LCONSOLE_INFO("MDS %s: %s now active, resetting orphans\n",
+ obd->obd_name, uuid ? (char *)uuid->uuid : "All OSCs");
if (obd->obd_stopping)
GOTO(out, rc = -ENODEV);
int mds_lov_synchronize(void *data)
{
- unsigned long flags;
-
- lock_kernel();
- ptlrpc_daemonize();
-
- SIGNAL_MASK_LOCK(current, flags);
- sigfillset(¤t->blocked);
- RECALC_SIGPENDING;
- SIGNAL_MASK_UNLOCK(current, flags);
- unlock_kernel();
+ ptlrpc_daemonize("mds_lov_sync");
return (__mds_lov_syncronize(data));
}
still disconnected. Taking an obd reference insures that we don't
disconnect the LOV. This of course means a cleanup won't
finish for as long as the sync is blocking. */
- atomic_inc(&obd->obd_refcount);
+ class_incref(obd);
if (nonblock) {
/* Syncronize in the background */
RETURN(-EINVAL);
}
- uuid = &watched->u.cli.cl_import->imp_target_uuid;
+ uuid = &watched->u.cli.cl_target_uuid;
if (obd->obd_recovering) {
/* in the case OBD is in recovery we do not reinit desc and
* easize, as that will be done in mds_lov_connect() after
if (error)
GOTO(cleanup_mfd, error);
body->io_epoch = MDS_FILTERDATA(dentry->d_inode)->io_epoch;
- } else if (flags & FMODE_EXEC) {
+ } else if (flags & MDS_FMODE_EXEC) {
error = mds_deny_write_access(mds, dentry->d_inode);
if (error)
GOTO(cleanup_mfd, error);
return ERR_PTR(error);
}
-/* Must be called with i_sem held */
+/* Must be called with i_mutex held */
static int mds_create_objects(struct ptlrpc_request *req, int offset,
struct mds_update_record *rec,
struct mds_obd *mds, struct obd_device *obd,
res = MAY_READ;
if (flags & (FMODE_WRITE|MDS_OPEN_TRUNC))
res |= MAY_WRITE;
- if (flags & FMODE_EXEC)
+ if (flags & MDS_FMODE_EXEC)
res = MAY_EXEC;
return res;
}
ENTRY;
/* atomically create objects if necessary */
- down(&dchild->d_inode->i_sem);
+ LOCK_INODE_MUTEX(dchild->d_inode);
if (S_ISREG(dchild->d_inode->i_mode) &&
!(body->valid & OBD_MD_FLEASIZE)) {
rc = mds_pack_md(obd, req->rq_repmsg, 2, body,
dchild->d_inode, 0);
if (rc) {
- up(&dchild->d_inode->i_sem);
+ UNLOCK_INODE_MUTEX(dchild->d_inode);
RETURN(rc);
}
}
if (rec != NULL) {
if ((body->valid & OBD_MD_FLEASIZE) &&
(rec->ur_flags & MDS_OPEN_HAS_EA)) {
- up(&dchild->d_inode->i_sem);
+ UNLOCK_INODE_MUTEX(dchild->d_inode);
RETURN(-EEXIST);
}
if (rec->ur_flags & MDS_OPEN_JOIN_FILE) {
- up(&dchild->d_inode->i_sem);
+ UNLOCK_INODE_MUTEX(dchild->d_inode);
rc = mds_join_file(rec, req, dchild, lockh);
if (rc)
RETURN(rc);
- down(&dchild->d_inode->i_sem);
+ LOCK_INODE_MUTEX(dchild->d_inode);
}
if (!(body->valid & OBD_MD_FLEASIZE) &&
!(body->valid & OBD_MD_FLMODEASIZE)) {
dchild, handle, &ids);
if (rc) {
CERROR("mds_create_objects: rc = %d\n", rc);
- up(&dchild->d_inode->i_sem);
+ UNLOCK_INODE_MUTEX(dchild->d_inode);
RETURN(rc);
}
}
body->valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
OBD_MD_FLATIME | OBD_MD_FLMTIME);
}
- up(&dchild->d_inode->i_sem);
+ UNLOCK_INODE_MUTEX(dchild->d_inode);
if (!(rec->ur_flags & MDS_OPEN_JOIN_FILE))
lustre_shrink_reply(req, 2, body->eadatasize, 0);
GOTO(cleanup, rc = -EAGAIN);
}
+ if (!S_ISREG(dchild->d_inode->i_mode) &&
+ !S_ISDIR(dchild->d_inode->i_mode) &&
+ (req->rq_export->exp_connect_flags & OBD_CONNECT_NODEVOH)) {
+ /* If client supports this, do not return open handle for
+ * special device nodes */
+ GOTO(cleanup_no_trans, rc = 0);
+ }
+
/* Step 5: mds_open it */
rc = mds_finish_open(req, dchild, body, rec->ur_flags, &handle, rec,
rep, &parent_lockh);
}
/* Close a "file descriptor" and possibly unlink an orphan from the
- * PENDING directory. Caller must hold child->i_sem, this drops it.
+ * PENDING directory. Caller must hold child->i_mutex, this drops it.
*
* If we are being called from mds_disconnect() because the client has
* disappeared, then req == NULL and we do not update last_rcvd because
if (mfd->mfd_mode & FMODE_WRITE) {
rc = mds_put_write_access(mds, inode, request_body,
last_orphan && unlink_orphan);
- } else if (mfd->mfd_mode & FMODE_EXEC) {
+ } else if (mfd->mfd_mode & MDS_FMODE_EXEC) {
mds_allow_write_access(inode);
}
/* Sadly, there is no easy way to save pending_child from
* mds_reint_unlink() into mfd, so we need to re-lookup,
* but normally it will still be in the dcache. */
- down(&pending_dir->i_sem);
- cleanup_phase = 1; /* up(&pending_dir->i_sem) when finished */
+ LOCK_INODE_MUTEX(pending_dir);
+ cleanup_phase = 1; /* UNLOCK_INODE_MUTEX(pending_dir) when finished */
pending_child = lookup_one_len(fidname, mds->mds_pending_dir,
fidlen);
if (IS_ERR(pending_child))
case 2:
dput(pending_child);
case 1:
- up(&pending_dir->i_sem);
+ UNLOCK_INODE_MUTEX(pending_dir);
}
RETURN(rc);
}
int log_pri = D_HA;
ENTRY;
+ if (IS_ERR(handle)) {
+ LASSERT(rc != 0);
+ RETURN(rc);
+ }
+
/* if the export has already been failed, we have no last_rcvd slot */
if (req->rq_export->exp_failed) {
CWARN("commit transaction for disconnected client %s: rc %d\n",
RETURN(rc);
}
- if (IS_ERR(handle))
- RETURN(rc);
-
if (handle == NULL) {
/* if we're starting our own xaction, use our own inode */
inode = mds->mds_rcvd_filp->f_dentry->d_inode;
if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) &&
rec->ur_eadata != NULL) {
- down(&inode->i_sem);
+ LOCK_INODE_MUTEX(inode);
need_lock = 0;
}
rc = mds_get_md(obd, inode, lmm, &lmm_size, need_lock);
if (rc < 0)
GOTO(cleanup, rc);
+ rc = 0;
handle = fsfilt_start_log(obd, inode, FSFILT_OP_SETATTR, NULL,
le32_to_cpu(lmm->lmm_stripe_count));
rc = fsfilt_setattr(obd, de, handle, &rec->ur_iattr, 0);
/* journal chown/chgrp in llog, just like unlink */
if (rc == 0 && lmm_size){
- cookie_size = mds_get_cookie_size(obd, lmm);
+ cookie_size = mds_get_cookie_size(obd, lmm);
OBD_ALLOC(logcookies, cookie_size);
if (logcookies == NULL)
GOTO(cleanup, rc = -ENOMEM);
case 1:
if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) &&
rec->ur_eadata != NULL)
- up(&inode->i_sem);
+ UNLOCK_INODE_MUTEX(inode);
l_dput(de);
if (locked) {
if (rc) {
int rdev = rec->ur_rdev;
handle = fsfilt_start(obd, dir, FSFILT_OP_MKNOD, NULL);
if (IS_ERR(handle))
- GOTO(cleanup, (handle = NULL, rc = PTR_ERR(handle)));
+ GOTO(cleanup, rc = PTR_ERR(handle));
rc = vfs_mknod(dir, dchild, rec->ur_mode, rdev);
EXIT;
break;
int lmm_size = sizeof(lmm);
rc = mds_get_md(obd, dir, &lmm, &lmm_size, 1);
if (rc > 0) {
- down(&inode->i_sem);
+ LOCK_INODE_MUTEX(inode);
rc = fsfilt_set_md(obd, inode, handle,
&lmm, lmm_size, "lov");
- up(&inode->i_sem);
+ UNLOCK_INODE_MUTEX(inode);
}
if (rc)
CERROR("error on copy stripe info: rc = %d\n",
RETURN(0);
}
+static inline int res_eq(struct ldlm_res_id *res1, struct ldlm_res_id *res2)
+{
+ return !memcmp(res1, res2, sizeof(*res1));
+}
+
+static inline void
+try_to_aggregate_locks(struct ldlm_res_id *res1, ldlm_policy_data_t *p1,
+ struct ldlm_res_id *res2, ldlm_policy_data_t *p2)
+{
+ if (!res_eq(res1, res2))
+ return;
+ /* XXX: any additional inodebits (to current LOOKUP and UPDATE)
+ * should be taken with great care here */
+ p1->l_inodebits.bits |= p2->l_inodebits.bits;
+}
+
int enqueue_4ordered_locks(struct obd_device *obd,struct ldlm_res_id *p1_res_id,
struct lustre_handle *p1_lockh, int p1_lock_mode,
ldlm_policy_data_t *p1_policy,
flags = 0;
if (res_id[i]->name[0] == 0)
break;
- if (i != 0 &&
- memcmp(res_id[i], res_id[i-1], sizeof(*res_id[i])) == 0 &&
- (policies[i]->l_inodebits.bits &
- policies[i-1]->l_inodebits.bits)) {
+ if (i && res_eq(res_id[i], res_id[i-1])) {
memcpy(dlm_handles[i], dlm_handles[i-1],
sizeof(*(dlm_handles[i])));
ldlm_lock_addref(dlm_handles[i], lock_modes[i]);
} else {
+ /* we need to enqueue locks with different inodebits
+ * at once, because otherwise concurrent thread can
+ * hit the window between these two locks and we'll
+ * get to deadlock. see bug 10360. note also, that it
+ * is impossible to have >2 equal res. */
+ if (i < 3)
+ try_to_aggregate_locks(res_id[i], policies[i],
+ res_id[i+1], policies[i+1]);
rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace,
*res_id[i], LDLM_IBITS,
policies[i],
child_res_id->name[0] = dchild->d_inode->i_ino;
child_res_id->name[1] = dchild->d_inode->i_generation;
- if (res_gt(parent_res_id, child_res_id, NULL, NULL) ||
- res_gt(maxres, child_res_id, NULL, NULL)) {
+ /* Make sure that we don't try to re-enqueue a lock on the
+ * same resource if it happens that the source is renamed to
+ * the target by another thread (bug 9974, thanks racer :-) */
+ if (!res_gt(child_res_id, parent_res_id, NULL, NULL) ||
+ !res_gt(child_res_id, maxres, NULL, NULL)) {
CDEBUG(D_DLMTRACE, "relock "LPU64"<("LPU64"|"LPU64")\n",
child_res_id->name[0], parent_res_id->name[0],
maxres->name[0]);
if (rc > 0)
goto retry_locks;
if (rc < 0) {
- cleanup_phase = 3;
+ cleanup_phase = 2;
GOTO(cleanup, rc);
}
* part thereof, because we don't have the inode to check for link
* count/open status until after it is locked.
*
- * For lock ordering, caller must get child->i_sem first, then pending->i_sem
- * before starting journal transaction.
+ * For lock ordering, caller must get child->i_mutex first, then
+ * pending->i_mutex before starting journal transaction.
*
* returns 1 on success
* returns 0 if we lost a race and didn't make a new link
LASSERT(inode != NULL);
LASSERT(!mds_inode_is_orphan(inode));
#ifndef HAVE_I_ALLOC_SEM
- LASSERT(down_trylock(&inode->i_sem) != 0);
+ LASSERT(TRYLOCK_INODE_MUTEX(inode) == 0);
#endif
- LASSERT(down_trylock(&pending_dir->i_sem) != 0);
+ LASSERT(TRYLOCK_INODE_MUTEX(pending_dir) == 0);
fidlen = ll_fid2str(fidname, inode->i_ino, inode->i_generation);
child_inode->i_nlink == 1) {
if (mds_orphan_open_count(child_inode) > 0) {
/* need to lock pending_dir before transaction */
- down(&mds->mds_pending_dir->d_inode->i_sem);
- cleanup_phase = 5; /* up(&pending_dir->i_sem) */
+ LOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode);
+ cleanup_phase = 5; /* UNLOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode); */
} else if (S_ISREG(child_inode->i_mode)) {
mds_pack_inode2fid(&body->fid1, child_inode);
mds_pack_inode2body(body, child_inode);
rc = mds_finish_transno(mds, dparent ? dparent->d_inode : NULL,
handle, req, rc, 0);
if (!rc)
- (void)obd_set_info(mds->mds_osc_exp, strlen("unlinked"),
- "unlinked", 0, NULL);
+ (void)obd_set_info_async(mds->mds_osc_exp, strlen("unlinked"),
+ "unlinked", 0, NULL, NULL);
switch(cleanup_phase) {
case 5: /* pending_dir semaphore */
- up(&mds->mds_pending_dir->d_inode->i_sem);
+ UNLOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode);
case 4: /* child inode semaphore */
MDS_UP_READ_ORPHAN_SEM(child_inode);
case 3: /* child ino-reuse lock */
GOTO(cleanup, rc = -EROFS);
handle = fsfilt_start(obd, de_tgt_dir->d_inode, FSFILT_OP_LINK, NULL);
- if (IS_ERR(handle)) {
- rc = PTR_ERR(handle);
- GOTO(cleanup, rc);
- }
+ if (IS_ERR(handle))
+ GOTO(cleanup, rc = PTR_ERR(handle));
rc = vfs_link(de_src, de_tgt_dir->d_inode, dchild);
if (rc && rc != -EPERM && rc != -EACCES)
new_inode->i_nlink == 1) {
if (mds_orphan_open_count(new_inode) > 0) {
/* need to lock pending_dir before transaction */
- down(&mds->mds_pending_dir->d_inode->i_sem);
- cleanup_phase = 4; /* up(&pending_dir->i_sem) */
+ LOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode);
+ cleanup_phase = 4; /* UNLOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode); */
} else if (S_ISREG(new_inode->i_mode)) {
mds_pack_inode2fid(&body->fid1, new_inode);
mds_pack_inode2body(body, new_inode);
switch (cleanup_phase) {
case 4:
- up(&mds->mds_pending_dir->d_inode->i_sem);
+ UNLOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode);
case 3:
MDS_UP_READ_ORPHAN_SEM(new_inode);
case 2:
((namlen == 2) && !strcmp(d_name, "..")) || inum == 0)
continue;
- down(&pending_dir->i_sem);
+ LOCK_INODE_MUTEX(pending_dir);
dchild = lookup_one_len(d_name, mds->mds_pending_dir, namlen);
if (IS_ERR(dchild)) {
- up(&pending_dir->i_sem);
+ UNLOCK_INODE_MUTEX(pending_dir);
GOTO(err_out, rc = PTR_ERR(dchild));
}
if (!dchild->d_inode) {
}
next:
l_dput(dchild);
- up(&pending_dir->i_sem);
+ UNLOCK_INODE_MUTEX(pending_dir);
}
rc = 0;
err_out:
return rc;
}
+/*
+ * always return 0, and set req->rq_status as the error number in case
+ * of failures.
+ */
static
int mds_setxattr_internal(struct ptlrpc_request *req, struct mds_body *body)
{
lockpart = MDS_INODELOCK_UPDATE;
- de = mds_fid2locked_dentry(obd, &body->fid1, NULL, LCK_EX,
- &lockh, NULL, 0, lockpart);
- if (IS_ERR(de))
- GOTO(out, rc = PTR_ERR(de));
-
- inode = de->d_inode;
- LASSERT(inode);
-
- OBD_FAIL_WRITE(OBD_FAIL_MDS_SETXATTR_WRITE, inode->i_sb);
-
+ /* various sanity checks for the xattr name */
xattr_name = lustre_msg_string(req->rq_reqmsg, 1, 0);
if (!xattr_name) {
CERROR("can't extract xattr name\n");
- GOTO(out_dput, rc = -EPROTO);
+ GOTO(out, rc = -EPROTO);
}
DEBUG_REQ(D_INODE, req, "%sxattr %s\n",
if (strncmp(xattr_name, "trusted.", 8) == 0) {
if (strcmp(xattr_name + 8, XATTR_LUSTRE_MDS_LOV_EA) == 0)
- GOTO(out_dput, rc = -EACCES);
+ GOTO(out, rc = -EACCES);
}
if (!(req->rq_export->exp_connect_flags & OBD_CONNECT_XATTR) &&
(strncmp(xattr_name, "user.", 5) == 0)) {
- GOTO(out_dput, rc = -EOPNOTSUPP);
+ GOTO(out, rc = -EOPNOTSUPP);
}
+ if (!strcmp(xattr_name, XATTR_NAME_ACL_ACCESS))
+ lockpart |= MDS_INODELOCK_LOOKUP;
+
+ de = mds_fid2locked_dentry(obd, &body->fid1, NULL, LCK_EX,
+ &lockh, NULL, 0, lockpart);
+ if (IS_ERR(de))
+ GOTO(out, rc = PTR_ERR(de));
+
+ inode = de->d_inode;
+ LASSERT(inode);
+
+ OBD_FAIL_WRITE(OBD_FAIL_MDS_SETXATTR_WRITE, inode->i_sb);
+
/* filter_op simply use setattr one */
handle = fsfilt_start(obd, inode, FSFILT_OP_SETATTR, NULL);
if (IS_ERR(handle))
xattr = lustre_msg_buf(req->rq_reqmsg, 2,
xattrlen);
- down(&inode->i_sem);
+ LOCK_INODE_MUTEX(inode);
lock_24kernel();
rc = inode->i_op->setxattr(de, xattr_name, xattr,
xattrlen, body->flags);
unlock_24kernel();
- up(&inode->i_sem);
+ UNLOCK_INODE_MUTEX(inode);
}
} else if (body->valid & OBD_MD_FLXATTRRM) {
if (inode->i_op && inode->i_op->removexattr) {
- down(&inode->i_sem);
+ LOCK_INODE_MUTEX(inode);
lock_24kernel();
rc = inode->i_op->removexattr(de, xattr_name);
unlock_24kernel();
- up(&inode->i_sem);
+ UNLOCK_INODE_MUTEX(inode);
}
} else {
CERROR("valid bits: "LPX64"\n", body->valid);
unsigned int ldlm_timeout = 20; /* seconds */
unsigned int obd_health_check_timeout = 120; /* seconds */
char obd_lustre_upcall[128] = "DEFAULT"; /* or NONE or /full/path/to/upcall */
-unsigned int obd_sync_filter; /* = 0, don't sync by default */
cfs_waitq_t obd_race_waitq;
EXPORT_SYMBOL(ldlm_timeout);
EXPORT_SYMBOL(obd_health_check_timeout);
EXPORT_SYMBOL(obd_lustre_upcall);
-EXPORT_SYMBOL(obd_sync_filter);
EXPORT_SYMBOL(ptlrpc_put_connection_superhack);
-EXPORT_SYMBOL(ptlrpc_abort_inflight_superhack);
EXPORT_SYMBOL(proc_lustre_root);
EXPORT_SYMBOL(class_handle2object);
/* config.c */
+EXPORT_SYMBOL(class_incref);
EXPORT_SYMBOL(class_decref);
EXPORT_SYMBOL(class_get_profile);
EXPORT_SYMBOL(class_del_profile);
/* liblustre doesn't call cleanup_obdclass, apparently. we carry on in this
* ifdef to the end of the file to cover module and versioning goo.*/
#ifdef __KERNEL__
-
static void cleanup_obdclass(void)
{
int i;
- int leaked;
ENTRY;
cfs_psdev_deregister(&obd_psdev);
class_handle_cleanup();
class_exit_uuidlist();
-
- leaked = atomic_read(&obd_memory);
- CDEBUG(leaked ? D_ERROR : D_INFO,
- "obd mem max: %d leaked: %d\n", obd_memmax, leaked);
-
EXIT;
}
SYSCTL_INT(_lustre, OID_AUTO, memused,
CTLTYPE_INT | CTLFLAG_RW, (int *)&obd_memory.counter,
0, "lustre_memory_used");
-SYSCTL_INT(_lustre, OID_AUTO, filter_sync_on_commit,
- CTLTYPE_INT | CTLFLAG_RW, &obd_sync_filter,
- 0, "filter_sync_on_commit");
SYSCTL_INT(_lustre, OID_AUTO, ldlm_timeout,
CTLTYPE_INT | CTLFLAG_RW, &ldlm_timeout,
0, "ldlm_timeout");
cfs_mem_cache_t *import_cachep = NULL;
int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
-void (*ptlrpc_abort_inflight_superhack)(struct obd_import *imp);
/*
* support functions: we could use inter-module communication, but this
obd->obd_minor = i;
obd->obd_type = type;
obd->obd_name = name;
- CDEBUG(D_IOCTL, "Adding new device %s\n",
- obd->obd_name);
+ CDEBUG(D_IOCTL, "Adding new device %s (%p)\n",
+ obd->obd_name, obd);
result = obd;
}
}
continue;
if ((strncmp(obd->obd_type->typ_name, typ_name,
strlen(typ_name)) == 0)) {
- struct client_obd *cli = &obd->u.cli;
- struct obd_import *imp = cli->cl_import;
- if (obd_uuid_equals(tgt_uuid, &imp->imp_target_uuid) &&
+ if (obd_uuid_equals(tgt_uuid,
+ &obd->u.cli.cl_target_uuid) &&
((grp_uuid)? obd_uuid_equals(grp_uuid,
&obd->obd_uuid) : 1)) {
spin_unlock(&obd_dev_lock);
}
}
LASSERT(!obd->obd_stopping); /* shouldn't happen, but might race */
- atomic_inc(&obd->obd_refcount);
+ class_incref(obd);
list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports);
list_add_tail(&export->exp_obd_chain_timed,
&export->exp_obd->obd_exports_timed);
}
LASSERT(list_empty(&import->imp_handle.h_link));
+ class_decref(import->imp_obd);
OBD_FREE(import, sizeof(*import));
EXIT;
}
EXPORT_SYMBOL(class_import_put);
-struct obd_import *class_new_import(void)
+struct obd_import *class_new_import(struct obd_device *obd)
{
struct obd_import *imp;
CFS_INIT_LIST_HEAD(&imp->imp_sending_list);
CFS_INIT_LIST_HEAD(&imp->imp_delayed_list);
spin_lock_init(&imp->imp_lock);
- imp->imp_conn_cnt = 0;
- imp->imp_max_transno = 0;
- imp->imp_peer_committed_transno = 0;
imp->imp_state = LUSTRE_IMP_NEW;
+ imp->imp_obd = class_incref(obd);
cfs_waitq_init(&imp->imp_recovery_waitq);
atomic_set(&imp->imp_refcount, 2);
class_handle_unhash(&import->imp_handle);
- /* Abort any inflight DLM requests and NULL out their (about to be
- * freed) import. */
- /* Invalidate all requests on import, would be better to call
- ptlrpc_set_import_active(imp, 0); */
import->imp_generation++;
- ptlrpc_abort_inflight_superhack(import);
-
class_import_put(import);
}
EXPORT_SYMBOL(class_destroy_import);
/* It's possible that an export may disconnect itself, but
* nothing else will be added to this list. */
- while(!list_empty(list)) {
+ while (!list_empty(list)) {
exp = list_entry(list->next, struct obd_export, exp_obd_chain);
class_export_get(exp);
exp->exp_flags = flags;
}
EXPORT_SYMBOL(obd_export_nid2str);
-/* Ping evictor thread */
-#ifdef __KERNEL__
-#define PET_READY 1
-#define PET_TERMINATE 2
-
-static int pet_refcount = 0;
-static int pet_state;
-static cfs_waitq_t pet_waitq;
-static struct obd_export *pet_exp = NULL;
-static spinlock_t pet_lock;
-
-static int ping_evictor_wake(struct obd_export *exp)
-{
- spin_lock(&pet_lock);
- if (pet_exp) {
- /* eventually the new obd will call here again. */
- spin_unlock(&pet_lock);
- return 1;
- }
-
- /* We have to make sure the obd isn't destroyed between now and when
- * the ping evictor runs. We'll take a reference here, and drop it
- * when we finish in the evictor. We don't really care about this
- * export in particular; we just need one to keep the obd alive. */
- pet_exp = class_export_get(exp);
- spin_unlock(&pet_lock);
-
- cfs_waitq_signal(&pet_waitq);
- return 0;
-}
-
-static int ping_evictor_main(void *arg)
-{
- struct obd_device *obd;
- struct obd_export *exp;
- struct l_wait_info lwi = { 0 };
- time_t expire_time;
- ENTRY;
-
- lock_kernel();
-
- /* ptlrpc_daemonize() */
- exit_mm(current);
- lustre_daemonize_helper();
- set_fs_pwd(current->fs, init_task.fs->pwdmnt, init_task.fs->pwd);
- exit_files(current);
- reparent_to_init();
- THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX-1, "ping_evictor");
-
- cfs_block_allsigs();
- unlock_kernel();
-
- CDEBUG(D_HA, "Starting Ping Evictor\n");
- pet_exp = NULL;
- pet_state = PET_READY;
- while (1) {
- l_wait_event(pet_waitq, pet_exp ||
- (pet_state == PET_TERMINATE), &lwi);
- if (pet_state == PET_TERMINATE)
- break;
-
- /* we only get here if pet_exp != NULL, and the end of this
- * loop is the only place which sets it NULL again, so lock
- * is not strictly necessary. */
- spin_lock(&pet_lock);
- obd = pet_exp->exp_obd;
- spin_unlock(&pet_lock);
-
- expire_time = CURRENT_SECONDS - (3 * obd_timeout / 2);
-
- CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n",
- obd->obd_name, expire_time);
-
- /* Exports can't be deleted out of the list while we hold
- * the obd lock (class_unlink_export), which means we can't
- * lose the last ref on the export. If they've already been
- * removed from the list, we won't find them here. */
- spin_lock(&obd->obd_dev_lock);
- while (!list_empty(&obd->obd_exports_timed)) {
- exp = list_entry(obd->obd_exports_timed.next,
- struct obd_export,exp_obd_chain_timed);
-
- if (expire_time > exp->exp_last_request_time) {
- class_export_get(exp);
- spin_unlock(&obd->obd_dev_lock);
- LCONSOLE_WARN("%s: haven't heard from %s in %ld"
- " seconds. Last request was at %ld. "
- "I think it's dead, and I am evicting "
- "it.\n", obd->obd_name,
- obd_export_nid2str(exp),
- (long)(CURRENT_SECONDS -
- exp->exp_last_request_time),
- exp->exp_last_request_time);
-
-
- class_fail_export(exp);
- class_export_put(exp);
-
- spin_lock(&obd->obd_dev_lock);
- } else {
- /* List is sorted, so everyone below is ok */
- break;
- }
- }
- spin_unlock(&obd->obd_dev_lock);
-
- class_export_put(pet_exp);
-
- spin_lock(&pet_lock);
- pet_exp = NULL;
- spin_unlock(&pet_lock);
- }
- CDEBUG(D_HA, "Exiting Ping Evictor\n");
-
- RETURN(0);
-}
-
-void ping_evictor_start(void)
-{
- int rc;
-
- if (++pet_refcount > 1)
- return;
-
- spin_lock_init(&pet_lock);
- cfs_waitq_init(&pet_waitq);
-
- rc = cfs_kernel_thread(ping_evictor_main, NULL, CLONE_VM | CLONE_FS);
- if (rc < 0) {
- pet_refcount--;
- CERROR("Cannot start ping evictor thread: %d\n", rc);
- }
-}
-EXPORT_SYMBOL(ping_evictor_start);
-
-void ping_evictor_stop(void)
-{
- if (--pet_refcount > 0)
- return;
-
- pet_state = PET_TERMINATE;
- cfs_waitq_signal(&pet_waitq);
-}
-EXPORT_SYMBOL(ping_evictor_stop);
-#else /* !__KERNEL__ */
-#define ping_evictor_wake(exp) 1
-#endif
-
-/* This function makes sure dead exports are evicted in a timely manner.
- This function is only called when some export receives a message (i.e.,
- the network is up.) */
-void class_update_export_timer(struct obd_export *exp, time_t extra_delay)
-{
- struct obd_export *oldest_exp;
- time_t oldest_time;
-
- ENTRY;
-
- LASSERT(exp);
-
- /* Compensate for slow machines, etc, by faking our request time
- into the future. Although this can break the strict time-ordering
- of the list, we can be really lazy here - we don't have to evict
- at the exact right moment. Eventually, all silent exports
- will make it to the top of the list. */
- exp->exp_last_request_time = max(exp->exp_last_request_time,
- (time_t)CURRENT_SECONDS + extra_delay);
-
- CDEBUG(D_INFO, "updating export %s at %ld\n",
- exp->exp_client_uuid.uuid,
- exp->exp_last_request_time);
-
- /* exports may get disconnected from the chain even though the
- export has references, so we must keep the spin lock while
- manipulating the lists */
- spin_lock(&exp->exp_obd->obd_dev_lock);
-
- if (list_empty(&exp->exp_obd_chain_timed)) {
- /* this one is not timed */
- spin_unlock(&exp->exp_obd->obd_dev_lock);
- EXIT;
- return;
- }
-
- list_move_tail(&exp->exp_obd_chain_timed,
- &exp->exp_obd->obd_exports_timed);
-
- oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next,
- struct obd_export, exp_obd_chain_timed);
- oldest_time = oldest_exp->exp_last_request_time;
- spin_unlock(&exp->exp_obd->obd_dev_lock);
-
- if (exp->exp_obd->obd_recovering) {
- /* be nice to everyone during recovery */
- EXIT;
- return;
- }
-
- /* Note - racing to start/reset the obd_eviction timer is safe */
- if (exp->exp_obd->obd_eviction_timer == 0) {
- /* Check if the oldest entry is expired. */
- if (CURRENT_SECONDS > (oldest_time +
- (3 * obd_timeout / 2) + extra_delay)) {
- /* We need a second timer, in case the net was down and
- * it just came back. Since the pinger may skip every
- * other PING_INTERVAL (see note in ptlrpc_pinger_main),
- * we better wait for 3. */
- exp->exp_obd->obd_eviction_timer = CURRENT_SECONDS +
- 3 * PING_INTERVAL;
- CDEBUG(D_HA, "%s: Think about evicting %s from %ld\n",
- exp->exp_obd->obd_name, obd_export_nid2str(exp),
- oldest_time);
- }
- } else {
- if (CURRENT_SECONDS > (exp->exp_obd->obd_eviction_timer +
- extra_delay)) {
- /* The evictor won't evict anyone who we've heard from
- * recently, so we don't have to check before we start
- * it. */
- if (!ping_evictor_wake(exp))
- exp->exp_obd->obd_eviction_timer = 0;
- }
- }
-
- EXIT;
-}
-EXPORT_SYMBOL(class_update_export_timer);
-
#define EVICT_BATCH 32
int obd_export_evict_by_nid(struct obd_device *obd, char *nid)
{
if (obd->obd_type == NULL)
continue;
- atomic_inc(&obd->obd_refcount);
+ class_incref(obd);
spin_unlock(&obd_dev_lock);
if (obd_health_check(obd)) {
attr->ia_gid = oa->o_gid;
attr->ia_valid |= ATTR_GID;
}
-
- if (valid & OBD_MD_FLFLAGS) {
- attr->ia_attr_flags = oa->o_flags;
- attr->ia_valid |= ATTR_ATTR_FLAG;
- }
}
EXPORT_SYMBOL(iattr_from_obdo);
LTIME_S(dst->i_ctime) = src->o_ctime;
if (valid & OBD_MD_FLSIZE)
dst->i_size = src->o_size;
- if (valid & OBD_MD_FLBLOCKS) /* allocation of space */
+ if (valid & OBD_MD_FLBLOCKS) { /* allocation of space */
dst->i_blocks = src->o_blocks;
+ if (dst->i_blocks < src->o_blocks) /* overflow */
+ dst->i_blocks = -1;
+
+ }
if (valid & OBD_MD_FLBLKSZ)
dst->i_blksize = src->o_blksize;
if (valid & OBD_MD_FLTYPE)
&proc_dostring, &sysctl_string },
{OBD_MEMUSED, "memused", (int *)&obd_memory.counter,
sizeof(int), 0644, NULL, &proc_dointvec},
- {OBD_SYNCFILTER, "filter_sync_on_commit", &obd_sync_filter, sizeof(int),
- 0644, NULL, &proc_dointvec},
{OBD_LDLM_TIMEOUT, "ldlm_timeout", &ldlm_timeout, sizeof(int), 0644,
NULL, &proc_set_timeout},
{ 0 }
rc = llog_lvfs_close(handle);
if (rc == 0) {
- down(&inode->i_sem);
+ LOCK_INODE_MUTEX(inode);
rc = vfs_unlink(inode, fdentry);
- up(&inode->i_sem);
+ UNLOCK_INODE_MUTEX(inode);
}
dput(fdentry);
obd->obd_llog_ctxt[index] = ctxt;
ctxt->loc_obd = obd;
- ctxt->loc_exp = disk_obd->obd_self_export;
+ ctxt->loc_exp = class_export_get(disk_obd->obd_self_export);
ctxt->loc_idx = index;
ctxt->loc_logops = op;
sema_init(&ctxt->loc_sem, 1);
rc = CTXTP(ctxt, cleanup)(ctxt);
ctxt->loc_obd->obd_llog_ctxt[ctxt->loc_idx] = NULL;
+ if (ctxt->loc_exp)
+ class_export_put(ctxt->loc_exp);
OBD_FREE(ctxt, sizeof(*ctxt));
RETURN(rc);
imp_state_name = ptlrpc_import_state_name(imp->imp_state);
*eof = 1;
return snprintf(page, count, "%s\t%s%s\n",
- imp->imp_target_uuid.uuid, imp_state_name,
+ obd2cli_tgt(obd), imp_state_name,
imp->imp_deactive ? "\tDEACTIVATED" : "");
}
"initial_transno",
"inode_bit_locks",
"join_file",
+ "",
+ "no_oh_for_devices",
NULL
};
LPROCFS_OBD_OP_INIT(num_private_stats, stats, iocontrol);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_info);
- LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info_async);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, attach);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, detach);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, setup);
CFS_INIT_LIST_HEAD(&obd->obd_exports);
CFS_INIT_LIST_HEAD(&obd->obd_exports_timed);
- obd->obd_num_exports = 0;
spin_lock_init(&obd->obd_dev_lock);
spin_lock_init(&obd->obd_osfs_lock);
obd->obd_osfs_age = cfs_time_shift(-1000);
obd->obd_attached = 1;
type->typ_refcnt++;
- CDEBUG(D_IOCTL, "OBD: dev %d attached type %s\n",
- obd->obd_minor, typename);
+ CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n",
+ obd->obd_minor, typename, atomic_read(&obd->obd_refcount));
RETURN(0);
out:
switch (cleanup_phase) {
obd->obd_set_up = 1;
spin_lock(&obd->obd_dev_lock);
/* cleanup drops this */
- atomic_inc(&obd->obd_refcount);
+ class_incref(obd);
spin_unlock(&obd->obd_dev_lock);
CDEBUG(D_IOCTL, "finished setup of obd %s (uuid %s)\n",
RETURN(err);
}
+struct obd_device *class_incref(struct obd_device *obd)
+{
+ atomic_inc(&obd->obd_refcount);
+ CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, obd,
+ atomic_read(&obd->obd_refcount));
+
+ return obd;
+}
+
void class_decref(struct obd_device *obd)
{
int err;
refs = atomic_read(&obd->obd_refcount);
spin_unlock(&obd->obd_dev_lock);
- CDEBUG(D_INFO, "Decref %s now %d\n", obd->obd_name, refs);
+ CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs);
if ((refs == 1) && obd->obd_stopping) {
/* All exports (other than the self-export) have been
spin_lock (&ec->ec_lock);
eco = echo_find_object_locked (obd, oa->o_id);
if (eco != NULL) {
- if (eco->eco_deleted) { /* being deleted */
- spin_unlock(&ec->ec_lock); /* (see comment in cleanup) */
+ if (eco->eco_deleted) { /* being deleted */
+ spin_unlock(&ec->ec_lock);/* (see comment in cleanup) */
return (-EAGAIN);
}
-
+
eco->eco_refcount++;
spin_unlock (&ec->ec_lock);
*ecop = eco;
lustre_cfg_string(lcfg, 1));
return -ENOMEM;
}
-
+
+ ocd->ocd_connect_flags = OBD_CONNECT_VERSION;
ocd->ocd_version = LUSTRE_VERSION_CODE;
rc = obd_connect(&conn, tgt, &echo_uuid, ocd);
*/
/*
- * Invariant: Get O/R i_sem for lookup, if needed, before any journal ops
+ * Invariant: Get O/R i_mutex for lookup, if needed, before any journal ops
* (which need to get journal_lock, may block if journal full).
*
* Invariant: Call filter_start_transno() before any journal ops to avoid the
* same deadlock problem. We can (and want) to get rid of the
- * transno sem in favour of the dir/inode i_sem to avoid single
+ * transno sem in favour of the dir/inode i_mutex to avoid single
* threaded operation on the OST.
*/
GOTO(err_fsd, rc);
}
if (strcmp(fsd->fsd_uuid, obd->obd_uuid.uuid) != 0) {
- CERROR("OBD UUID %s does not match last_rcvd UUID %s\n",
- obd->obd_uuid.uuid, fsd->fsd_uuid);
+ LCONSOLE_ERROR("Trying to start OBD %s using the wrong"
+ " disk %s. Were the /dev/ assignments "
+ "rearranged?\n",
+ obd->obd_uuid.uuid, fsd->fsd_uuid);
GOTO(err_fsd, rc = -EINVAL);
}
mount_count = le64_to_cpu(fsd->fsd_mount_count);
GOTO(cleanup_O0, rc = -EEXIST);
}
- down(&O_dentry->d_inode->i_sem);
+ LOCK_INODE_MUTEX(O_dentry->d_inode);
rc = vfs_rename(O_dentry->d_inode, dentry,
O_dentry->d_inode, O0_dentry);
- up(&O_dentry->d_inode->i_sem);
+ UNLOCK_INODE_MUTEX(O_dentry->d_inode);
if (rc) {
CERROR("error renaming O/R to O/0: rc %d\n", rc);
static int filter_lock_dentry(struct obd_device *obd, struct dentry *dparent)
{
- down(&dparent->d_inode->i_sem);
+ LOCK_INODE_MUTEX(dparent->d_inode); /* i_sem -> i_mutex compat macro; serializes ops under this parent */
return 0;
}
/* We never dget the object parent, so DON'T dput it either */
static void filter_parent_unlock(struct dentry *dparent)
{
- up(&dparent->d_inode->i_sem);
+ UNLOCK_INODE_MUTEX(dparent->d_inode); /* pairs with filter_lock_dentry() above */
}
/* How to get files, dentries, inodes from object id's.
ENTRY;
/* don't need dir->i_zombie for 2.4, it is for rename/unlink of dir
- * itself we already hold dir->i_sem for child create/unlink ops */
- LASSERT(down_trylock(&dir->i_sem) != 0);
- LASSERT(down_trylock(&dentry->d_inode->i_sem) != 0);
+ * itself we already hold dir->i_mutex for child create/unlink ops */
+ LASSERT(TRYLOCK_INODE_MUTEX(dir) == 0);
+ LASSERT(TRYLOCK_INODE_MUTEX(dentry->d_inode) == 0);
+
/* may_delete() */
if (!dentry->d_inode || dentry->d_parent->d_inode != dir)
IS_APPEND(dentry->d_inode) || IS_IMMUTABLE(dentry->d_inode))
GOTO(out, rc = -EPERM);
- /* NOTE: This might need to go outside i_sem, though it isn't clear if
+ /* NOTE: This might need to go outside i_mutex, though it isn't clear if
* that was done because of journal_start (which is already done
* here) or some other ordering issue. */
DQUOT_INIT(dir);
rc = dir->i_op->unlink(dir, dentry);
out:
- /* need to drop i_sem before we lose inode reference */
- up(&dentry->d_inode->i_sem);
+ /* need to drop i_mutex before we lose inode reference */
+ UNLOCK_INODE_MUTEX(dentry->d_inode);
if (rc == 0)
d_delete(dentry);
}
/* Caller must hold LCK_PW on parent and push us into kernel context.
- * Caller must hold child i_sem, we drop it always.
+ * Caller must hold child i_mutex, we drop it always.
* Caller is also required to ensure that dchild->d_inode exists. */
static int filter_destroy_internal(struct obd_device *obd, obd_id objid,
struct dentry *dparent,
LASSERT(!lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb)));
obd->obd_replayable = 1;
- obd_sync_filter = 1;
if (lcfg->lcfg_bufcount > 3 && LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
str = lustre_cfg_string(lcfg, 3);
if (strchr(str, 'n')) {
CWARN("%s: recovery disabled\n", obd->obd_name);
obd->obd_replayable = 0;
- obd_sync_filter = 0;
}
}
lproc_filter_attach_seqstat(obd);
}
- ping_evictor_start();
-
return rc;
}
RETURN(rc);
}
-static int filter_precleanup(struct obd_device *obd, int stage)
+static int filter_precleanup(struct obd_device *obd,
+ enum obd_cleanup_stage stage)
{
 int rc = 0;
 ENTRY;
 switch(stage) {
+ case OBD_CLEANUP_EARLY: /* no early-stage work for the filter */
+ break;
 case OBD_CLEANUP_EXPORTS:
 target_cleanup_recovery(obd);
 break;
 case OBD_CLEANUP_SELF_EXP:
 rc = filter_llog_finish(obd, 0);
+ break; /* required now that OBD_CLEANUP_OBD follows; was the last case before */
+ case OBD_CLEANUP_OBD: /* NOTE(review): presumably listed to cover the full enum -- confirm */
+ break;
 }
 RETURN(rc);
}
}
}
- ping_evictor_stop();
-
lquota_cleanup(quota_interface, obd);
ldlm_namespace_free(obd->obd_namespace, obd->obd_force);
target_destroy_export(exp);
+ if (obd_uuid_equals(&exp->exp_client_uuid, &exp->exp_obd->obd_uuid))
+ RETURN(0);
+
if (exp->exp_obd->obd_replayable)
filter_client_free(exp);
else
}
if (ia_valid & ATTR_SIZE || ia_valid & (ATTR_UID | ATTR_GID)) {
- down(&inode->i_sem);
+ LOCK_INODE_MUTEX(inode);
locked = 1;
}
}
if (locked) {
- up(&inode->i_sem);
+ UNLOCK_INODE_MUTEX(inode);
locked = 0;
}
EXIT;
out_unlock:
if (locked)
- up(&inode->i_sem);
+ UNLOCK_INODE_MUTEX(inode);
/* trigger quota release */
if (ia_valid & (ATTR_SIZE | ATTR_UID | ATTR_GID)) {
unsigned int qcids[MAXQUOTAS] = {0, 0};
struct obd_device *obd;
struct filter_obd *filter;
- struct dentry *dchild = NULL, *dparent;
+ struct dentry *dchild = NULL, *dparent = NULL;
struct lvfs_run_ctxt saved;
void *handle = NULL;
struct llog_cookie *fcc = NULL;
* restart transaction
* (see BUG 4180) -bzzz
*/
- down(&dchild->d_inode->i_sem);
+ LOCK_INODE_MUTEX(dchild->d_inode);
handle = fsfilt_start_log(obd, dchild->d_inode, FSFILT_OP_SETATTR,
NULL, 1);
if (IS_ERR(handle)) {
- up(&dchild->d_inode->i_sem);
+ UNLOCK_INODE_MUTEX(dchild->d_inode);
GOTO(cleanup, rc = PTR_ERR(handle));
}
iattr.ia_size = 0;
rc = fsfilt_setattr(obd, dchild, handle, &iattr, 1);
rc2 = fsfilt_commit(obd, dchild->d_inode, handle, 0);
- up(&dchild->d_inode->i_sem);
+ UNLOCK_INODE_MUTEX(dchild->d_inode);
if (rc)
GOTO(cleanup, rc);
if (rc2)
GOTO(cleanup, rc = PTR_ERR(dparent));
cleanup_phase = 3; /* filter_parent_unlock */
- down(&dchild->d_inode->i_sem);
+ LOCK_INODE_MUTEX(dchild->d_inode);
handle = fsfilt_start_log(obd, dparent->d_inode,FSFILT_OP_UNLINK,oti,1);
if (IS_ERR(handle)) {
- up(&dchild->d_inode->i_sem);
+ UNLOCK_INODE_MUTEX(dchild->d_inode);
GOTO(cleanup, rc = PTR_ERR(handle));
}
cleanup_phase = 4; /* fsfilt_commit */
/* Quota release need uid/gid of inode */
obdo_from_inode(oa, dchild->d_inode, OBD_MD_FLUID|OBD_MD_FLGID);
- /* this drops dchild->d_inode->i_sem unconditionally */
+ /* this drops dchild->d_inode->i_mutex unconditionally */
rc = filter_destroy_internal(obd, oa->o_id, dparent, dchild);
EXIT;
push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
- down(&dentry->d_inode->i_sem);
+ LOCK_INODE_MUTEX(dentry->d_inode);
+
rc = filemap_fdatawrite(dentry->d_inode->i_mapping);
if (rc == 0) {
/* just any file to grab fsync method - "file" arg unused */
if (!rc)
rc = rc2;
}
- up(&dentry->d_inode->i_sem);
+ UNLOCK_INODE_MUTEX(dentry->d_inode);
oa->o_valid = OBD_MD_FLID;
obdo_from_inode(oa, dentry->d_inode, FILTER_VALID_FLAGS);
RETURN(-EINVAL);
}
-static int filter_set_info(struct obd_export *exp, __u32 keylen,
- void *key, __u32 vallen, void *val)
+static int filter_set_info_async(struct obd_export *exp, __u32 keylen,
+ void *key, __u32 vallen, void *val,
+ struct ptlrpc_request_set *set)
{
struct obd_device *obd;
struct llog_ctxt *ctxt;
static struct obd_ops filter_obd_ops = {
.o_owner = THIS_MODULE,
.o_get_info = filter_get_info,
- .o_set_info = filter_set_info,
+ .o_set_info_async = filter_set_info_async,
.o_setup = filter_setup,
.o_precleanup = filter_precleanup,
.o_cleanup = filter_cleanup,
static struct obd_ops filter_sanobd_ops = {
.o_owner = THIS_MODULE,
.o_get_info = filter_get_info,
- .o_set_info = filter_set_info,
+ .o_set_info_async = filter_set_info_async,
.o_setup = filter_san_setup,
.o_precleanup = filter_precleanup,
.o_cleanup = filter_cleanup,
CERROR("Failure to commit OST transaction (%d)?\n", err);
rc = err;
}
- if (obd_sync_filter && !err)
+ if (obd->obd_replayable && !err)
LASSERTF(oti->oti_transno <= obd->obd_last_committed,
"oti_transno "LPU64" last_committed "LPU64"\n",
oti->oti_transno, obd->obd_last_committed);
rc = generic_osync_inode(inode, inode->i_mapping,
OSYNC_DATA|OSYNC_METADATA);
*/
+ down(&inode->i_sem);
+ current->flags |= PF_SYNCWRITE;
rc = filemap_fdatawrite(inode->i_mapping);
rc2 = sync_mapping_buffers(inode->i_mapping);
if (rc == 0)
rc = rc2;
rc2 = filemap_fdatawait(inode->i_mapping);
+ current->flags &= ~PF_SYNCWRITE;
+ up(&inode->i_sem);
if (rc == 0)
rc = rc2;
if (rc != 0)
return 0;
}
-/* Must be called with i_sem taken for writes; this will drop it */
+/* Must be called with i_mutex taken for writes; this will drop it */
int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *iobuf,
struct obd_export *exp, struct iattr *attr,
struct obd_trans_info *oti, void **wait_handle)
oti->oti_handle, attr, 0);
}
- up(&inode->i_sem);
+ UNLOCK_INODE_MUTEX(inode);
rc2 = filter_finish_transno(exp, oti, 0);
if (rc2 != 0) {
push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
cleanup_phase = 2;
- down(&inode->i_sem);
- fsfilt_check_slow(now, obd_timeout, "i_sem");
+ LOCK_INODE_MUTEX(inode);
+ fsfilt_check_slow(now, obd_timeout, "i_mutex");
oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, res,
oti);
if (IS_ERR(oti->oti_handle)) {
- up(&inode->i_sem);
+ UNLOCK_INODE_MUTEX(inode);
rc = PTR_ERR(oti->oti_handle);
CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR,
"error starting transaction: rc = %d\n", rc);
rc = filter_update_fidea(exp, inode, oti->oti_handle, oa);
}
- /* filter_direct_io drops i_sem */
+ /* filter_direct_io drops i_mutex */
rc = filter_direct_io(OBD_BRW_WRITE, res->dentry, iobuf, exp, &iattr,
oti, &wait_handle);
if (rc == 0)
if (err)
rc = err;
- if (obd_sync_filter && !err)
+ if (obd->obd_replayable && !err)
LASSERTF(oti->oti_transno <= obd->obd_last_committed,
"oti_transno "LPU64" last_committed "LPU64"\n",
oti->oti_transno, obd->obd_last_committed);
struct ost_filterdata *ofd;
ENTRY;
- down(&inode->i_sem);
+ LOCK_INODE_MUTEX(inode);
ofd = inode->i_filterdata;
if (ofd && ofd->ofd_epoch >= io_epoch) {
if (ofd->ofd_epoch > io_epoch)
CERROR("client sent old epoch %d for obj ino %ld\n",
io_epoch, inode->i_ino);
- up(&inode->i_sem);
+ UNLOCK_INODE_MUTEX(inode);
RETURN(0);
}
ofd->ofd_epoch = io_epoch;
}
/* the decision to write a record is now made, unlock */
- up(&inode->i_sem);
+ UNLOCK_INODE_MUTEX(inode);
OBD_ALLOC(lsc, sizeof(*lsc));
if (lsc == NULL)
ENTRY;
LASSERT(res);
- LASSERT(down_trylock(&res->lr_lvb_sem) != 0);
+ LASSERT_SEM_LOCKED(&res->lr_lvb_sem);
/* we only want lvb's for object resources */
/* check for internal locks: these have name[1] != 0 */
{
struct obd_device *dev = data;
struct client_obd *cli = &dev->u.cli;
+ struct ptlrpc_request_pool *pool = cli->cl_import->imp_rq_pool;
int val, rc;
rc = lprocfs_write_helper(buffer, count, &val);
if (val < 1 || val > OSC_MAX_RIF_MAX)
return -ERANGE;
- if (cli->cl_rq_pool && val > cli->cl_max_rpcs_in_flight)
- cli->cl_rq_pool->prp_populate(cli->cl_rq_pool,
- val - cli->cl_max_rpcs_in_flight);
+ if (pool && val > cli->cl_max_rpcs_in_flight)
+ pool->prp_populate(pool, val-cli->cl_max_rpcs_in_flight);
client_obd_list_lock(&cli->cl_loi_list_lock);
cli->cl_max_rpcs_in_flight = val;
spin_unlock(&oscc->oscc_lock);
DEBUG_REQ(D_ERROR, req,
"unknown rc %d from async create: failing oscc", rc);
- ptlrpc_fail_import(req->rq_import, req->rq_import_generation);
+ ptlrpc_fail_import(req->rq_import, req->rq_reqmsg->conn_cnt);
} else {
if (rc == 0) {
oscc->oscc_flags &= ~OSCC_FLAG_LOW;
if (rc == 0)
CDEBUG(D_HA, "%s: returning objid "LPU64"\n",
- oscc->oscc_obd->u.cli.cl_import->imp_target_uuid.uuid,
- lsm->lsm_object_id);
+ obd2cli_tgt(oscc->oscc_obd), lsm->lsm_object_id);
else if (*ea == NULL)
obd_free_memmd(exp, &lsm);
RETURN(rc);
ENTRY;
opc = ((cmd & OBD_BRW_WRITE) != 0) ? OST_WRITE : OST_READ;
- pool = ((cmd & OBD_BRW_WRITE) != 0) ? cli->cl_rq_pool : NULL;
+ pool = ((cmd & OBD_BRW_WRITE) != 0) ? imp->imp_rq_pool : NULL;
for (niocount = i = 1; i < page_count; i++)
if (!can_merge_pages(&pga[i - 1], &pga[i]))
"i %d p_c %u pg %p [pri %lu ind %lu] off "LPU64
" prev_pg %p [pri %lu ind %lu] off "LPU64"\n",
i, page_count,
- pg->pg, pg->pg->private, pg->pg->index, pg->off,
- pg_prev->pg, pg_prev->pg->private, pg_prev->pg->index,
- pg_prev->off);
+ pg->pg, page_private(pg->pg), pg->pg->index, pg->off,
+ pg_prev->pg, page_private(pg_prev->pg),
+ pg_prev->pg->index, pg_prev->off);
#else
LASSERTF(i == 0 || pg->off > pg_prev->off,
"i %d p_c %u\n", i, page_count);
*oa = *saved_oa;
} else if (page_count > pages_per_brw) {
/* save a copy of oa (brw will clobber it) */
- OBD_ALLOC(saved_oa, sizeof(*saved_oa));
- if (saved_oa == NULL) {
- CERROR("Can't save oa (ENOMEM)\n");
+ saved_oa = obdo_alloc();
+ if (saved_oa == NULL)
RETURN(-ENOMEM);
- }
*saved_oa = *oa;
}
-
+
rc = osc_brw_internal(cmd, exp, oa, md, pages_per_brw, pga);
if (rc != 0)
}
if (saved_oa != NULL)
- OBD_FREE(saved_oa, sizeof(*saved_oa));
+ obdo_free(saved_oa);
RETURN(rc);
}
GOTO(unlock, 0);
}
- /* we don't get interruption callbacks until osc_trigger_sync_io()
+ /* we don't get interruption callbacks until osc_trigger_group_io()
* has been called and put the sync oaps in the pending/urgent lists.*/
if (!list_empty(&oap->oap_pending_item)) {
list_del_init(&oap->oap_pending_item);
- if (oap->oap_async_flags & ASYNC_URGENT)
- list_del_init(&oap->oap_urgent_item);
+ list_del_init(&oap->oap_urgent_item);
loi = oap->oap_loi;
lop = (oap->oap_cmd & OBD_BRW_WRITE) ?
oap = list_entry(pos, struct osc_async_page, oap_pending_item);
list_del(&oap->oap_pending_item);
list_add_tail(&oap->oap_pending_item, &lop->lop_pending);
- list_add(&oap->oap_urgent_item, &lop->lop_urgent);
+ if (oap->oap_async_flags & ASYNC_URGENT)
+ list_add(&oap->oap_urgent_item, &lop->lop_urgent);
lop_update_pending(cli, lop, cmd, 1);
}
loi_list_maint(cli, loi);
struct lov_stripe_md *lsm, obd_count page_count,
struct brw_page *pga)
{
- struct client_obd *cli = &exp->exp_obd->u.cli;
struct ptlrpc_request *request = NULL;
struct ost_body *body;
struct niobuf_remote *nioptr;
request = ptlrpc_prep_req_pool(class_exp2cliimp(exp),
LUSTRE_OST_VERSION, OST_SAN_WRITE,
- 3, size, NULL, cli->cl_rq_pool);
+ 3, size, NULL, imp->imp_rq_pool);
if (!request)
RETURN(-ENOMEM);
goto no_match;
/* Next, search for already existing extent locks that will cover us */
- rc = ldlm_lock_match(obd->obd_namespace, 0, &res_id, type, policy, mode,
- lockh);
+ rc = ldlm_lock_match(obd->obd_namespace, *flags, &res_id, type, policy,
+ mode, lockh);
if (rc == 1) {
osc_set_data_with_check(lockh, data, *flags);
if (*flags & LDLM_FL_HAS_INTENT) {
* locks out from other users right now, too. */
if (mode == LCK_PR) {
- rc = ldlm_lock_match(obd->obd_namespace, 0, &res_id, type,
+ rc = ldlm_lock_match(obd->obd_namespace, *flags, &res_id, type,
policy, LCK_PW, lockh);
if (rc == 1) {
/* FIXME: This is not incredibly elegant, but it might
req->rq_replen = lustre_msg_size(2, size);
}
+ /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */
+ *flags &= ~LDLM_FL_BLOCK_GRANTED;
+
rc = ldlm_cli_enqueue(exp, req, obd->obd_namespace, res_id, type,
policy, mode, flags, bl_cb, cp_cb, gl_cb, data,
&lvb, sizeof(lvb), lustre_swab_ost_lvb, lockh);
RETURN(-EINVAL);
}
-static int osc_set_info(struct obd_export *exp, obd_count keylen,
- void *key, obd_count vallen, void *val)
+static int osc_setinfo_mds_conn_interpret(struct ptlrpc_request *req,
+ void *aa, int rc) /* reply callback for the async "mds_conn" OST_SET_INFO RPC */
+{
+ struct llog_ctxt *ctxt;
+ struct obd_import *imp = req->rq_import;
+ ENTRY;
+
+ if (rc != 0)
+ RETURN(rc); /* RPC failed: leave llog/pinger state untouched */
+
+ ctxt = llog_get_context(imp->imp_obd, LLOG_MDS_OST_ORIG_CTXT);
+ if (ctxt) {
+ if (rc == 0) /* always true here (early return above makes rc == 0) */
+ rc = llog_initiator_connect(ctxt);
+ else /* NOTE(review): dead branch carried over from the synchronous osc_set_info */
+ CERROR("cannot establish connection for "
+ "ctxt %p: %d\n", ctxt, rc);
+ }
+
+ imp->imp_server_timeout = 1; /* NOTE(review): presumably enables server-side timeout handling -- confirm */
+ CDEBUG(D_HA, "pinging OST %s\n", obd2cli_tgt(imp->imp_obd));
+ imp->imp_pingable = 1; /* let the pinger keep this OST import alive */
+
+ RETURN(rc);
+}
+
+static int osc_set_info_async(struct obd_export *exp, obd_count keylen,
+ void *key, obd_count vallen, void *val,
+ struct ptlrpc_request_set *set)
{
struct ptlrpc_request *req;
struct obd_device *obd = exp->exp_obd;
struct obd_import *imp = class_exp2cliimp(exp);
- struct llog_ctxt *ctxt;
- int rc, size[2] = {keylen, vallen};
+ int size[2] = {keylen, vallen};
char *bufs[2] = {key, val};
ENTRY;
RETURN(0);
}
-
+
if (KEY_IS("unlinked")) {
struct osc_creator *oscc = &obd->u.cli.cl_oscc;
spin_lock(&oscc->oscc_lock);
}
if (KEY_IS("initial_recov")) {
- struct obd_import *imp = exp->exp_obd->u.cli.cl_import;
if (vallen != sizeof(int))
RETURN(-EINVAL);
imp->imp_initial_recov = *(int *)val;
RETURN(0);
}
- if (!KEY_IS("mds_conn") && !KEY_IS("evict_by_nid"))
+ if (!set)
RETURN(-EINVAL);
+ /* We pass all other commands directly to OST. Since nobody calls osc
+ methods directly and everybody is supposed to go through LOV, we
+ assume lov checked invalid values for us.
+ The only recognised values so far are evict_by_nid and mds_conn.
+ Even if something bad goes through, we'd get a -EINVAL from OST
+ anyway. */
req = ptlrpc_prep_req(imp, LUSTRE_OST_VERSION, OST_SET_INFO,
2, size, bufs);
RETURN(-ENOMEM);
req->rq_replen = lustre_msg_size(0, NULL);
- rc = ptlrpc_queue_wait(req);
- ptlrpc_req_finished(req);
- ctxt = llog_get_context(exp->exp_obd, LLOG_MDS_OST_ORIG_CTXT);
- if (ctxt) {
- if (rc == 0)
- rc = llog_initiator_connect(ctxt);
- else
- CERROR("cannot establish connection for ctxt %p: %d\n",
- ctxt, rc);
- }
-
- imp->imp_server_timeout = 1;
- CDEBUG(D_HA, "pinging OST %s\n", imp->imp_target_uuid.uuid);
- imp->imp_pingable = 1;
+ if (KEY_IS("mds_conn"))
+ req->rq_interpret_reply = osc_setinfo_mds_conn_interpret;
+ ptlrpc_set_add_req(set, req);
+ ptlrpc_check_set(set);
- RETURN(rc);
+ RETURN(0);
}
int osc_setup(struct obd_device *obd, obd_count len, void *buf)
{
int rc;
+ ENTRY;
ENTRY;
rc = ptlrpcd_addref();
previous ones. Ideally we want to have 2x max_rpcs_in_flight
reserved, but I afraid that might be too much wasted RAM
in fact, so 2 is just my guess and still should work. */
- cli->cl_rq_pool = ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2,
- OST_MAXREQSIZE,
- ptlrpc_add_rqs_to_pool);
+ cli->cl_import->imp_rq_pool =
+ ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2,
+ OST_MAXREQSIZE,
+ ptlrpc_add_rqs_to_pool);
}
RETURN(rc);
}
-static int osc_precleanup(struct obd_device *obd, int stage)
+static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
{
int rc = 0;
ENTRY;
ptlrpc_deactivate_import(imp);
break;
}
+ case OBD_CLEANUP_EXPORTS:
+ break;
case OBD_CLEANUP_SELF_EXP:
rc = obd_llog_finish(obd, 0);
if (rc != 0)
CERROR("failed to cleanup llogging subsystems\n");
+ break;
+ case OBD_CLEANUP_OBD:
+ break;
}
RETURN(rc);
}
int osc_cleanup(struct obd_device *obd)
{
struct osc_creator *oscc = &obd->u.cli.cl_oscc;
- struct client_obd *cli = &obd->u.cli;
int rc;
ENTRY;
rc = client_obd_cleanup(obd);
- ptlrpc_free_rq_pool(cli->cl_rq_pool);
-
ptlrpcd_decref();
RETURN(rc);
}
.o_join_lru = osc_join_lru,
.o_iocontrol = osc_iocontrol,
.o_get_info = osc_get_info,
- .o_set_info = osc_set_info,
+ .o_set_info_async = osc_set_info_async,
.o_import_event = osc_import_event,
.o_llog_init = osc_llog_init,
.o_llog_finish = osc_llog_finish,
#if defined(__KERNEL__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
struct obd_ops sanosc_obd_ops = {
.o_owner = THIS_MODULE,
- .o_cleanup = client_obd_cleanup,
+ .o_setup = client_sanobd_setup,
+ .o_precleanup = osc_precleanup,
+ .o_cleanup = osc_cleanup,
.o_add_conn = client_import_add_conn,
.o_del_conn = client_import_del_conn,
.o_connect = client_connect_import,
.o_getattr = osc_getattr,
.o_getattr_async = osc_getattr_async,
.o_setattr = osc_setattr,
- .o_setup = client_sanobd_setup,
.o_brw = sanosc_brw,
.o_punch = osc_punch,
.o_sync = osc_sync,
};
#endif
-static quota_interface_t *quota_interface;
extern quota_interface_t osc_quota_interface;
int __init osc_init(void)
GOTO(out, rc = 0);
}
- rc = obd_set_info(exp, keylen, key, vallen, val);
+ rc = obd_set_info_async(exp, keylen, key, vallen, val, NULL);
out:
req->rq_repmsg->status = 0;
RETURN(rc);
if (rc)
GOTO(out_io, rc = -EINVAL);
+ ping_evictor_start();
+
RETURN(0);
out_io:
int err = 0;
ENTRY;
+ ping_evictor_stop();
+
spin_lock_bh(&obd->obd_processing_task_lock);
if (obd->obd_recovering) {
target_cancel_recovery_timer(obd);
noinst_LIBRARIES = libptlrpc.a
libptlrpc_a_SOURCES = $(COMMON_SOURCES)
-libptlrpc_a_CPPFLAGS = $(LLCPPFLGS)
+libptlrpc_a_CPPFLAGS = $(LLCPPFLAGS)
libptlrpc_a_CFLAGS = $(LLCFLAGS)
endif
endif # MODULES
install-data-hook: $(install_data_hook)
-
-MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ ldlm_*.c l_lock.c
DIST_SOURCES = $(ptlrpc_objs:.o=.c) ptlrpc_internal.h
+MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ ldlm_*.c l_lock.c
list_add_tail(&req->rq_set_chain, &set->set_requests);
req->rq_set = set;
set->set_remaining++;
+
atomic_inc(&req->rq_import->imp_inflight);
}
spin_lock_irqsave(&imp->imp_lock, flags);
}
- if (req->rq_transno > imp->imp_max_transno)
- imp->imp_max_transno = req->rq_transno;
-
/* Replay-enabled imports return commit-status information. */
if (req->rq_repmsg->last_committed)
imp->imp_peer_committed_transno =
RETURN(1);
}
- ptlrpc_fail_import(imp, req->rq_import_generation);
+ ptlrpc_fail_import(imp, req->rq_reqmsg->conn_cnt);
RETURN(0);
}
int rc, timeout;
ENTRY;
- LASSERT(!list_empty(&set->set_requests));
+ if (list_empty(&set->set_requests))
+ RETURN(0);
+
list_for_each(tmp, &set->set_requests) {
req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
if (req->rq_phase == RQ_PHASE_NEW)
LASSERT_SPIN_LOCKED(&imp->imp_lock);
- CDEBUG(D_HA, "%s: committing for last_committed "LPU64"\n",
- imp->imp_obd->obd_name, imp->imp_peer_committed_transno);
+
+ if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked &&
+ imp->imp_generation == imp->imp_last_generation_checked) {
+ CDEBUG(D_HA, "%s: skip recheck for last_committed "LPU64"\n",
+ imp->imp_obd->obd_name, imp->imp_peer_committed_transno);
+ return;
+ }
+
+ CDEBUG(D_HA, "%s: committing for last_committed "LPU64" gen %d\n",
+ imp->imp_obd->obd_name, imp->imp_peer_committed_transno,
+ imp->imp_generation);
+ imp->imp_last_transno_checked = imp->imp_peer_committed_transno;
+ imp->imp_last_generation_checked = imp->imp_generation;
list_for_each_safe(tmp, saved, &imp->imp_replay_list) {
req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
spin_lock_irqsave(&req->rq_lock, flags);
req->rq_net_err = 1;
spin_unlock_irqrestore(&req->rq_lock, flags);
-
+
ptlrpc_wake_client_req(req);
}
- /* this balances the atomic_inc in ptl_send_rpc() */
+ /* these balance the references in ptl_send_rpc() */
+ atomic_dec(&req->rq_import->imp_inflight);
ptlrpc_req_finished(req);
+
EXIT;
}
do { \
if (imp->imp_state != LUSTRE_IMP_CLOSED) { \
CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", \
- imp, imp->imp_target_uuid.uuid, \
+ imp, obd2cli_tgt(imp->imp_obd), \
ptlrpc_import_state_name(imp->imp_state), \
ptlrpc_import_state_name(state)); \
imp->imp_state = state; \
/* Returns true if import was FULL, false if import was already not
* connected.
+ * @imp - import to be disconnected
+ * @conn_cnt - connection count (epoch) of the request that timed out
+ * and caused the disconnection. In some cases, multiple
+ * inflight requests can fail to a single target (e.g. OST
+ * bulk requests) and if one has already caused a reconnection
+ * (increasing the import->conn_cnt) the older failure should
+ * not also cause a reconnection. If zero it forces a reconnect.
*/
-int ptlrpc_set_import_discon(struct obd_import *imp)
+int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt)
{
unsigned long flags;
int rc = 0;
spin_lock_irqsave(&imp->imp_lock, flags);
- if (imp->imp_state == LUSTRE_IMP_FULL) {
+ if (imp->imp_state == LUSTRE_IMP_FULL &&
+ (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) {
char *target_start;
int target_len;
- deuuidify(imp->imp_target_uuid.uuid, NULL,
+ deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
&target_start, &target_len);
LCONSOLE_ERROR("%s: Connection to service %.*s via nid %s was "
imp->imp_replayable ?
"wait for recovery to complete" : "fail");
+ IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
+ spin_unlock_irqrestore(&imp->imp_lock, flags);
+
if (obd_dump_on_timeout)
libcfs_debug_dumplog();
- IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
- spin_unlock_irqrestore(&imp->imp_lock, flags);
obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON);
rc = 1;
} else {
spin_unlock_irqrestore(&imp->imp_lock, flags);
- CDEBUG(D_HA, "%p %s: import already not connected: %s\n",
- imp,imp->imp_client->cli_name,
- ptlrpc_import_state_name(imp->imp_state));
+ CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n",
+ imp->imp_client->cli_name, imp,
+ (imp->imp_state == LUSTRE_IMP_FULL &&
+ imp->imp_conn_cnt > conn_cnt) ?
+ "reconnected" : "not connected", imp->imp_conn_cnt,
+ conn_cnt, ptlrpc_import_state_name(imp->imp_state));
}
return rc;
ENTRY;
spin_lock_irqsave(&imp->imp_lock, flags);
- CDEBUG(D_HA, "setting import %s INVALID\n", imp->imp_target_uuid.uuid);
+ CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd));
imp->imp_invalid = 1;
imp->imp_generation++;
spin_unlock_irqrestore(&imp->imp_lock, flags);
if (rc)
CERROR("%s: rc = %d waiting for callback (%d != 0)\n",
- imp->imp_target_uuid.uuid, rc,
+ obd2cli_tgt(imp->imp_obd), rc,
atomic_read(&imp->imp_inflight));
obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE);
obd_import_event(obd, imp, IMP_EVENT_ACTIVE);
}
-void ptlrpc_fail_import(struct obd_import *imp, int generation)
+void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt)
{
ENTRY;
- LASSERT (!imp->imp_dlm_fake);
+ LASSERT(!imp->imp_dlm_fake);
- if (ptlrpc_set_import_discon(imp)) {
+ if (ptlrpc_set_import_discon(imp, conn_cnt)) {
unsigned long flags;
if (!imp->imp_replayable) {
CDEBUG(D_HA, "import %s@%s for %s not replayable, "
"auto-deactivating\n",
- imp->imp_target_uuid.uuid,
+ obd2cli_tgt(imp->imp_obd),
imp->imp_connection->c_remote_uuid.uuid,
imp->imp_obd->obd_name);
ptlrpc_deactivate_import(imp);
}
CDEBUG(D_HA, "%s: waking up pinger\n",
- imp->imp_target_uuid.uuid);
+ obd2cli_tgt(imp->imp_obd));
spin_lock_irqsave(&imp->imp_lock, flags);
imp->imp_force_verify = 1;
int rc;
__u64 committed_before_reconnect = 0;
struct ptlrpc_request *request;
- int size[] = {sizeof(imp->imp_target_uuid),
+ int size[] = {sizeof(imp->imp_obd->u.cli.cl_target_uuid),
sizeof(obd->obd_uuid),
sizeof(imp->imp_dlm_handle),
sizeof(imp->imp_connect_data)};
- char *tmp[] = {imp->imp_target_uuid.uuid,
+ char *tmp[] = {obd2cli_tgt(imp->imp_obd),
obd->obd_uuid.uuid,
(char *)&imp->imp_dlm_handle,
(char *)&imp->imp_connect_data};
/* last in list */
(imp->imp_conn_current->oic_item.next == &imp->imp_conn_list)) {
CDEBUG(D_HA, "Last connection attempt (%d) for %s\n",
- imp->imp_conn_cnt, imp->imp_target_uuid.uuid);
+ imp->imp_conn_cnt, obd2cli_tgt(imp->imp_obd));
/* Don't retry if connect fails */
rc = 0;
- obd_set_info(obd->obd_self_export,
- strlen("initial_recov"), "initial_recov",
- sizeof(rc), &rc);
+ obd_set_info_async(obd->obd_self_export,
+ strlen("initial_recov"), "initial_recov",
+ sizeof(rc), &rc, NULL);
}
rc = obd_reconnect(imp->imp_obd->obd_self_export, obd,
if (aa->pcaa_initial_connect) {
if (msg_flags & MSG_CONNECT_REPLAYABLE) {
CDEBUG(D_HA, "connected to replayable target: %s\n",
- imp->imp_target_uuid.uuid);
+ obd2cli_tgt(imp->imp_obd));
imp->imp_replayable = 1;
} else {
imp->imp_replayable = 0;
if (!memcmp(&old_hdl, &request->rq_repmsg->handle,
sizeof (old_hdl))) {
CERROR("%s@%s didn't like our handle "LPX64
- ", failed\n", imp->imp_target_uuid.uuid,
+ ", failed\n", obd2cli_tgt(imp->imp_obd),
imp->imp_connection->c_remote_uuid.uuid,
imp->imp_dlm_handle.cookie);
GOTO(out, rc = -ENOTCONN);
sizeof(imp->imp_remote_handle))) {
CERROR("%s@%s changed handle from "LPX64" to "LPX64
"; copying, but this may foreshadow disaster\n",
- imp->imp_target_uuid.uuid,
+ obd2cli_tgt(imp->imp_obd),
imp->imp_connection->c_remote_uuid.uuid,
imp->imp_remote_handle.cookie,
request->rq_repmsg->handle.cookie);
imp->imp_remote_handle = request->rq_repmsg->handle;
} else {
CDEBUG(D_HA, "reconnected to %s@%s after partition\n",
- imp->imp_target_uuid.uuid,
+ obd2cli_tgt(imp->imp_obd),
imp->imp_connection->c_remote_uuid.uuid);
}
} else if (MSG_CONNECT_RECOVERING & msg_flags) {
CDEBUG(D_HA, "%s: reconnected to %s during replay\n",
imp->imp_obd->obd_name,
- imp->imp_target_uuid.uuid);
+ obd2cli_tgt(imp->imp_obd));
imp->imp_resend_replay = 1;
IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
} else {
" was previously committed, server now claims "LPD64
")! See https://bugzilla.clusterfs.com/"
"long_list.cgi?buglist=9646\n",
- imp->imp_target_uuid.uuid, aa->pcaa_peer_committed,
+ obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed,
request->rq_repmsg->last_committed);
}
if (rc == -ENOTCONN) {
CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery;"
"invalidating and reconnecting\n",
- imp->imp_target_uuid.uuid,
+ obd2cli_tgt(imp->imp_obd),
imp->imp_connection->c_remote_uuid.uuid);
ptlrpc_connect_import(imp, NULL);
RETURN(0);
CWARN("Server %s version (%d.%d.%d.%d) is much newer. "
"Consider %s (%s).\n",
- imp->imp_target_uuid.uuid,
+ obd2cli_tgt(imp->imp_obd),
OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
OBD_OCD_VERSION_MINOR(ocd->ocd_version),
OBD_OCD_VERSION_PATCH(ocd->ocd_version),
"refused connection from this client "
"as too old version (%s). Client must "
"be recompiled\n",
- imp->imp_target_uuid.uuid,
+ obd2cli_tgt(imp->imp_obd),
OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
OBD_OCD_VERSION_MINOR(ocd->ocd_version),
OBD_OCD_VERSION_PATCH(ocd->ocd_version),
ptlrpc_maybe_ping_import_soon(imp);
CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n",
- imp->imp_target_uuid.uuid,
+ obd2cli_tgt(imp->imp_obd),
(char *)imp->imp_connection->c_remote_uuid.uuid, rc);
}
ENTRY;
- lock_kernel();
- ptlrpc_daemonize();
-
- cfs_block_allsigs();
- THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "ll_imp_inval");
- unlock_kernel();
-
+ ptlrpc_daemonize("ll_imp_inval");
+
CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n",
- imp->imp_obd->obd_name, imp->imp_target_uuid.uuid,
+ imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
imp->imp_connection->c_remote_uuid.uuid);
ptlrpc_invalidate_import(imp);
ENTRY;
if (imp->imp_state == LUSTRE_IMP_EVICTED) {
- deuuidify(imp->imp_target_uuid.uuid, NULL,
+ deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
&target_start, &target_len);
LCONSOLE_ERROR("This client was evicted by %.*s; in progress "
"operations using this service will fail.\n",
target_len, target_start);
CDEBUG(D_HA, "evicted from %s@%s; invalidating\n",
- imp->imp_target_uuid.uuid,
+ obd2cli_tgt(imp->imp_obd),
imp->imp_connection->c_remote_uuid.uuid);
#ifdef __KERNEL__
if (imp->imp_state == LUSTRE_IMP_REPLAY) {
CDEBUG(D_HA, "replay requested by %s\n",
- imp->imp_target_uuid.uuid);
+ obd2cli_tgt(imp->imp_obd));
rc = ptlrpc_replay_next(imp, &inflight);
if (inflight == 0 &&
atomic_read(&imp->imp_replay_inflight) == 0) {
if (imp->imp_state == LUSTRE_IMP_RECOVER) {
CDEBUG(D_HA, "reconnected to %s@%s\n",
- imp->imp_target_uuid.uuid,
+ obd2cli_tgt(imp->imp_obd),
imp->imp_connection->c_remote_uuid.uuid);
rc = ptlrpc_resend(imp);
IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
ptlrpc_activate_import(imp);
- deuuidify(imp->imp_target_uuid.uuid, NULL,
+ deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
&target_start, &target_len);
LCONSOLE_INFO("%s: Connection restored to service %.*s "
"using nid %s.\n", imp->imp_obd->obd_name,
case MDS_CONNECT: rq_opc = MDS_DISCONNECT; break;
default:
CERROR("don't know how to disconnect from %s (connect_op %d)\n",
- imp->imp_target_uuid.uuid, imp->imp_connect_op);
+ obd2cli_tgt(imp->imp_obd), imp->imp_connect_op);
RETURN(-EINVAL);
}
request->rq_reply_portal);
}
- ptlrpc_request_addref(request); /* +1 ref for the SENT callback */
+ /* add references on request and import for request_out_callback */
+ ptlrpc_request_addref(request);
+ atomic_inc(&request->rq_import->imp_inflight);
+
+ OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5);
request->rq_sent = CURRENT_SECONDS;
ptlrpc_pinger_sending_on_import(request->rq_import);
- rc = ptl_send_buf(&request->rq_req_md_h,
+ rc = ptl_send_buf(&request->rq_req_md_h,
request->rq_reqmsg, request->rq_reqlen,
- LNET_NOACK_REQ, &request->rq_req_cbid,
+ LNET_NOACK_REQ, &request->rq_req_cbid,
connection,
request->rq_request_portal,
request->rq_xid);
RETURN(rc);
}
- ptlrpc_req_finished (request); /* drop callback ref */
+ /* drop request_out_callback refs, we couldn't start the send */
+ atomic_dec(&request->rq_import->imp_inflight);
+ ptlrpc_req_finished (request);
if (noreply)
RETURN(rc);
(long long)FMODE_READ);
LASSERTF(FMODE_WRITE == 2, " found %lld\n",
(long long)FMODE_WRITE);
- LASSERTF(FMODE_EXEC == 4, " found %lld\n",
- (long long)FMODE_EXEC);
+ LASSERTF(MDS_FMODE_EXEC == 4, " found %lld\n",
+ (long long)MDS_FMODE_EXEC);
CLASSERT(MDS_OPEN_CREAT == 00000100);
CLASSERT(MDS_OPEN_EXCL == 00000200);
CLASSERT(MDS_OPEN_TRUNC == 00001000);
if (req) {
DEBUG_REQ(D_INFO, req, "pinging %s->%s",
imp->imp_obd->obd_uuid.uuid,
- imp->imp_target_uuid.uuid);
+ obd2cli_tgt(imp->imp_obd));
req->rq_no_resend = req->rq_no_delay = 1;
req->rq_replen = lustre_msg_size(0, NULL);
ptlrpcd_add_req(req);
} else {
CERROR("OOM trying to ping %s->%s\n",
imp->imp_obd->obd_uuid.uuid,
- imp->imp_target_uuid.uuid);
+ obd2cli_tgt(imp->imp_obd));
rc = -ENOMEM;
}
struct ptlrpc_thread *thread = data->thread;
ENTRY;
- lock_kernel();
- ptlrpc_daemonize();
-
- cfs_block_allsigs();
-
- LASSERTF(strlen(data->name) < CFS_CURPROC_COMM_MAX,
- "name %d > len %d\n",
- (int)strlen(data->name), CFS_CURPROC_COMM_MAX);
- THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "%s", data->name);
- unlock_kernel();
+ cfs_daemonize(data->name);
/* Record that the thread is running */
thread->t_flags = SVC_RUNNING;
CDEBUG(D_HA, "not pinging %s "
"(in recovery: %s or recovery "
"disabled: %u/%u)\n",
- imp->imp_target_uuid.uuid,
+ obd2cli_tgt(imp->imp_obd),
ptlrpc_import_state_name(level),
imp->imp_deactive,
imp->imp_obd->obd_no_recov);
CDEBUG(D_INFO,
"don't need to ping %s ("CFS_TIME_T
" > "CFS_TIME_T")\n",
- imp->imp_target_uuid.uuid,
+ obd2cli_tgt(imp->imp_obd),
imp->imp_next_ping, this_ping);
}
time_to_next_ping = cfs_time_sub(cfs_time_add(this_ping,
cfs_time_seconds(PING_INTERVAL)),
cfs_time_current());
-
+
/* The ping sent by ptlrpc_send_rpc may get sent out
say .01 second after this.
ptlrpc_pinger_eending_on_import will then set the
mutex_down(&pinger_sem);
CDEBUG(D_HA, "adding pingable import %s->%s\n",
- imp->imp_obd->obd_uuid.uuid, imp->imp_target_uuid.uuid);
+ imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
ptlrpc_update_next_ping(imp);
/* XXX sort, blah blah */
list_add_tail(&imp->imp_pinger_chain, &pinger_imports);
mutex_down(&pinger_sem);
list_del_init(&imp->imp_pinger_chain);
CDEBUG(D_HA, "removing pingable import %s->%s\n",
- imp->imp_obd->obd_uuid.uuid, imp->imp_target_uuid.uuid);
+ imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
class_import_put(imp);
mutex_up(&pinger_sem);
RETURN(0);
#endif
}
+/* Ping evictor thread */
+#define PET_READY 1
+#define PET_TERMINATE 2
+
+static int pet_refcount = 0;
+static int pet_state;
+static wait_queue_head_t pet_waitq;
+static struct obd_export *pet_exp = NULL;
+static spinlock_t pet_lock = SPIN_LOCK_UNLOCKED;
+
+int ping_evictor_wake(struct obd_export *exp)
+{
+ spin_lock(&pet_lock);
+ if (pet_exp) {
+ /* eventually the new obd will call here again. */
+ spin_unlock(&pet_lock);
+ return 1;
+ }
+
+ /* We have to make sure the obd isn't destroyed between now and when
+ * the ping evictor runs. We'll take a reference here, and drop it
+ * when we finish in the evictor. We don't really care about this
+ * export in particular; we just need one to keep the obd alive. */
+ pet_exp = class_export_get(exp);
+ spin_unlock(&pet_lock);
+
+ wake_up(&pet_waitq);
+ return 0;
+}
+
+static int ping_evictor_main(void *arg)
+{
+ struct obd_device *obd;
+ struct obd_export *exp;
+ struct l_wait_info lwi = { 0 };
+ time_t expire_time;
+ ENTRY;
+
+ ptlrpc_daemonize("ping_evictor");
+
+ CDEBUG(D_HA, "Starting Ping Evictor\n");
+ pet_exp = NULL;
+ pet_state = PET_READY;
+ while (1) {
+ l_wait_event(pet_waitq, pet_exp ||
+ (pet_state == PET_TERMINATE), &lwi);
+ if (pet_state == PET_TERMINATE)
+ break;
+
+ /* we only get here if pet_exp != NULL, and the end of this
+ * loop is the only place which sets it NULL again, so lock
+ * is not strictly necessary. */
+ spin_lock(&pet_lock);
+ obd = pet_exp->exp_obd;
+ spin_unlock(&pet_lock);
+
+ expire_time = CURRENT_SECONDS - (3 * obd_timeout / 2);
+
+ CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n",
+ obd->obd_name, expire_time);
+
+ /* Exports can't be deleted out of the list while we hold
+ * the obd lock (class_unlink_export), which means we can't
+ * lose the last ref on the export. If they've already been
+ * removed from the list, we won't find them here. */
+ spin_lock(&obd->obd_dev_lock);
+ while (!list_empty(&obd->obd_exports_timed)) {
+ exp = list_entry(obd->obd_exports_timed.next,
+ struct obd_export,exp_obd_chain_timed);
+
+ if (expire_time > exp->exp_last_request_time) {
+ class_export_get(exp);
+ spin_unlock(&obd->obd_dev_lock);
+ LCONSOLE_WARN("%s: haven't heard from %s in %ld"
+ " seconds. Last request was at %ld. "
+ "I think it's dead, and I am evicting "
+ "it.\n", obd->obd_name,
+ obd_export_nid2str(exp),
+ (long)(CURRENT_SECONDS -
+ exp->exp_last_request_time),
+ exp->exp_last_request_time);
+
+
+ class_fail_export(exp);
+ class_export_put(exp);
+
+ spin_lock(&obd->obd_dev_lock);
+ } else {
+ /* List is sorted, so everyone below is ok */
+ break;
+ }
+ }
+ spin_unlock(&obd->obd_dev_lock);
+
+ class_export_put(pet_exp);
+
+ spin_lock(&pet_lock);
+ pet_exp = NULL;
+ spin_unlock(&pet_lock);
+ }
+ CDEBUG(D_HA, "Exiting Ping Evictor\n");
+
+ RETURN(0);
+}
+
+void ping_evictor_start(void)
+{
+ int rc;
+
+ if (++pet_refcount > 1)
+ return;
+
+ init_waitqueue_head(&pet_waitq);
+
+ rc = kernel_thread(ping_evictor_main, NULL, CLONE_VM | CLONE_FS);
+ if (rc < 0) {
+ pet_refcount--;
+ CERROR("Cannot start ping evictor thread: %d\n", rc);
+ }
+}
+EXPORT_SYMBOL(ping_evictor_start);
+
+void ping_evictor_stop(void)
+{
+ if (--pet_refcount > 0)
+ return;
+
+ pet_state = PET_TERMINATE;
+ wake_up(&pet_waitq);
+}
+EXPORT_SYMBOL(ping_evictor_stop);
#else /* !__KERNEL__ */
/* XXX
if (level != LUSTRE_IMP_FULL) {
CDEBUG(D_HA,
"not pinging %s (in recovery)\n",
- imp->imp_target_uuid.uuid);
+ obd2cli_tgt(imp->imp_obd));
continue;
}
ptlrpc_set_add_req(set, req);
} else {
CDEBUG(D_HA, "don't need to ping %s ("CFS_TIME_T" > "
- CFS_TIME_T")\n", imp->imp_target_uuid.uuid,
+ CFS_TIME_T")\n", obd2cli_tgt(imp->imp_obd),
imp->imp_next_ping, pd->pd_this_ping);
}
}
rq_set_chain);
DEBUG_REQ(D_HA, req, "pinging %s->%s",
req->rq_import->imp_obd->obd_uuid.uuid,
- req->rq_import->imp_target_uuid.uuid);
+ obd2cli_tgt(req->rq_import->imp_obd));
(void)ptl_send_rpc(req, 0);
}
RETURN(-EALREADY);
CDEBUG(D_HA, "adding pingable import %s->%s\n",
- imp->imp_obd->obd_uuid.uuid, imp->imp_target_uuid.uuid);
+ imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
ptlrpc_pinger_sending_on_import(imp);
mutex_down(&pinger_sem);
mutex_down(&pinger_sem);
list_del_init(&imp->imp_pinger_chain);
CDEBUG(D_HA, "removing pingable import %s->%s\n",
- imp->imp_obd->obd_uuid.uuid, imp->imp_target_uuid.uuid);
+ imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
class_import_put(imp);
mutex_up(&pinger_sem);
RETURN(0);
void ptlrpc_request_handle_notconn(struct ptlrpc_request *);
void lustre_assert_wire_constants(void);
int ptlrpc_import_in_recovery(struct obd_import *imp);
-int ptlrpc_set_import_discon(struct obd_import *imp);
+int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt);
void ptlrpc_handle_failed_import(struct obd_import *imp);
int ptlrpc_replay_next(struct obd_import *imp, int *inflight);
void ptlrpc_initiate_recovery(struct obd_import *imp);
#define ptlrpc_lprocfs_unregister_service(params...) do{}while(0)
#define ptlrpc_lprocfs_rpc_sent(params...) do{}while(0)
#define ptlrpc_lprocfs_do_request_stat(params...) do{}while(0)
-#endif /* __KERNEL__ */
+#endif /* LPROCFS */
/* recovd_thread.c */
int llog_init_commit_master(void);
void ptlrpc_pinger_sending_on_import(struct obd_import *imp);
void ptlrpc_pinger_wake_up(void);
void ptlrpc_ping_import_soon(struct obd_import *imp);
+#ifdef __KERNEL__
+int ping_evictor_wake(struct obd_export *exp);
+#else
+#define ping_evictor_wake(exp) 1
+#endif
#endif /* PTLRPC_INTERNAL_H */
cleanup_phase = 2;
ptlrpc_put_connection_superhack = ptlrpc_put_connection;
- ptlrpc_abort_inflight_superhack = ptlrpc_abort_inflight;
rc = ptlrpc_start_pinger();
if (rc)
ENTRY;
cfs_daemonize(pc->pc_name);
- cfs_block_allsigs();
complete(&pc->pc_starting);
if (lcd == NULL)
RETURN(-ENOMEM);
- lock_kernel();
- ptlrpc_daemonize(); /* thread never needs to do IO */
-
- cfs_block_allsigs();
-
spin_lock(&lcm->lcm_thread_lock);
THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1,
"ll_log_comt_%02d", atomic_read(&lcm->lcm_thread_total));
atomic_inc(&lcm->lcm_thread_total);
spin_unlock(&lcm->lcm_thread_lock);
- unlock_kernel();
+
+ ptlrpc_daemonize(cfs_curproc_comm()); /* thread never needs to do IO */
CFS_INIT_LIST_HEAD(&lcd->lcd_lcm_list);
CFS_INIT_LIST_HEAD(&lcd->lcd_llcd_list);
}
mutex_up(&llcd->llcd_ctxt->loc_sem);
- if (!import || (import == LP_POISON)) {
+ if (!import || (import == LP_POISON) ||
+ (import->imp_client == LP_POISON)) {
CERROR("No import %p (llcd=%p, ctxt=%p)\n",
import, llcd, llcd->llcd_ctxt);
llcd_put(llcd);
ENTRY;
mutex_up(&data->llpa_sem);
- lock_kernel();
- ptlrpc_daemonize(); /* thread does IO to log files */
- THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "llog_process");
-
- cfs_block_allsigs();
- unlock_kernel();
+ ptlrpc_daemonize("llog_process"); /* thread does IO to log files */
rc = llog_create(ctxt, &llh, &logid, NULL);
if (rc) {
argv[0] = obd_lustre_upcall;
argv[1] = "FAILED_IMPORT";
- argv[2] = imp->imp_target_uuid.uuid;
+ argv[2] = obd2cli_tgt(imp->imp_obd);
argv[3] = imp->imp_obd->obd_name;
argv[4] = imp->imp_connection->c_remote_uuid.uuid;
argv[5] = imp->imp_obd->obd_uuid.uuid;
if (strcmp(obd_lustre_upcall, "DEFAULT") == 0) {
CDEBUG(D_HA, "%s: starting recovery without upcall\n",
- imp->imp_target_uuid.uuid);
+ obd2cli_tgt(imp->imp_obd));
ptlrpc_connect_import(imp, NULL);
} else if (strcmp(obd_lustre_upcall, "NONE") == 0) {
CDEBUG(D_HA, "%s: recovery disabled\n",
- imp->imp_target_uuid.uuid);
+ obd2cli_tgt(imp->imp_obd));
} else {
CDEBUG(D_HA, "%s: calling upcall to start recovery\n",
- imp->imp_target_uuid.uuid);
+ obd2cli_tgt(imp->imp_obd));
ptlrpc_run_failed_import_upcall(imp);
}
* get rid of them now.
*/
spin_lock_irqsave(&imp->imp_lock, flags);
+ imp->imp_last_transno_checked = 0;
ptlrpc_free_committed(imp);
last_transno = imp->imp_last_replay_transno;
spin_unlock_irqrestore(&imp->imp_lock, flags);
CDEBUG(D_HA, "import %p from %s committed "LPU64" last "LPU64"\n",
- imp, imp->imp_target_uuid.uuid, imp->imp_peer_committed_transno,
- last_transno);
+ imp, obd2cli_tgt(imp->imp_obd),
+ imp->imp_peer_committed_transno, last_transno);
/* Do I need to hold a lock across this iteration? We shouldn't be
* racing with any additions to the list, because we're in recovery
ENTRY;
CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n",
- imp->imp_obd->obd_name,
- imp->imp_target_uuid.uuid,
+ imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
imp->imp_connection->c_remote_uuid.uuid);
- if (ptlrpc_set_import_discon(imp)) {
+ if (ptlrpc_set_import_discon(imp, failed_req->rq_reqmsg->conn_cnt)) {
if (!imp->imp_replayable) {
CDEBUG(D_HA, "import %s@%s for %s not replayable, "
"auto-deactivating\n",
- imp->imp_target_uuid.uuid,
+ obd2cli_tgt(imp->imp_obd),
imp->imp_connection->c_remote_uuid.uuid,
imp->imp_obd->obd_name);
ptlrpc_deactivate_import(imp);
* requests. */
if (!active) {
CWARN("setting import %s INACTIVE by administrator request\n",
- imp->imp_target_uuid.uuid);
+ obd2cli_tgt(imp->imp_obd));
ptlrpc_invalidate_import(imp);
imp->imp_deactive = 1;
}
if (active) {
imp->imp_deactive = 0;
CDEBUG(D_HA, "setting import %s VALID\n",
- imp->imp_target_uuid.uuid);
+ obd2cli_tgt(imp->imp_obd));
rc = ptlrpc_recover_import(imp, NULL);
}
ENTRY;
/* force import to be disconnected. */
- ptlrpc_set_import_discon(imp);
+ ptlrpc_set_import_discon(imp, 0);
imp->imp_deactive = 0;
rc = ptlrpc_recover_import_no_retry(imp, new_uuid);
RETURN(rc);
CDEBUG(D_HA, "%s: recovery started, waiting\n",
- imp->imp_target_uuid.uuid);
+ obd2cli_tgt(imp->imp_obd));
lwi = LWI_TIMEOUT(cfs_timeout_cap(cfs_time_seconds(obd_timeout)),
NULL, NULL);
rc = l_wait_event(imp->imp_recovery_waitq,
!ptlrpc_import_in_recovery(imp), &lwi);
CDEBUG(D_HA, "%s: recovery finished\n",
- imp->imp_target_uuid.uuid);
+ obd2cli_tgt(imp->imp_obd));
RETURN(rc);
}
spin_unlock_irqrestore (&obd->obd_uncommitted_replies_lock, flags);
}
-static long
-timeval_sub(struct timeval *large, struct timeval *small)
-{
- return (large->tv_sec - small->tv_sec) * 1000000 +
- (large->tv_usec - small->tv_usec);
-}
-
static int
ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc)
{
}
+/* This function makes sure dead exports are evicted in a timely manner.
+ This function is only called when some export receives a message (i.e.,
+ the network is up.) */
+static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay)
+{
+ struct obd_export *oldest_exp;
+ time_t oldest_time;
+
+ ENTRY;
+
+ LASSERT(exp);
+
+ /* Compensate for slow machines, etc, by faking our request time
+ into the future. Although this can break the strict time-ordering
+ of the list, we can be really lazy here - we don't have to evict
+ at the exact right moment. Eventually, all silent exports
+ will make it to the top of the list. */
+ exp->exp_last_request_time = max(exp->exp_last_request_time,
+ (time_t)CURRENT_SECONDS + extra_delay);
+
+ CDEBUG(D_INFO, "updating export %s at %ld\n",
+ exp->exp_client_uuid.uuid,
+ exp->exp_last_request_time);
+
+ /* exports may get disconnected from the chain even though the
+ export has references, so we must keep the spin lock while
+ manipulating the lists */
+ spin_lock(&exp->exp_obd->obd_dev_lock);
+
+ if (list_empty(&exp->exp_obd_chain_timed)) {
+ /* this one is not timed */
+ spin_unlock(&exp->exp_obd->obd_dev_lock);
+ EXIT;
+ return;
+ }
+
+ list_move_tail(&exp->exp_obd_chain_timed,
+ &exp->exp_obd->obd_exports_timed);
+
+ oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next,
+ struct obd_export, exp_obd_chain_timed);
+ oldest_time = oldest_exp->exp_last_request_time;
+ spin_unlock(&exp->exp_obd->obd_dev_lock);
+
+ if (exp->exp_obd->obd_recovering) {
+ /* be nice to everyone during recovery */
+ EXIT;
+ return;
+ }
+
+ /* Note - racing to start/reset the obd_eviction timer is safe */
+ if (exp->exp_obd->obd_eviction_timer == 0) {
+ /* Check if the oldest entry is expired. */
+ if (CURRENT_SECONDS > (oldest_time +
+ (3 * obd_timeout / 2) + extra_delay)) {
+ /* We need a second timer, in case the net was down and
+ * it just came back. Since the pinger may skip every
+ * other PING_INTERVAL (see note in ptlrpc_pinger_main),
+ * we better wait for 3. */
+ exp->exp_obd->obd_eviction_timer = CURRENT_SECONDS +
+ 3 * PING_INTERVAL;
+ CDEBUG(D_HA, "%s: Think about evicting %s from %ld\n",
+ exp->exp_obd->obd_name, obd_export_nid2str(exp),
+ oldest_time);
+ }
+ } else {
+ if (CURRENT_SECONDS > (exp->exp_obd->obd_eviction_timer +
+ extra_delay)) {
+ /* The evictor won't evict anyone who we've heard from
+ * recently, so we don't have to check before we start
+ * it. */
+ if (!ping_evictor_wake(exp))
+ exp->exp_obd->obd_eviction_timer = 0;
+ }
+ }
+
+ EXIT;
+}
+
static int
ptlrpc_server_handle_request(struct ptlrpc_service *svc,
struct ptlrpc_thread *thread)
spin_unlock_irqrestore (&svc->srv_lock, flags);
do_gettimeofday(&work_start);
- timediff = timeval_sub(&work_start, &request->rq_arrival_time);
+ timediff = cfs_timeval_sub(&work_start, &request->rq_arrival_time,NULL);
if (svc->srv_stats != NULL) {
lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR,
timediff);
goto put_conn;
}
- class_update_export_timer(request->rq_export,
- (time_t)(timediff / 500000));
+ ptlrpc_update_export_timer(request->rq_export, timediff/500000);
}
/* Discard requests queued for longer than my timeout. If the
out:
do_gettimeofday(&work_end);
- timediff = timeval_sub(&work_end, &work_start);
+ timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
if (timediff / 1000000 > (long)obd_timeout)
CERROR("request "LPU64" opc %u from %s processed in %lds "
"trans "LPU64" rc %d/%d\n",
request->rq_xid, request->rq_reqmsg->opc,
libcfs_id2str(request->rq_peer),
- timeval_sub(&work_end,
- &request->rq_arrival_time) / 1000000,
+ cfs_timeval_sub(&work_end, &request->rq_arrival_time,
+ NULL) / 1000000,
request->rq_repmsg ? request->rq_repmsg->transno :
request->rq_transno, request->rq_status,
request->rq_repmsg ? request->rq_repmsg->status : -999);
"%ldus (%ldus total) trans "LPU64" rc %d/%d\n",
request->rq_xid, request->rq_reqmsg->opc,
libcfs_id2str(request->rq_peer), timediff,
- timeval_sub(&work_end, &request->rq_arrival_time),
+ cfs_timeval_sub(&work_end, &request->rq_arrival_time,
+ NULL),
request->rq_transno, request->rq_status,
request->rq_repmsg ? request->rq_repmsg->status : -999);
#else /* __KERNEL__ */
/* Don't use daemonize, it removes fs struct from new thread (bug 418) */
-void ptlrpc_daemonize(void)
+void ptlrpc_daemonize(char *name)
{
- exit_mm(cfs_current());
- lustre_daemonize_helper();
-#if LINUX_
- /* XXX Liang: */
+ struct fs_struct *fs = current->fs;
+
+ atomic_inc(&fs->count);
+ cfs_daemonize(name);
+ exit_fs(cfs_current());
+ current->fs = fs;
set_fs_pwd(current->fs, init_task.fs->pwdmnt, init_task.fs->pwd);
-#endif
- exit_files(cfs_current());
- reparent_to_init();
}
static void
int rc = 0;
ENTRY;
- lock_kernel();
- ptlrpc_daemonize();
-
- cfs_block_allsigs();
-
- LASSERTF(strlen(data->name) < CFS_CURPROC_COMM_MAX,
- "name %d > len %d\n",
- (int)strlen(data->name), CFS_CURPROC_COMM_MAX);
- THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "%s", data->name);
- unlock_kernel();
+ ptlrpc_daemonize(data->name);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) && CONFIG_NUMA
/* we need to do this before any per-thread allocation is done so that
struct ptlrpc_request, rq_list);
do_gettimeofday(&right_now);
- timediff = timeval_sub(&right_now, &request->rq_arrival_time);
+ timediff = cfs_timeval_sub(&right_now, &request->rq_arrival_time, NULL);
cutoff = obd_health_check_timeout;
static int target_quotacheck_thread(void *data)
{
- unsigned long flags;
struct quotacheck_thread_args *qta = data;
struct obd_export *exp;
struct obd_device *obd;
struct lvfs_run_ctxt saved;
int rc;
- lock_kernel();
- ptlrpc_daemonize();
-
- SIGNAL_MASK_LOCK(current, flags);
- sigfillset(¤t->blocked);
- RECALC_SIGPENDING;
- SIGNAL_MASK_UNLOCK(current, flags);
-
- THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "%s",
- "quotacheck");
- unlock_kernel();
+ ptlrpc_daemonize("quotacheck");
exp = qta->qta_exp;
obd = exp->exp_obd;
if (rc == CL_NOT_QUOTACHECKED)
rc = -EINTR;
- qchk->obd_uuid = cli->cl_import->imp_target_uuid;
+ qchk->obd_uuid = cli->cl_target_uuid;
if (strncmp(exp->exp_obd->obd_type->typ_name, LUSTRE_OSC_NAME,
strlen(LUSTRE_OSC_NAME)))
memcpy(qchk->obd_type, LUSTRE_FILTER_NAME,
struct qslave_recov_thread_data *data = arg;
struct obd_device *obd = data->obd;
struct lustre_quota_ctxt *qctxt = data->qctxt;
- unsigned long flags;
unsigned int type;
int rc = 0;
ENTRY;
- lock_kernel();
- ptlrpc_daemonize();
-
- SIGNAL_MASK_LOCK(current, flags);
- sigfillset(¤t->blocked);
- RECALC_SIGPENDING;
- SIGNAL_MASK_UNLOCK(current, flags);
- THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "%s", "qslave_recovd");
- unlock_kernel();
+ ptlrpc_daemonize("qslave_recovd");
complete(&data->comp);
int filter_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
{
struct obd_device *obd = exp->exp_obd;
+ struct obd_device_target *obt = &obd->u.obt;
struct lvfs_run_ctxt saved;
int rc = 0;
ENTRY;
switch (oqctl->qc_cmd) {
case Q_QUOTAON:
case Q_QUOTAOFF:
+ if (!atomic_dec_and_test(&obt->obt_quotachecking)) {
+ CDEBUG(D_INFO, "other people are doing quotacheck\n");
+ atomic_inc(&obt->obt_quotachecking);
+ rc = -EBUSY;
+ break;
+ }
case Q_GETOINFO:
case Q_GETOQUOTA:
case Q_GETQUOTA:
push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl);
pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+ if (oqctl->qc_cmd == Q_QUOTAON || oqctl->qc_cmd == Q_QUOTAOFF)
+ atomic_inc(&obt->obt_quotachecking);
break;
case Q_INITQUOTA:
{
rc = cfs_mem_cache_destroy(qinfo_cachep);
LASSERTF(rc == 0, "couldn't destory qinfo_cachep slab\n");
+ qinfo_cachep = NULL;
+
RETURN(0);
}
/* lookup quota file */
rc = 0;
- down(&iparent->i_sem);
+ LOCK_INODE_MUTEX(iparent);
de = lookup_one_len(quotafiles[i], dparent,
strlen(quotafiles[i]));
- up(&iparent->i_sem);
+ UNLOCK_INODE_MUTEX(iparent);
if (IS_ERR(de) || de->d_inode == NULL ||
!S_ISREG(de->d_inode->i_mode))
rc = IS_ERR(de) ? PTR_ERR(de) : -ENOENT;
int mds_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl)
{
struct mds_obd *mds = &obd->u.mds;
+ struct obd_device_target *obt = &obd->u.obt;
struct lvfs_run_ctxt saved;
int rc;
ENTRY;
+ if (!atomic_dec_and_test(&obt->obt_quotachecking)) {
+ CDEBUG(D_INFO, "other people are doing quotacheck\n");
+ atomic_inc(&obt->obt_quotachecking);
+ RETURN(-EBUSY);
+ }
+
down(&mds->mds_qonoff_sem);
push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
rc = mds_admin_quota_on(obd, oqctl);
out:
pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
up(&mds->mds_qonoff_sem);
+ atomic_inc(&obt->obt_quotachecking);
RETURN(rc);
}
int mds_quota_off(struct obd_device *obd, struct obd_quotactl *oqctl)
{
struct mds_obd *mds = &obd->u.mds;
+ struct obd_device_target *obt = &obd->u.obt;
struct lvfs_run_ctxt saved;
int rc, rc2;
ENTRY;
+ if (!atomic_dec_and_test(&obt->obt_quotachecking)) {
+ CDEBUG(D_INFO, "other people are doing quotacheck\n");
+ atomic_inc(&obt->obt_quotachecking);
+ RETURN(-EBUSY);
+ }
+
down(&mds->mds_qonoff_sem);
/* close admin quota files */
push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
up(&mds->mds_qonoff_sem);
+ atomic_inc(&obt->obt_quotachecking);
+
RETURN(rc ?: rc2);
}
{
struct qmaster_recov_thread_data *data = arg;
struct obd_device *obd = data->obd;
- unsigned long flags;
int rc = 0;
unsigned short type;
ENTRY;
- lock_kernel();
- ptlrpc_daemonize();
-
- SIGNAL_MASK_LOCK(current, flags);
- sigfillset(¤t->blocked);
- RECALC_SIGPENDING;
- SIGNAL_MASK_UNLOCK(current, flags);
- THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "%s",
- "qmaster_recovd");
- unlock_kernel();
+ ptlrpc_daemonize("qmaster_recovd");
complete(&data->comp);
# Source function library.
if [ -f /etc/init.d/functions ] ; then
- . /etc/init.d/functions
+ . /etc/init.d/functions
fi
# Source networking configuration.
if [ -f /etc/sysconfig/network ] ; then
- . /etc/sysconfig/network
+ . /etc/sysconfig/network
fi
check_start_stop() {
- # Check that networking is up.
- [ "${NETWORKING}" = "no" ] && exit 0
+ # Exit codes now LSB compliant
+ # Check that networking is up. - exit 'not running'
+ [ "${NETWORKING}" = "no" ] && exit 7
- [ -x ${LCONF} -a -x ${LCTL} ] || exit 0
+ # exit 'not installed'
+ [ -x ${LCONF} -a -x ${LCTL} ] || exit 5
if [ ${LUSTRE_CONFIG_XML:0:1} = "/" ] ; then
- if [ ! -f ${LUSTRE_CONFIG_XML} ] ; then
- echo "${0##*/}: Configuration file ${LUSTRE_CONFIG_XML} not found; skipping."
- exit 0
- fi
+ if [ ! -f ${LUSTRE_CONFIG_XML} ] ; then
+ echo "${0##*/}: Configuration file ${LUSTRE_CONFIG_XML} not found; skipping."
+ # exit 'not configured'
+ exit 6
+ fi
fi
# Create /var/lustre directory
start() {
if [ -x "/usr/sbin/clustat" -a "${SERVICE}" = "lustre" ] ; then
- if [ ! -f "/etc/lustre/start-despite-clumanager" ] ; then
+ if [ ! -f "/etc/lustre/start-despite-clumanager" ] ; then
cat >&2 <<EOF
This script was run directly, which can be dangerous if you are using
clumanager to manage Lustre services.
touch /etc/lustre/start-despite-clumanager
EOF
- RETVAL=1
+ RETVAL=6 # program not configured
return
fi
fi
echo -n "Starting $SERVICE: "
if [ $UID -ne 0 ]; then
echo "Lustre should be started as root"
- RETVAL=1
+ RETVAL=4 # insufficient privileges
return
fi
${LCONF} ${LCONF_START_ARGS}
echo -n "Shutting down $SERVICE: "
if [ $UID -ne 0 ]; then
echo "Lustre should be stopped as root"
- RETVAL=1
+ RETVAL=4 # insufficient privileges
return
fi
${LCONF} ${LCONF_STOP_ARGS}
status() {
STATE="stopped"
- RETVAL=1
+ # LSB compliance - return 3 if service is not running
+ # Lustre-specific returns
+ # 150 - partial startup
+ # 151 - health_check unhealthy
+ # 152 - LBUG
+ RETVAL=3
egrep -q "libcfs|lvfs|portals" /proc/modules && STATE="loaded"
# check for any routes - on a portals router this is the only thing
[ "`cat /proc/sys/lnet/routes 2> /dev/null`" ] && STATE="running" && RETVAL=0
# check for any configured devices (may indicate partial startup)
- [ "`cat /proc/fs/lustre/devices 2> /dev/null`" ] && STATE="partial" && RETVAL=1
+ [ "`cat /proc/fs/lustre/devices 2> /dev/null`" ] && STATE="partial" && RETVAL=150
# check for either a server or a client filesystem
MDS="`ls /proc/fs/lustre/mds/*/recovery_status 2> /dev/null`"
# check for error in health_check
HEALTH="/proc/fs/lustre/health_check"
- [ -f "$HEALTH" ] && grep -q "NOT HEALTHY" $HEALTH && STATE="unhealthy" && RETVAL=2
+ [ -f "$HEALTH" ] && grep -q "NOT HEALTHY" $HEALTH && STATE="unhealthy" && RETVAL=151
# check for LBUG
- [ -f "$HEALTH" ] && grep -q "LBUG" $HEALTH && STATE="LBUG" && RETVAL=3
+ [ -f "$HEALTH" ] && grep -q "LBUG" $HEALTH && STATE="LBUG" && RETVAL=152
- # Check if the service really exists
- DUMMY=`lctl dl | grep $SERVICE`
- [ $? -ne 0 ] && STATE="not_found" && RETVAL=5
+ # If Lustre is up , check if the service really exists
+ # Skip this if we are not checking a specific service
+ if [ $RETVAL -eq 0 ] && [ $SERVICE != 'lustre' ]; then
+ DUMMY=`lctl dl | grep $SERVICE`
+ [ $? -ne 0 ] && STATE="not_found" && RETVAL=3
+ fi
echo $STATE
}
SUBSYSTEM=${SUBSYSTEM:- 0xffb7e3ff}
PDSH=${PDSH:-no_dsh}
-MDSDEV=${MDSDEV:-$ROOT/tmp/mds1-`hostname`}
+TMP=${TMP:-/tmp}
+MDSDEV=${MDSDEV:-$TMP/mds1-`hostname`}
MDSSIZE=${MDSSIZE:-100000}
-OSTDEV=${OSTDEV:-$ROOT/tmp/ost1-`hostname`}
+OSTDEV=${OSTDEV:-$TMP/ost1-`hostname`}
OSTSIZE=${OSTSIZE:-200000}
FSTYPE=${FSTYPE:-ext3}
TIMEOUT=${TIMEOUT:-20}
}
cleanup() {
- umount_client $MOUNT || return 200
+ umount_client $MOUNT $FORCE || return 200
stop_mds $FORCE || return 201
stop_ost $FORCE || return 202
# catch case where these return just fine, but modules are still not unloaded
# if all the modules have unloaded.
umount $MOUNT &
UMOUNT_PID=$!
- sleep 2
+ sleep 6
echo "killing umount"
kill -TERM $UMOUNT_PID
echo "waiting for umount to finish"
wait $UMOUNT_PID
+ if grep " $MOUNT " /etc/mtab; then
+ echo "test 5: mtab after failed umount"
+ umount $MOUNT &
+ UMOUNT_PID=$!
+ sleep 2
+ echo "killing umount"
+ kill -TERM $UMOUNT_PID
+ echo "waiting for umount to finish"
+ wait $UMOUNT_PID
+ grep " $MOUNT " /etc/mtab && echo "test 5: mtab after second umount" && return 11
+ fi
# cleanup client modules
$LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
start_ost
[ -d $MOUNT ] || mkdir -p $MOUNT
+ grep " $MOUNT " /etc/mtab && echo "test 5b: mtab before lconf" && return 9
$LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
- llmount -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST://mds_svc/client_facet $MOUNT && exit 1
+ grep " $MOUNT " /etc/mtab && echo "test 5b: mtab before mount" && return 10
+ llmount -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST:/mds_svc/client_facet $MOUNT && return 1
+ grep " $MOUNT " /etc/mtab && echo "test 5b: mtab after failed mount" && return 11
# cleanup client modules
$LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
start_mds
[ -d $MOUNT ] || mkdir -p $MOUNT
+ grep " $MOUNT " /etc/mtab && echo "test 5c: mtab before lconf" && return 9
$LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
- llmount -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST://wrong_mds_svc/client_facet $MOUNT && return 1
+ grep " $MOUNT " /etc/mtab && echo "test 5c: mtab before mount" && return 10
+ llmount -vv -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST:/wrong_mds_svc/client_facet $MOUNT && return 1
+ grep " $MOUNT " /etc/mtab && echo "test 5c: mtab after failed mount" && return 11
# cleanup client modules
$LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
stop_ost --force
[ -d $MOUNT ] || mkdir -p $MOUNT
+ grep " $MOUNT " /etc/mtab && echo "test 5d: mtab before lconf" && return 9
$LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
- llmount -o nettype=$NETTYPE,$MOUNTOPT `facet_nid mds`://mds_svc/client_facet $MOUNT || return 1
+ grep " $MOUNT " /etc/mtab && echo "test 5d: mtab before mount" && return 10
+ llmount -vv -o nettype=$NETTYPE,$MOUNTOPT `facet_nid mds`:/mds_svc/client_facet $MOUNT || return 1
umount_client $MOUNT || return 2
+ grep " $MOUNT " /etc/mtab && echo "test 5d: mtab after unmount" && return 11
stop_mds || return 3
}
run_test 5d "ost down, don't crash during mount attempt"
+test_5e() {
+ start_ost
+ start_mds
+ sleep 5 # give MDS a chance to connect to OSTs before delaying requests
+
+#define OBD_FAIL_PTLRPC_DELAY_SEND 0x506
+ do_facet client "sysctl -w lustre.fail_loc=0x80000506"
+ grep " $MOUNT " /etc/mtab && echo "test 5e: mtab before mount" && return 10
+ mount_client $MOUNT || echo "mount failed (not fatal)"
+ umount_client $MOUNT || return 2
+ grep " $MOUNT " /etc/mtab && echo "test 5e: mtab after unmount" && return 11
+
+ stop_mds || return 3
+ stop_ost || return 3
+
+ lsmod | grep -q lnet && return 4
+ return 0
+}
+run_test 5e "delayed connect, don't crash (bug 10268)"
+
test_6() {
setup
manual_umount_client
# check the result of lmc --ptldebug/subsystem
start_ost
start_mds
- mount_client $MOUNT
- CHECK_PTLDEBUG="`do_facet mds sysctl lnet.debug | sed -e 's/.* = //'`"
+ CHECK_PTLDEBUG="`do_facet mds sysctl lnet.debug|cut -d= -f2`"
if [ "$CHECK_PTLDEBUG" ] && [ $CHECK_PTLDEBUG -eq 1 ]; then
echo "lmc --debug success"
else
echo "lmc --subsystem: want 2, have $CHECK_SUBSYS"
return 1
fi
- check_mount || return 41
cleanup || return $?
# the new PTLDEBUG/SUBSYSTEM used for lconf --ptldebug/subsystem
echo "lconf --subsystem: want 20, have $CHECK_SUBSYS"
return 1
fi
- mount_client $MOUNT
- check_mount || return 41
cleanup || return $?
# resume the old configuration
}
test_15() {
- start_ost
- start_mds
echo "mount lustre on ${MOUNT} with $MOUNTLUSTRE....."
if [ -f "$MOUNTLUSTRE" ]; then
echo "save $MOUNTLUSTRE to $MOUNTLUSTRE.sav"
- mv $MOUNTLUSTRE $MOUNTLUSTRE.sav
+ mv $MOUNTLUSTRE $MOUNTLUSTRE.sav && trap cleanup_15 EXIT INT
+ if [ -f $MOUNTLUSTRE ]; then
+ echo "$MOUNTLUSTRE cannot be moved, skipping test"
+ return 0
+ fi
fi
- [ -f "$MOUNTLUSTRE" ] && echo "can't move $MOUNTLUSTRE" && return 40
- trap cleanup_15 EXIT INT
[ ! `cp $(which llmount) $MOUNTLUSTRE` ] || return $?
+ start_ost
+ start_mds
do_facet client "mkdir -p $MOUNT 2> /dev/null"
# load llite module on the client if it isn't in /lib/modules
do_facet client "$LCONF --nosetup --node client_facet $XMLCONFIG"
run_test 15 "zconf-mount without /sbin/mount.lustre (should return error)"
test_16() {
- TMPMTPT="/mnt/conf16"
+ TMPMTPT="${MOUNT%/*}/conf16"
if [ ! -f "$MDSDEV" ]; then
echo "no $MDSDEV existing, so mount Lustre to create one"
run_test 16 "verify that lustre will correct the mode of OBJECTS/LOGS/PENDING"
test_17() {
- TMPMTPT="/mnt/conf17"
+ TMPMTPT="${MOUNT%/*}/conf17"
if [ ! -f "$MDSDEV" ]; then
echo "no $MDSDEV existing, so mount Lustre to create one"
return 1;
}
- if (argc == 6)
- st.st_blksize = strtoul(argv[4], 0, 0);
+ if (argc >= 6)
+ st.st_blksize = strtoul(argv[5], 0, 0);
else if (fstat64(fd, &st) < 0) {
printf("Cannot stat %s: %s\n", argv[1], strerror(errno));
return 1;
exit(1);
}
+#if 0
+ /* We cannot do this any longer, we do not store open special nodes
+ * on MDS after unlink */
if (st1.st_mode != st2.st_mode) { // can we do this?
fprintf(stderr, "fstat different value on %s and %s\n", dname1, dname2);
exit(1);
}
+#endif
fprintf(stderr, "Ok, everything goes well.\n");
return 0;
}
run_test 24 "fsync error (should return error)"
-test_26() { # bug 5921 - evict dead exports
+test_26() { # bug 5921 - evict dead exports by pinger
# this test can only run from a client on a separate node.
[ "`lsmod | grep obdfilter`" ] && \
echo "skipping test 26 (local OST)" && return
}
run_test 26 "evict dead exports"
+test_26b() { # bug 10140 - evict dead exports by pinger
+ zconf_mount `hostname` $MOUNT2
+ MDS_FILE=/proc/fs/lustre/mds/mds_svc/num_exports
+ MDS_NEXP1="`do_facet mds cat $MDS_FILE | cut -d' ' -f2`"
+ OST_FILE=/proc/fs/lustre/obdfilter/ost_svc/num_exports
+ OST_NEXP1="`do_facet ost cat $OST_FILE | cut -d' ' -f2`"
+ echo starting with $OST_NEXP1 OST and $MDS_NEXP1 MDS exports
+ zconf_umount `hostname` $MOUNT2 -f
+ # evictor takes up to 2.25x to evict. But if there's a
+ # race to start the evictor from various obds, the loser
+ # might have to wait for the next ping.
+ echo Waiting for $(($TIMEOUT * 4)) secs
+ sleep $(($TIMEOUT * 4))
+ OST_NEXP2="`do_facet ost cat $OST_FILE | cut -d' ' -f2`"
+ MDS_NEXP2="`do_facet mds cat $MDS_FILE | cut -d' ' -f2`"
+ echo ending with $OST_NEXP2 OST and $MDS_NEXP2 MDS exports
+ [ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted from OST"
+ [ $MDS_NEXP1 -le $MDS_NEXP2 ] && error "client not evicted from MDS"
+ return 0
+}
+run_test 26b "evict dead exports"
+
test_27() {
[ "`lsmod | grep mds`" ] || \
{ echo "skipping test 27 (non-local MDS)" && return 0; }
# Skip these tests
# bug number: 2766 9930
-ALWAYS_EXCEPT="0b 39 $REPLAY_SINGLE_EXCEPT"
+ALWAYS_EXCEPT="0b $REPLAY_SINGLE_EXCEPT"
gen_config() {
rm -f $XMLCONFIG
[ ! -s $TGT -a -s $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
SRC=/usr/lib/dbench/client_plain.txt
[ ! -s $TGT -a -s $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
-[ ! -s $TGT ] && echo "$TGT doesn't exist" && exit 1
+[ ! -s $TGT ] && echo "$0: $TGT doesn't exist (SRC=$SRC)" && exit 1
cd $DIR
echo "running 'dbench $@' on $PWD at `date`"
dbench -c client.txt $@
+++ /dev/null
-#!/bin/sh
-
-SRCDIR="`dirname $0`"
-
-ENDRUN=endrun-`hostname`
-
-fail() {
- echo "ERROR: $1" 1>&2
- [ $2 ] && RC=$2 || RC=1
- exit $RC
-}
-
-export PATH=/sbin:/usr/sbin:$SRCDIR:$PATH
-
-cleanup() {
- trap 0
- $LCONF --cleanup $OPTS
-}
-
-[ "$COUNT" ] || COUNT=1000
-
-[ "$LCONF" ] || LCONF=$SRCDIR/../utils/lconf
-
-[ -z "$*" ] && fail "usage: $0 [--reformat] <conf>.xml" 1
-
-OSCMT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`"
-if [ -z "$OSCMT" ]; then
- $LCONF $@ || exit 1
- trap cleanup EXIT
- OSCMT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`"
- [ -z "$OSCMT" ] && fail "no lustre filesystem mounted" 1
-fi
-
-V="-10"
-while [ "$1" ]; do
- case $1 in
- -v|--verbose) V="1";;
- --reformat) : ;;
- *) OPTS="$OPTS $1" ;;
- esac
- shift
-done
-
-OSCTMP=`echo $OSCMT | tr "/" "."`
-USED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -n 1`
-USED=`expr $USED + 16` # Some space for the status file
-
-THREADS=1
-while [ $THREADS -lt 196 ]; do
- echo "starting $THREADS threads at `date`"
- [ $V -gt 0 ] || echo 0 > /proc/sys/lnet/debug
- $SRCDIR/createdestroy /mnt/lustre/file-$$ $COUNT $V $THREADS
- $SRCDIR/openclose /mnt/lustre/file-$$ $COUNT $THREADS
- THREADS=`expr $THREADS + 5`
- $LCONF --cleanup $OPTS || fail 10
- $LCONF $OPTS || fail 11
-done
-
-rm -f $ENDRUN
-
-NOWUSED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -n 1`
-if [ $NOWUSED -gt $USED ]; then
- echo "Space not all freed: now ${NOWUSED}kB, was ${USED}kB." 1>&2
- echo "This is normal on BA OSTs, because of subdirectories." 1>&2
-fi
-
-cleanup
shift
done
-MOUNT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`"
-if [ -z "$MOUNT" ]; then
+EXISTING_MOUNT="`mount | awk '/ lustre(_lite)? / { print $3 }' | tail -n 1`"
+if [ -z "$EXISTING_MOUNT" ]; then
sh llmount.sh $OPTS
- MOUNT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`"
- [ -z "$MOUNT" ] && fail "no lustre filesystem mounted" 1
+ EXISTING_MOUNT="`mount | awk '/ lustre(_lite)? / { print $3 }' | tail -n 1`"
+ [ -z "$EXISTING_MOUNT" ] && fail "no lustre filesystem mounted" 1
I_MOUNTED="yes"
fi
+MOUNT=$EXISTING_MOUNT
OSCTMP=`echo $MOUNT | tr "/" "."`
USED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -n 1`
USED=`expr $USED + 16` # Some space for the status file
# let's start slowly here...
-log "touching $MOUNT"
+START=`date +%s`
+log "touching $MOUNT at `date`"
touch $MOUNT || fail "can't touch $MOUNT" 2
HOSTS=$MOUNT/hosts.$$
# ok, that hopefully worked, so let's do a little more, with files that
# haven't changed in the last day (hopefully they don't change during test)
FILES=`find $SRC -type f -mtime +1 -ctime +1 | head -n $COUNT`
-log "copying files from $SRC to $DST$SRC"
+log "copying files from $SRC to $DST$SRC at `date`"
tar cf - $FILES | tar xvf - -C $DST || fail "copying $SRC" 11
-log "comparing newly copied files"
+log "comparing newly copied files at `date`"
for f in $FILES; do
[ $V ] && log "verifying $DST/$f"
diff -q $f $DST/$f || ERROR=11
done
[ "$ERROR" ] && fail "old and new files are different" $ERROR
+log "finished at `date` ($(($(date +%s) - START)))"
sh llmountcleanup.sh || exit 19
sh llrmount.sh $OPTS || exit 20
build_test_filter
echo "preparing for tests involving mounts"
-EXT2_DEV=${EXT2_DEV:-/tmp/SANITY.LOOP}
+EXT2_DEV=${EXT2_DEV:-$TMP/SANITY.LOOP}
touch $EXT2_DEV
mke2fs -j -F $EXT2_DEV 8000 > /dev/null
echo # add a newline after mke2fs.
mkdir $DIR/d22
chown $RUNAS_ID $DIR/d22
# Tar gets pissy if it can't access $PWD *sigh*
- (cd /tmp;
+ (cd $TMP;
$RUNAS tar cf - /etc/hosts /etc/sysconfig/network | \
$RUNAS tar xfC - $DIR/d22)
ls -lR $DIR/d22/etc
exhaust_all_precreations 0x215
sleep 5
- touch $DIR/d27/f27o && error
+ touch $DIR/d27/f27o && error "able to create $DIR/d27/f27o"
reset_enospc
}
done
}
+export CACHE_MAX=`cat /proc/fs/lustre/llite/*/max_cached_mb | head -n 1`
+cleanup_101() {
+ for s in $LPROC/llite/*/max_cached_mb; do
+ echo $CACHE_MAX > $s
+ done
+ trap 0
+}
+
test_101() {
local s
local discard
- local nreads
+ local nreads=10000
+ local cache_limit=32
- for s in $LPROC/osc/OSC_*/rpc_stats ;do
+ for s in $LPROC/osc/OSC_*/rpc_stats; do
echo 0 > $s
done
- for s in $LPROC/llite/*/read_ahead_stats ;do
- echo 0 > $s
+ trap cleanup_101 EXIT
+ for s in $LPROC/llite/fs*; do
+ echo 0 > $s/read_ahead_stats
+ echo $cache_limit > $s/max_cached_mb
done
#
- # randomly read 10000 of 64K chunks from 200M file.
+ # randomly read 10000 of 64K chunks from file 3x 32MB in size
#
- nreads=10000
- $RANDOM_READS -f $DIR/f101 -s200000000 -b65536 -C -n$nreads -t 180
+ echo "nreads: $nreads file size: $((cache_limit * 3))MB"
+ $RANDOM_READS -f $DIR/$tfile -s$((cache_limit * 3072 * 1024)) -b65536 -C -n$nreads -t 180
discard=0
- for s in $LPROC/llite/*/read_ahead_stats ;do
- discard=$(($discard + $(cat $s | get_named_value 'read but discarded')))
+ for s in $LPROC/llite/fs*; do
+ discard=$(($discard + $(cat $s/read_ahead_stats | get_named_value 'read but discarded')))
done
+ cleanup_101
if [ $(($discard * 10)) -gt $nreads ] ;then
cat $LPROC/osc/OSC_*/rpc_stats
cat $LPROC/llite/*/read_ahead_stats
error "too many ($discard) discarded pages"
fi
- rm -f $DIR/f101 || true
+ rm -f $DIR/$tfile || true
}
-run_test 101 "check read-ahead for random reads ==========="
+run_test 101 "check read-ahead for random reads ================"
test_102() {
local testfile=$DIR/xattr_testfile
touch $testfile
[ "$UID" != 0 ] && echo "skipping $TESTNAME (must run as root)" && return
- [ -z "`grep \<xattr\> $LPROC/mdc/MDC*MNT*/connect_flags`" ] && echo "skipping $TESTNAME (must have user_xattr)" && return
+ [ -z "`grep xattr $LPROC/mdc/MDC*MNT*/connect_flags`" ] && echo "skipping $TESTNAME (must have user_xattr)" && return
echo "set/get xattr..."
setfattr -n trusted.name1 -v value1 $testfile || error
[ "`getfattr -n trusted.name1 $testfile 2> /dev/null | \
rm -f $testfile
}
-run_test 102 "user xattr test ====================="
+run_test 102 "user xattr test =================================="
run_acl_subtest()
{
[ "$UID" != 0 ] && echo "skipping $TESTNAME (must run as root)" && return
[ -z "`mount | grep " $DIR .*\<acl\>"`" ] && echo "skipping $TESTNAME (must have acl)" && return
[ -z "`grep acl $LPROC/mdc/MDC*MNT*/connect_flags`" ] && echo "skipping $TESTNAME (must have acl)" && return
- $(which setfacl 2>/dev/null) || echo "skipping $TESTNAME (could not find setfacl)" && return
+ which setfacl 2>/dev/null || { echo "skipping $TESTNAME (could not find setfacl)"; return; }
echo "performing cp ..."
run_acl_subtest cp || error
cd $SAVED_PWD
umask $SAVE_UMASK
}
-run_test 103 "==============acl test ============="
+run_test 103 "acl test ========================================="
test_104() {
touch $DIR/$tfile
lfs df || error "lfs df failed"
lfs df -ih || error "lfs df -ih failed"
- lfs df $DIR || error "lfs df $DIR failed"
- lfs df -ih $DIR || error "lfs df -ih $DIR failed"
+ lfs df -h $DIR || error "lfs df -h $DIR failed"
+ lfs df -i $DIR || error "lfs df -i $DIR failed"
lfs df $DIR/$tfile || error "lfs df $DIR/$tfile failed"
lfs df -ih $DIR/$tfile || error "lfs df -ih $DIR/$tfile failed"
lctl --device %$OSC recover
lfs df || error "lfs df with reactivated OSC failed"
}
-run_test 104 "lfs>df [-ih] [path] test ============"
+run_test 104 "lfs df [-ih] [path] test ========================="
TMPDIR=$OLDTMPDIR
TMP=$OLDTMP
}
run_test 23 " others should see updated atime while another read===="
+test_24() {
+ touch $DIR1/$tfile
+ lfs df || error "lfs df failed"
+ lfs df -ih || error "lfs df -ih failed"
+ lfs df -h $DIR1 || error "lfs df -h $DIR1 failed"
+ lfs df -i $DIR2 || error "lfs df -i $DIR2 failed"
+ lfs df $DIR1/$tfile || error "lfs df $DIR1/$tfile failed"
+ lfs df -ih $DIR2/$tfile || error "lfs df -ih $DIR2/$tfile failed"
+
+ OSC=`lctl dl | awk '/OSC.*MNT/ {print $4}' | head -n 1`
+ lctl --device %$OSC deactivate
+ lfs df -i || error "lfs df -i with deactivated OSC failed"
+ lctl --device %$OSC recover
+ lfs df || error "lfs df with reactivated OSC failed"
+}
+run_test 24 "lfs df [-ih] [path] test ========================="
+
+test_25() {
+ [ -z "`mount | grep " $DIR1 .*\<acl\>"`" ] && echo "skipping $TESTNAME ($DIR1 must have acl)" && return
+ [ -z "`mount | grep " $DIR2 .*\<acl\>"`" ] && echo "skipping $TESTNAME ($DIR2 must have acl)" && return
+
+ mkdir $DIR1/d25 || error
+ touch $DIR1/d25/f1 || error
+ chmod 0755 $DIR1/d25/f1 || error
+
+ $RUNAS checkstat $DIR2/d25/f1 || error
+ setfacl -m u:$RUNAS_ID:--- $DIR1/d25 || error
+ $RUNAS checkstat $DIR2/d25/f1 && error
+ setfacl -m u:$RUNAS_ID:r-x $DIR1/d25 || error
+ $RUNAS checkstat $DIR2/d25/f1 || error
+ setfacl -m u:$RUNAS_ID:--- $DIR1/d25 || error
+ $RUNAS checkstat $DIR2/d25/f1 && error
+ setfacl -x u:$RUNAS_ID: $DIR1/d25 || error
+ $RUNAS checkstat $DIR2/d25/f1 || error
+
+ rm -rf $DIR1/d25
+}
+run_test 25 "change ACL on one mountpoint be seen on another ==="
+
log "cleanup: ======================================================"
rm -rf $DIR1/[df][0-9]* $DIR1/lnk || true
def _get_val(self, k):
ret = None
+ if k == 'name':
+ k = 'lustreName'
if self._attrs.has_key(k):
v = self._attrs[k]
if type(v) == types.ListType:
else
progname++;
- if (strcmp(argv[1], "-d") == 0)
- debug = 1;
-
if (argc != 3) {
fprintf(stderr, "%s: bad parameter count\n", progname);
usage(stderr);
return EINVAL;
}
+
+ if (strcmp(argv[1], "-d") == 0)
+ debug = 1;
+
param->mgd_uid = strtoul(argv[2], &end, 0);
if (*end) {
fprintf(stderr, "%s: invalid uid '%s'\n", progname, argv[2]);
self.nspath = self.db.get_val('nspath', '')
self.mkfsoptions = '-i 4096 ' + self.db.get_val('mkfsoptions', '')
self.mountfsoptions = self.db.get_val('mountfsoptions', '')
- self.quota = self.db.get_val('quota', '')
+ if config.quota:
+ self.quota = config.quota
+ else:
+ self.quota = self.db.get_val('quota', '')
# overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
target_uuid = self.db.get_first_ref('target')
mds = self.db.lookup(target_uuid)
self.journal_size = self.db.get_val_int('journalsize', 0)
# now as we store fids in EA on OST we need to make inode bigger
- self.inode_size = self.db.get_val_int('inodesize', 256)
+ self.inode_size = self.db.get_val_int('inodesize', 0)
+ if self.inode_size == 0:
+ self.inode_size = 256
self.mkfsoptions = self.db.get_val('mkfsoptions', '')
# Allocate fewer inodes on large OST devices. Most filesystems
# can be much more aggressive than this, but by default we can't.
if self.size > 1000000:
self.mkfsoptions = '-i 16384 ' + self.mkfsoptions
self.mountfsoptions = self.db.get_val('mountfsoptions', '')
- self.quota = self.db.get_val('quota', '')
+ if config.quota:
+ self.quota = config.quota
+ else:
+ self.quota = self.db.get_val('quota', '')
self.fstype = self.db.get_val('fstype', '')
if sys_get_branch() == '2.4' and self.fstype == 'ldiskfs':
else:
for srv in this_nets:
lctl.connect(srv)
- break
if srv:
lctl.add_conn(self.name, srv.nid_uuid);
# virtual interface for OSC and LOV
class VOSC(Module):
- def __init__(self, db, uuid, fs_name, name_override = None):
+ def __init__(self, db, uuid, fs_name, name_override = None, quota = None):
Module.__init__(self, 'VOSC', db)
+ if quota:
+ self.add_lustre_module('quota', 'lquota')
if db.get_class() == 'lov':
self.osc = LOV(db, uuid, fs_name, name_override)
else:
def cleanup(self):
self.osc.cleanup()
def load_module(self):
+ Module.load_module(self)
self.osc.load_module()
def cleanup_module(self):
self.osc.cleanup_module()
+ Module.cleanup_module(self)
class ECHO_CLIENT(Module):
self.fs_uuid = self.db.get_first_ref('filesystem')
fs = self.db.lookup(self.fs_uuid)
self.mds_uuid = fs.get_first_ref('mds')
+ mds_db = self.db.lookup(self.mds_uuid)
+ if config.quota:
+ quota = config.quota
+ else:
+ quota = mds_db.get_val('quota', config.quota)
self.obd_uuid = fs.get_first_ref('obd')
obd = self.db.lookup(self.obd_uuid)
client_uuid = generate_client_uuid(self.name)
- self.vosc = VOSC(obd, client_uuid, self.name)
+ self.vosc = VOSC(obd, client_uuid, self.name, quota=quota)
self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)
- mds_db = self.db.lookup(self.mds_uuid)
- quota = mds_db.get_val('quota', '')
- if quota:
- self.add_lustre_module('quota', 'lquota')
self.add_lustre_module('mdc', 'mdc')
self.add_lustre_module('llite', 'llite')
PARAMLIST),
('user_xattr', """Enable user_xattr support on MDS""", FLAG, 0),
('acl', """Enable ACL support on MDS""", FLAG, 0),
+ ('quota', "Enable quota support for client file system", PARAM),
]
def main():
if (out_len > 0)
return 0;
-
+
fprintf(stderr, "error: lfs df: %s isn't mounted on lustre\n", path);
return -EINVAL;
}
static int showdf(char *mntdir, struct obd_statfs *stat,
- struct obd_uuid *uuid, int ishow, int cooked,
+ char *uuid, int ishow, int cooked,
char *type, int index, int rc)
{
__u64 avail, used, total;
double ratio = 0;
- int obd_type;
char *suffix = "KMGTPEZY";
char tbuf[10], ubuf[10], abuf[10], rbuf[10];
- if (!uuid || !stat || !type)
- return -EINVAL;
- if (!strncmp(type, "MDT", 3)) {
- obd_type = 0;
- } else if(!strncmp(type, "OST", 3)){
- obd_type = 1;
- } else {
- fprintf(stderr, "error: lfs df: invalid type '%s'\n", type);
+ if (!uuid || !stat)
return -EINVAL;
- }
- if (rc == 0) {
+ switch (rc) {
+ case 0:
if (ishow) {
avail = stat->os_ffree;
used = stat->os_files - stat->os_ffree;
total = stat->os_files;
} else {
- avail = stat->os_bavail * stat->os_bsize / 1024;
+ int shift = cooked ? 0 : 10;
+
+ avail = (stat->os_bavail * stat->os_bsize) >> shift;
used = stat->os_blocks - stat->os_bavail;
- used = used * stat->os_bsize / 1024;
- total = stat->os_blocks * stat->os_bsize / 1024;
+ used = (used * stat->os_bsize) >> shift;
+ total = (stat->os_blocks * stat->os_bsize) >> shift;
}
if (total > 0)
if (cooked) {
int i;
- double total_d, used_d, avail_d;
-
- total_d = (double)total;
- i = COOK(total_d);
+ double cook_val;
+
+ cook_val = (double)total;
+ i = COOK(cook_val);
if (i > 0)
- sprintf(tbuf, HDF"%c", total_d, suffix[i - 1]);
+ sprintf(tbuf, HDF"%c", cook_val, suffix[i - 1]);
else
sprintf(tbuf, CDF, total);
- used_d = (double)used;
- i = COOK(used_d);
+ cook_val = (double)used;
+ i = COOK(cook_val);
if (i > 0)
- sprintf(ubuf, HDF"%c", used_d, suffix[i - 1]);
+ sprintf(ubuf, HDF"%c", cook_val, suffix[i - 1]);
else
sprintf(ubuf, CDF, used);
- avail_d = (double)avail;
- i = COOK(avail_d);
+ cook_val = (double)avail;
+ i = COOK(cook_val);
if (i > 0)
- sprintf(abuf, HDF"%c", avail_d, suffix[i - 1]);
+ sprintf(abuf, HDF"%c", cook_val, suffix[i - 1]);
else
sprintf(abuf, CDF, avail);
} else {
}
sprintf(rbuf, RDF, (int)(ratio * 100));
- if (obd_type == 0)
- printf(UUF" "CSF" "CSF" "CSF" "RSF" %-s[MDT:%d]\n",
- (char *)uuid, tbuf, ubuf, abuf, rbuf,
- mntdir, index);
+ printf(UUF" "CSF" "CSF" "CSF" "RSF" %-s",
+ uuid, tbuf, ubuf, abuf, rbuf, mntdir);
+ if (type)
+ printf("[%s:%d]\n", type, index);
else
- printf(UUF" "CSF" "CSF" "CSF" "RSF" %-s[OST:%d]\n",
- (char *)uuid, tbuf, ubuf, abuf, rbuf,
- mntdir, index);
+ printf("\n");
- return 0;
- }
- switch (rc) {
+ break;
case -ENODATA:
- printf(UUF": inactive OST\n", (char *)uuid);
+ printf(UUF": inactive device\n", uuid);
break;
default:
- printf(UUF": %s\n", (char *)uuid, strerror(-rc));
+ printf(UUF": %s\n", uuid, strerror(-rc));
break;
}
static int mntdf(char *mntdir, int ishow, int cooked)
{
- struct obd_statfs stat_buf;
+ struct obd_statfs stat_buf, sum = { .os_bsize = 1 };
struct obd_uuid uuid_buf;
__u32 index;
- __u64 avail_sum, used_sum, total_sum;
- char tbuf[10], ubuf[10], abuf[10], rbuf[10];
- double ratio_sum = 0;
int rc;
if (ishow)
"IUse%", "Mounted on");
else
printf(UUF" "CSF" "CSF" "CSF" "RSF" %-s\n",
- "UUID", "1K-blocks", "Used", "Available",
- "Use%", "Mounted on");
+ "UUID", cooked ? "bytes" : "1K-blocks",
+ "Used", "Available", "Use%", "Mounted on");
- avail_sum = total_sum = 0;
for (index = 0; ; index++) {
memset(&stat_buf, 0, sizeof(struct obd_statfs));
memset(&uuid_buf, 0, sizeof(struct obd_uuid));
if (rc == -ENOTCONN || rc == -ETIMEDOUT || rc == -EIO ||
rc == -ENODATA || rc == 0) {
- showdf(mntdir, &stat_buf, &uuid_buf, ishow, cooked,
+ showdf(mntdir, &stat_buf, uuid_buf.uuid, ishow, cooked,
"MDT", index, rc);
} else {
fprintf(stderr,
uuid_buf.uuid, strerror(-rc), rc);
return rc;
}
- if (!rc && ishow) {
- avail_sum += stat_buf.os_ffree;
- total_sum += stat_buf.os_files;
+ if (rc == 0) {
+ sum.os_ffree += stat_buf.os_ffree;
+ sum.os_files += stat_buf.os_files;
}
}
- for (index = 0;;index++) {
+ for (index = 0; ; index++) {
memset(&stat_buf, 0, sizeof(struct obd_statfs));
memset(&uuid_buf, 0, sizeof(struct obd_uuid));
rc = llapi_obd_statfs(mntdir, LL_STATFS_LOV, index,
if (rc == -ENOTCONN || rc == -ETIMEDOUT || rc == -EIO ||
rc == -ENODATA || rc == 0) {
- showdf(mntdir, &stat_buf, &uuid_buf, ishow, cooked,
+ showdf(mntdir, &stat_buf, uuid_buf.uuid, ishow, cooked,
"OST", index, rc);
} else {
fprintf(stderr,
strerror(-rc), rc);
return rc;
}
- if (!rc && !ishow) {
- __u64 avail, total;
- avail = stat_buf.os_bavail * stat_buf.os_bsize;
- avail /= 1024;
- total = stat_buf.os_blocks * stat_buf.os_bsize;
- total /= 1024;
-
- avail_sum += avail;
- total_sum += total;
+ if (rc == 0) {
+ sum.os_blocks += stat_buf.os_blocks * stat_buf.os_bsize;
+ sum.os_bfree += stat_buf.os_bfree * stat_buf.os_bsize;
+ sum.os_bavail += stat_buf.os_bavail * stat_buf.os_bsize;
}
}
- used_sum = total_sum - avail_sum;
- if (total_sum > 0)
- ratio_sum = (double)(total_sum - avail_sum) / (double)total_sum;
- sprintf(rbuf, RDF, (int)(ratio_sum * 100));
- if (cooked) {
- int i;
- char *suffix = "KMGTPEZY";
- double total_sum_d, used_sum_d, avail_sum_d;
-
- total_sum_d = (double)total_sum;
- i = COOK(total_sum_d);
- if (i > 0)
- sprintf(tbuf, HDF"%c", total_sum_d, suffix[i - 1]);
- else
- sprintf(tbuf, CDF, total_sum);
-
- used_sum_d = (double)used_sum;
- i = COOK(used_sum_d);
- if (i > 0)
- sprintf(ubuf, HDF"%c", used_sum_d, suffix[i - 1]);
- else
- sprintf(ubuf, CDF, used_sum);
-
- avail_sum_d = (double)avail_sum;
- i = COOK(avail_sum_d);
- if (i > 0)
- sprintf(abuf, HDF"%c", avail_sum_d, suffix[i - 1]);
- else
- sprintf(abuf, CDF, avail_sum);
- } else {
- sprintf(tbuf, CDF, total_sum);
- sprintf(ubuf, CDF, used_sum);
- sprintf(abuf, CDF, avail_sum);
- }
-
- printf("\n"UUF" "CSF" "CSF" "CSF" "RSF" %-s\n",
- "filesystem summary:", tbuf, ubuf, abuf, rbuf, mntdir);
+ printf("\n");
+ showdf(mntdir, &sum, "filesystem summary:", ishow, cooked, NULL, 0,0);
return 0;
}
int nomtab;
int fake;
int force;
+int retry;
static char *progname = NULL;
+#define MAX_RETRIES 99
void usage(FILE *out)
{
"\t-v|--verbose: print verbose config settings\n"
"\t-o: filesystem mount options:\n"
"\t\tflock/noflock: enable/disable flock support\n"
+ "\t\troute=<gw>[-<gw>]:<low>[-<high>]: portal route to MDS\n"
"\t\tuser_xattr/nouser_xattr: enable/disable user extended "
"attributes\n"
);
fprintf(stderr, "%s: addmntent: %s:",
progname, strerror (errno));
rc = 16;
+ } else if (verbose > 1) {
+ fprintf(stderr, "%s: added %s on %s to %s\n",
+ progname, spec, mtpt, MOUNTED);
}
endmntent(fp);
}
fprintf(out, "mds name: %s\n", lmd->lmd_mds);
fprintf(out, "profile: %s\n", lmd->lmd_profile);
fprintf(out, "options: %s\n", options);
+ fprintf(out, "retry: %d\n", retry);
return 0;
}
if ((opteq = strchr(opt, '='))) {
val = atoi(opteq + 1);
*opteq = '\0';
- if (0) {
- /* All the network options have gone :)) */
+ if (!strcmp(opt, "retry")) {
+ if (val >= 0 && val < MAX_RETRIES)
+ retry = val;
+ else
+ retry = 0;
} else {
fprintf(stderr, "%s: unknown option '%s'. "
"Ignoring.\n", progname, opt);
switch (opt) {
case 1:
++force;
- printf("force: %d\n", force);
+ if (verbose)
+ printf("force: %d\n", force);
nargs++;
break;
case 'f':
++fake;
- printf("fake: %d\n", fake);
+ if (verbose)
+ printf("fake: %d\n", fake);
nargs++;
break;
case 'h':
break;
case 'n':
++nomtab;
- printf("nomtab: %d\n", nomtab);
+ if (verbose)
+ printf("nomtab: %d\n", nomtab);
nargs++;
break;
case 'o':
return 1;
}
- if (!fake)
- rc = mount(source, target, "lustre", flags, (void *)&lmd);
+ if (!fake) {
+ FILE *modpipe = popen("/sbin/modprobe -q llite", "r");
+ if (modpipe != NULL)
+ pclose(modpipe);
+ /* use <= to include the initial mount before we retry */
+ for (i = 0, rc = -EAGAIN; i <= retry && rc != 0; i++)
+ rc = mount(source, target, "lustre", flags, &lmd);
+ }
if (rc) {
fprintf(stderr, "%s: mount(%s, %s) failed: %s\n", progname,
source, target, strerror(errno));
print_options(stderr, &lmd, options);
- if (errno == ENODEV)
+ if (errno == ENODEV) {
+ struct utsname unamebuf;
+ char *modfile = "/etc/modutils.conf";
+
+ if (uname(&unamebuf) == 0 &&
+ strncmp(unamebuf.release, "2.4", 3) == 0)
+ modfile = "/etc/modules.conf";
+
fprintf(stderr, "Are the lustre modules loaded?\n"
- "Check /etc/modules.conf and /proc/filesystems\n");
+ "Check %s and /proc/filesystems\n", modfile);
+ }
rc = 32;
} else if (!nomtab) {
rc = update_mtab_entry(source, target, "lustre", options,0,0,0);
('mdsuuid', "Optional argument to specify MDS UUID", PARAM,""),
('nspath', "Local mount point of server namespace.", PARAM,""),
('format', ""),
- ('quota', "quotaon:enable quota, only u|g|ug is supported now. \
- iunit: the unit for slave to acquire/release inode quota from/to masteri.\
- Int type (>0), default value in Lustre is 5000 inodes.\
- bunit: the unit for slave to acquire/release block quota from/to master.\
- Mbytes (>0), default value in Lustre is 100(Mbytes).\
- itune: used to tune the threthold. When inode quota usage reach the threthold,\
- slave should acquire/release inode quota from/to master.\
- Int type (100 > btune > 0), default value in Lustre is 50 (percentge).\
- inode threthold = iunit * itune / 100.\
- btune: used to tune the threthold. When block quota usage reach the threthold,\
- slave should acquire/release block quota from/to master.\
- Int type (100 > btune > 0), default value in Lustre is 50 (percentage).\
- block threthold = bunit * btune / 100.", PARAM,""),
+ ('quota', """
+ quotaon: enable quota, only u|g|ug is supported now.
+ iunit: the unit for slave to acquire/release inode quota from/to master.
+ Int type (>0), default value in Lustre is 5000 inodes.
+ bunit: the unit for slave to acquire/release block quota from/to master.
+ Mbytes (>0), default value in Lustre is 100(Mbytes).
+ itune: used to tune the threshold. When inode quota usage reaches the threshold,
+ slave should acquire/release inode quota from/to master.
+ Int type (100 > itune > 0), default value in Lustre is 50 (percentage).
+ inode threshold = iunit * itune / 100.
+ btune: used to tune the threshold. When block quota usage reaches the threshold,
+ slave should acquire/release block quota from/to master.
+ Int type (100 > btune > 0), default value in Lustre is 50 (percentage).
+ block threshold = bunit * btune / 100.""", PARAM,""),
# clients: mountpoint and echo
('echo_client', "", PARAM),
('path', "Specify the mountpoint for Lustre.", PARAM),
#!/bin/sh
-rmmod llite
-rmmod mdc
-rmmod lov
-rmmod osc
-rmmod obdfilter
-rmmod fsfilt_ext3
-rmmod fsfilt_ldiskfs
-rmmod ldiskfs
-rmmod ost
-rmmod mds
-rmmod ptlrpc
-rmmod obdclass
-rmmod lvfs
-rmmod ksocklnd
-rmmod lnet
-rmmod libcfs
+SRCDIR=`dirname $0`
+PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH
+
+lctl modules | awk '{ print $2 }' | xargs rmmod >/dev/null 2>&1
+# do it again, in case we tried to unload ksocklnd too early
+lctl modules | awk '{ print $2 }' | xargs rmmod
CHECK_VALUE(FMODE_READ);
CHECK_VALUE(FMODE_WRITE);
- CHECK_VALUE(FMODE_EXEC);
+ CHECK_VALUE(MDS_FMODE_EXEC);
CHECK_CDEFINE(MDS_OPEN_CREAT);
CHECK_CDEFINE(MDS_OPEN_EXCL);
(long long)FMODE_READ);
LASSERTF(FMODE_WRITE == 2, " found %lld\n",
(long long)FMODE_WRITE);
- LASSERTF(FMODE_EXEC == 4, " found %lld\n",
- (long long)FMODE_EXEC);
+ LASSERTF(MDS_FMODE_EXEC == 4, " found %lld\n",
+ (long long)MDS_FMODE_EXEC);
CLASSERT(MDS_OPEN_CREAT == 00000100);
CLASSERT(MDS_OPEN_EXCL == 00000200);
CLASSERT(MDS_OPEN_TRUNC == 00001000);