Whamcloud - gitweb
LU-1337 llite: kernel 3.1 kills inode->i_alloc_sem
author: Liu Xuezhao <xuezhao.liu@emc.com>
Thu, 27 Sep 2012 06:20:25 +0000 (14:20 +0800)
committer: Oleg Drokin <green@whamcloud.com>
Fri, 2 Nov 2012 19:30:33 +0000 (15:30 -0400)
Kernel 3.1 kills inode->i_alloc_sem, use i_dio_count and
inode_dio_wait/inode_dio_done instead.
(kernel commit bd5fe6c5eb9c548d7f07fe8f89a150bb6705e8e3).

Add HAVE_INODE_DIO_WAIT to differentiate it.

Signed-off-by: Liu Xuezhao <xuezhao.liu@emc.com>
Change-Id: Ife36e07a85c76153985a4a86ee1973262c4c0e27
Reviewed-on: http://review.whamcloud.com/3582
Tested-by: Hudson
Reviewed-by: Jinshan Xiong <jinshan.xiong@whamcloud.com>
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lustre/autoconf/lustre-core.m4
lustre/include/linux/lustre_compat25.h
lustre/llite/llite_lib.c
lustre/llite/vvp_io.c
lustre/llite/vvp_page.c
lustre/osc/osc_cache.c
lustre/osd-ldiskfs/osd_io.c

index 9798be0..fa0d526 100644 (file)
@@ -1794,40 +1794,61 @@ LB_LINUX_TRY_COMPILE([
 ])
 
 #
-# 3.1.1 has ext4_blocks_for_truncate
+# 3.1 renames lock-manager ops(lock_manager_operations) from fl_xxx to lm_xxx
+# see kernel commit 8fb47a4fbf858a164e973b8ea8ef5e83e61f2e50
 #
-AC_DEFUN([LC_BLOCKS_FOR_TRUNCATE],
-[AC_MSG_CHECKING([if kernel has ext4_blocks_for_truncate])
+AC_DEFUN([LC_LM_XXX_LOCK_MANAGER_OPS],
+[AC_MSG_CHECKING([if lock-manager ops renamed to lm_xxx])
 LB_LINUX_TRY_COMPILE([
        #include <linux/fs.h>
-       #include "$LINUX/fs/ext4/ext4_jbd2.h"
-       #include "$LINUX/fs/ext4/truncate.h"
 ],[
-       ext4_blocks_for_truncate(NULL);
+       struct lock_manager_operations lm_ops;
+       lm_ops.lm_compare_owner = NULL;
 ],[
+       AC_DEFINE(HAVE_LM_XXX_LOCK_MANAGER_OPS, 1,
+                 [lock-manager ops renamed to lm_xxx])
        AC_MSG_RESULT([yes])
-       AC_DEFINE(HAVE_BLOCKS_FOR_TRUNCATE, 1,
-                 [kernel has ext4_blocks_for_truncate])
 ],[
        AC_MSG_RESULT([no])
 ])
 ])
 
 #
-# 3.1 renames lock-manager ops(lock_manager_operations) from fl_xxx to lm_xxx
-# see kernel commit 8fb47a4fbf858a164e973b8ea8ef5e83e61f2e50
+# 3.1 kills inode->i_alloc_sem, use i_dio_count and inode_dio_wait/
+#     inode_dio_done instead.
+# see kernel commit bd5fe6c5eb9c548d7f07fe8f89a150bb6705e8e3
 #
-AC_DEFUN([LC_LM_XXX_LOCK_MANAGER_OPS],
-[AC_MSG_CHECKING([if lock-manager ops renamed to lm_xxx])
+AC_DEFUN([LC_INODE_DIO_WAIT],
+[AC_MSG_CHECKING([if inode->i_alloc_sem is killed and use inode_dio_wait/done.])
 LB_LINUX_TRY_COMPILE([
        #include <linux/fs.h>
 ],[
-       struct lock_manager_operations lm_ops;
-       lm_ops.lm_compare_owner = NULL;
+       inode_dio_wait((struct inode *)0);
+       inode_dio_done((struct inode *)0);
+],[
+       AC_DEFINE(HAVE_INODE_DIO_WAIT, 1,
+                 [inode->i_alloc_sem is killed and use inode_dio_wait/done])
+       AC_MSG_RESULT([yes])
+],[
+       AC_MSG_RESULT([no])
+])
+])
+
+#
+# 3.1.1 has ext4_blocks_for_truncate
+#
+AC_DEFUN([LC_BLOCKS_FOR_TRUNCATE],
+[AC_MSG_CHECKING([if kernel has ext4_blocks_for_truncate])
+LB_LINUX_TRY_COMPILE([
+       #include <linux/fs.h>
+       #include "$LINUX/fs/ext4/ext4_jbd2.h"
+       #include "$LINUX/fs/ext4/truncate.h"
+],[
+       ext4_blocks_for_truncate(NULL);
 ],[
-       AC_DEFINE(HAVE_LM_XXX_LOCK_MANAGER_OPS, 1,
-                 [lock-manager ops renamed to lm_xxx])
        AC_MSG_RESULT([yes])
+       AC_DEFINE(HAVE_BLOCKS_FOR_TRUNCATE, 1,
+                 [kernel has ext4_blocks_for_truncate])
 ],[
        AC_MSG_RESULT([no])
 ])
@@ -2013,11 +2034,12 @@ AC_DEFUN([LC_PROG_LINUX],
          LC_REQUEST_QUEUE_UNPLUG_FN
         LC_HAVE_FSTYPE_MOUNT
 
-        # 3.1.1
-        LC_BLOCKS_FOR_TRUNCATE
-
         # 3.1
         LC_LM_XXX_LOCK_MANAGER_OPS
+        LC_INODE_DIO_WAIT
+
+        # 3.1.1
+        LC_BLOCKS_FOR_TRUNCATE
 
         # 3.3
         LC_HAVE_MIGRATE_HEADER
index 6173836..9362d9f 100644 (file)
@@ -172,13 +172,17 @@ static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
                 (type *)( (char *)__mptr - offsetof(type,member) );})
 #endif
 
-#define UP_WRITE_I_ALLOC_SEM(i)   up_write(&(i)->i_alloc_sem)
-#define DOWN_WRITE_I_ALLOC_SEM(i) down_write(&(i)->i_alloc_sem)
-#define LASSERT_I_ALLOC_SEM_WRITE_LOCKED(i) LASSERT(down_read_trylock(&(i)->i_alloc_sem) == 0)
-
-#define UP_READ_I_ALLOC_SEM(i)    up_read(&(i)->i_alloc_sem)
-#define DOWN_READ_I_ALLOC_SEM(i)  down_read(&(i)->i_alloc_sem)
-#define LASSERT_I_ALLOC_SEM_READ_LOCKED(i) LASSERT(down_write_trylock(&(i)->i_alloc_sem) == 0)
+#ifdef HAVE_INODE_DIO_WAIT
+/* inode_dio_wait(i) use as-is for write lock */
+# define inode_dio_write_done(i)       do {} while (0) /* for write unlock */
+# define inode_dio_read(i)             atomic_inc(&(i)->i_dio_count)
+/* inode_dio_done(i) use as-is for read unlock */
+#else
+# define inode_dio_wait(i)             down_write(&(i)->i_alloc_sem)
+# define inode_dio_write_done(i)       up_write(&(i)->i_alloc_sem)
+# define inode_dio_read(i)             down_read(&(i)->i_alloc_sem)
+# define inode_dio_done(i)             up_read(&(i)->i_alloc_sem)
+#endif
 
 #include <linux/mpage.h>        /* for generic_writepages */
 
index f1aa2af..2a0d6d6 100644 (file)
@@ -1465,12 +1465,12 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr)
 
        if (!S_ISDIR(inode->i_mode)) {
                if (ia_valid & ATTR_SIZE)
-                       UP_WRITE_I_ALLOC_SEM(inode);
+                       inode_dio_write_done(inode);
                mutex_unlock(&inode->i_mutex);
                cfs_down_write(&lli->lli_trunc_sem);
                mutex_lock(&inode->i_mutex);
                if (ia_valid & ATTR_SIZE)
-                       DOWN_WRITE_I_ALLOC_SEM(inode);
+                       inode_dio_wait(inode);
        }
 
        /* We need a steady stripe configuration for setattr to avoid
index f8b28b2..a656e71 100644 (file)
@@ -292,9 +292,9 @@ static int vvp_io_setattr_iter_init(const struct lu_env *env,
         * This last one is especially bad for racing o_append users on other
         * nodes.
         */
-       mutex_unlock(&inode->i_mutex);
        if (cl_io_is_trunc(ios->cis_io))
-               UP_WRITE_I_ALLOC_SEM(inode);
+               inode_dio_write_done(inode);
+       mutex_unlock(&inode->i_mutex);
        cio->u.setattr.cui_locks_released = 1;
        return 0;
 }
@@ -346,7 +346,7 @@ static int vvp_io_setattr_trunc(const struct lu_env *env,
                                 const struct cl_io_slice *ios,
                                 struct inode *inode, loff_t size)
 {
-       DOWN_WRITE_I_ALLOC_SEM(inode);
+       inode_dio_wait(inode);
        return 0;
 }
 
@@ -418,7 +418,7 @@ static void vvp_io_setattr_fini(const struct lu_env *env,
        if (cio->u.setattr.cui_locks_released) {
                mutex_lock(&inode->i_mutex);
                if (cl_io_is_trunc(io))
-                       DOWN_WRITE_I_ALLOC_SEM(inode);
+                       inode_dio_wait(inode);
                cio->u.setattr.cui_locks_released = 0;
        }
        vvp_io_fini(env, ios);
@@ -659,18 +659,19 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
 static int vvp_io_fault_start(const struct lu_env *env,
                               const struct cl_io_slice *ios)
 {
-        struct vvp_io       *vio     = cl2vvp_io(env, ios);
-        struct cl_io        *io      = ios->cis_io;
-        struct cl_object    *obj     = io->ci_obj;
-        struct inode        *inode   = ccc_object_inode(obj);
-        struct cl_fault_io  *fio     = &io->u.ci_fault;
-        struct vvp_fault_io *cfio    = &vio->u.fault;
-        loff_t               offset;
-        int                  result  = 0;
-        cfs_page_t          *vmpage  = NULL;
-        struct cl_page      *page;
-        loff_t               size;
-        pgoff_t              last; /* last page in a file data region */
+       struct vvp_io       *vio     = cl2vvp_io(env, ios);
+       struct cl_io        *io      = ios->cis_io;
+       struct cl_object    *obj     = io->ci_obj;
+       struct inode        *inode   = ccc_object_inode(obj);
+       struct ll_inode_info *lli    = ll_i2info(inode);
+       struct cl_fault_io  *fio     = &io->u.ci_fault;
+       struct vvp_fault_io *cfio    = &vio->u.fault;
+       loff_t               offset;
+       int                  result  = 0;
+       cfs_page_t          *vmpage  = NULL;
+       struct cl_page      *page;
+       loff_t               size;
+       pgoff_t              last; /* last page in a file data region */
 
         if (fio->ft_executable &&
             LTIME_S(inode->i_mtime) != vio->u.fault.ft_mtime)
@@ -685,30 +686,32 @@ static int vvp_io_fault_start(const struct lu_env *env,
         if (result != 0)
                 return result;
 
-        /* must return locked page */
-        if (fio->ft_mkwrite) {
-               /* we grab alloc_sem to exclude truncate case.
+       /* must return locked page */
+       if (fio->ft_mkwrite) {
+               /* we grab lli_trunc_sem to exclude truncate case.
                 * Otherwise, we could add dirty pages into osc cache
                 * while truncate is on-going. */
-               DOWN_READ_I_ALLOC_SEM(inode);
-
-                LASSERT(cfio->ft_vmpage != NULL);
-                lock_page(cfio->ft_vmpage);
-        } else {
-                result = vvp_io_kernel_fault(cfio);
-                if (result != 0)
-                        return result;
-        }
+               cfs_down_read(&lli->lli_trunc_sem);
+
+               LASSERT(cfio->ft_vmpage != NULL);
+               lock_page(cfio->ft_vmpage);
+       } else {
+               result = vvp_io_kernel_fault(cfio);
+               if (result != 0)
+                       return result;
+       }
 
-        vmpage = cfio->ft_vmpage;
-        LASSERT(PageLocked(vmpage));
+       vmpage = cfio->ft_vmpage;
+       LASSERT(PageLocked(vmpage));
 
-        if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE))
-                ll_invalidate_page(vmpage);
+       if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE))
+               ll_invalidate_page(vmpage);
 
+       size = i_size_read(inode);
         /* Though we have already held a cl_lock upon this page, but
          * it still can be truncated locally. */
-        if (unlikely(vmpage->mapping == NULL)) {
+       if (unlikely((vmpage->mapping != inode->i_mapping) ||
+                    (page_offset(vmpage) > size))) {
                 CDEBUG(D_PAGE, "llite: fault and truncate race happened!\n");
 
                 /* return +1 to stop cl_io_loop() and ll_fault() will catch
@@ -756,7 +759,6 @@ static int vvp_io_fault_start(const struct lu_env *env,
                 }
         }
 
-        size = i_size_read(inode);
         last = cl_index(obj, size - 1);
         LASSERT(fio->ft_index <= last);
         if (fio->ft_index == last)
@@ -772,11 +774,11 @@ static int vvp_io_fault_start(const struct lu_env *env,
         EXIT;
 
 out:
-        /* return unlocked vmpage to avoid deadlocking */
+       /* return unlocked vmpage to avoid deadlocking */
        if (vmpage != NULL)
                unlock_page(vmpage);
        if (fio->ft_mkwrite)
-               UP_READ_I_ALLOC_SEM(inode);
+               cfs_up_read(&lli->lli_trunc_sem);
 #ifdef HAVE_VM_OP_FAULT
        cfio->fault.ft_flags &= ~VM_FAULT_LOCKED;
 #endif
index ffb4464..e200c05 100644 (file)
@@ -429,7 +429,6 @@ static void vvp_transient_page_verify(const struct cl_page *page)
        struct inode *inode = ccc_object_inode(page->cp_obj);
 
        LASSERT(!mutex_trylock(&inode->i_mutex));
-       /* LASSERT_SEM_LOCKED(&inode->i_alloc_sem); */
 }
 
 static int vvp_transient_page_own(const struct lu_env *env,
index 061fdea..519ed39 100644 (file)
@@ -2722,9 +2722,9 @@ void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio,
  * The caller must have called osc_cache_writeback_range() to issue IO
  * otherwise it will take a long time for this function to finish.
  *
- * Caller must hold inode_mutex and i_alloc_sem, or cancel exclusive
- * dlm lock so that nobody else can dirty this range of file while we're
- * waiting for extents to be written.
+ * Caller must hold inode_mutex , or cancel exclusive dlm lock so that
+ * nobody else can dirty this range of file while we're waiting for
+ * extents to be written.
  */
 int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
                         pgoff_t start, pgoff_t end)
index 2630fe8..0e19e80 100644 (file)
@@ -448,7 +448,6 @@ struct page *osd_get_page(struct dt_object *dt, loff_t offset, int rw)
 /*
  * there are following "locks":
  * journal_start
- * i_alloc_sem
  * i_mutex
  * page lock