Whamcloud - gitweb
land b1_5 onto HEAD
[fs/lustre-release.git] / lustre / kernel_patches / patches / ext3-multi-mount-protection-2.6.18-vanilla.patch
diff --git a/lustre/kernel_patches/patches/ext3-multi-mount-protection-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/ext3-multi-mount-protection-2.6.18-vanilla.patch
new file mode 100644 (file)
index 0000000..989ca26
--- /dev/null
@@ -0,0 +1,381 @@
+Index: mmp/fs/ext3/al.h
+===================================================================
+--- /dev/null  1970-01-01 00:00:00.000000000 +0000
++++ mmp/fs/ext3/al.h   2006-07-18 20:43:51.000000000 +0800
+@@ -0,0 +1,11 @@
++/*
++ * (C) 2006  Qi Yong <qiyong@clusterfs.com>
++ */
++
++#define       ALIVE_MAGIC     0xA1153C29      /* value stored in al_magic */
++struct alive_struct {         /* overlaid on the data of the alive block */
++      __le32  al_magic;       /* ALIVE_MAGIC; checked at mount time */
++      __le32  al_seq;         /* bumped by kalived on each write; 0 once it stops */
++      __le32  al_time;        /* get_seconds() when the block was written */
++      char    al_nodename[65];        /* utsname nodename of the writing node */
++};
+Index: mmp/fs/ext3/namei.c
+===================================================================
+--- mmp.orig/fs/ext3/namei.c   2006-07-18 20:43:51.000000000 +0800
++++ mmp/fs/ext3/namei.c        2006-07-18 20:43:51.000000000 +0800
+@@ -805,7 +805,7 @@ static inline int search_dirblock(struct
+  * The returned buffer_head has ->b_count elevated.  The caller is expected
+  * to brelse() it when appropriate.
+  */
+-static struct buffer_head * ext3_find_entry (struct dentry *dentry,
++struct buffer_head * ext3_find_entry (struct dentry *dentry,
+                                       struct ext3_dir_entry_2 ** res_dir)
+ {
+       struct super_block * sb;
+Index: mmp/fs/ext3/super.c
+===================================================================
+--- mmp.orig/fs/ext3/super.c   2006-07-18 20:43:51.000000000 +0800
++++ mmp/fs/ext3/super.c        2006-07-18 23:49:54.000000000 +0800
+@@ -35,12 +35,14 @@
+ #include <linux/namei.h>
+ #include <linux/quotaops.h>
+ #include <linux/seq_file.h>
++#include <linux/kthread.h>
+ #include <asm/uaccess.h>
+ #include "xattr.h"
+ #include "acl.h"
+ #include "namei.h"
++#include "al.h"
+ static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
+                            unsigned long journal_devnum);
+@@ -61,6 +63,8 @@ static int ext3_statfs (struct dentry * 
+ static void ext3_unlockfs(struct super_block *sb);
+ static void ext3_write_super (struct super_block * sb);
+ static void ext3_write_super_lockfs(struct super_block *sb);
++struct buffer_head * ext3_find_entry (struct dentry *dentry,
++                                      struct ext3_dir_entry_2 ** res_dir);
+ /* 
+  * Wrappers for journal_start/end.
+@@ -434,6 +438,9 @@ static void ext3_put_super (struct super
+               invalidate_bdev(sbi->journal_bdev, 0);
+               ext3_blkdev_remove(sbi);
+       }
++      if (sbi->s_alive_tsk)
++              kthread_stop(sbi->s_alive_tsk);
++
+       sb->s_fs_info = NULL;
+       kfree(sbi);
+       return;
+@@ -1374,6 +1381,261 @@ static ext3_fsblk_t descriptor_loc(struc
+       return (has_super + ext3_group_first_block_no(sb, bg));
+ }
++/*
++ * Write @bh to disk and wait for the I/O to finish.
++ *
++ * The buffer is locked, submitted with end_buffer_write_sync as its
++ * completion handler, and waited on.  The caller's reference on @bh is
++ * kept in every case.
++ *
++ * Returns 0 if the buffer is uptodate after the write, 1 otherwise.
++ */
++static int write_alive(struct buffer_head * bh)
++{
++      int failed;
++
++      lock_buffer(bh);
++      get_bh(bh);             /* extra reference taken for the I/O */
++      bh->b_end_io = end_buffer_write_sync;
++      submit_bh(WRITE, bh);
++      wait_on_buffer(bh);
++
++      failed = !buffer_uptodate(bh);
++      return unlikely(failed) ? 1 : 0;
++}
++
++/*
++ * Read @bh from disk again and wait for the I/O to finish.
++ *
++ * The buffer is locked, submitted with end_buffer_read_sync as its
++ * completion handler, and waited on.
++ *
++ * Returns 0 if the buffer is uptodate afterwards.  Otherwise returns 1
++ * after calling brelse() on @bh, so the caller must not release the
++ * buffer again on the failure path.
++ */
++static int read_alive_again(struct buffer_head * bh)
++{
++      lock_buffer(bh);
++      get_bh(bh);             /* extra reference taken for the I/O */
++      bh->b_end_io = end_buffer_read_sync;
++      submit_bh(READ, bh);
++      wait_on_buffer(bh);
++
++      if (buffer_uptodate(bh))
++              return 0;
++
++      brelse(bh);
++      return 1;
++}
++
++/*
++ * Heartbeat thread for the alive block.
++ *
++ * @data is the buffer_head of the alive block.  The caller must have a ref
++ * on the buffer_head; this thread releases it with brelse() before it
++ * exits.
++ *
++ * The block is stamped with ALIVE_MAGIC and the local nodename.  Until
++ * kthread_should_stop() reports true it is then written out every 5
++ * seconds with a new sequence number and timestamp.  On the way out one
++ * more write with al_seq == 0 is issued.
++ *
++ * Always returns 0.
++ */
++static int kalived(void *data)
++{
++      struct buffer_head * bh = data;
++      struct alive_struct * alive;
++      char b[BDEVNAME_SIZE];
++      u32 seq = 0;
++
++      bdevname(bh->b_bdev, b);
++
++      alive = (struct alive_struct *)(bh->b_data);
++      alive->al_magic = cpu_to_le32(ALIVE_MAGIC);
++      alive->al_time = cpu_to_le32(get_seconds());
++
++      down_read(&uts_sem);
++      memcpy(alive->al_nodename, system_utsname.nodename,
++              sizeof(alive->al_nodename));
++      up_read(&uts_sem);
++
++      while (!kthread_should_stop()) {
++              /* 0 is what the final write below uses: skip it on wrap */
++              if (++seq == 0)
++                      ++seq;
++
++              alive->al_seq = cpu_to_le32(seq);
++              alive->al_time = cpu_to_le32(get_seconds());
++
++              /*
++               * A failed write is only reported.  Fall through to the
++               * sleep so that a persistent I/O error costs one retry per
++               * interval instead of a tight loop of writes and printk()s.
++               */
++              if (unlikely(write_alive(bh)))
++                      printk(KERN_ERR "Alive (device %s): "
++                              "can't write alive block\n", b);
++
++              schedule_timeout_interruptible(5 * HZ);
++      }
++
++      /* final write: sequence number 0 */
++      alive->al_seq = 0;
++      alive->al_time = cpu_to_le32(get_seconds());
++
++      if (unlikely(write_alive(bh)))
++              printk(KERN_ERR "Alive (device %s): "
++                      "can't reset alive block\n", b);
++      brelse(bh);
++      return 0;
++}
++
++/*
++ * Look up the directory entry ".alive" in the root directory of @sb.
++ *
++ * Returns the inode number stored in that entry, or 0 when the root inode
++ * cannot be obtained or the entry is not found.
++ *
++ * NOTE(review): ext3_fill_super() reaches this (via check_alive()) ahead
++ * of its own warning "Don't try root first: it may be modified in the
++ * journal!" -- confirm that reading the root directory this early is
++ * acceptable.
++ */
++static unsigned long get_alive_ino(struct super_block *sb)
++{
++      unsigned long   ino = 0;
++      /*
++       * Stack dentry that only carries name and parent for
++       * ext3_find_entry().  The initializer zeroes every other field so
++       * the lookup never sees stack garbage.
++       */
++      struct dentry   alive = {
++              .d_name = { .name = ".alive", .len = 6 },
++      };
++      struct dentry   * root;
++      struct inode    * root_inode;
++      struct ext3_dir_entry_2 * de;
++      struct buffer_head      * bh;
++
++      root_inode = iget(sb, EXT3_ROOT_INO);
++      /* reject a missing root inode as well as one marked bad */
++      if (!root_inode || is_bad_inode(root_inode))
++              goto no_root;
++
++      root = d_alloc_root(root_inode);
++      if (!root)
++              goto no_root;
++
++      alive.d_parent = root;
++
++      bh = ext3_find_entry(&alive, &de);
++      dput(root);
++
++      if (!bh) {
++              printk(KERN_WARNING "Alive (device %s): alive lookup failed\n",
++                      sb->s_id);
++              goto out;
++      }
++
++      ino = le32_to_cpu(de->inode);
++      brelse (bh);
++      pr_debug("Alive (device %s): alive_ino=%lu\n", sb->s_id, ino);
++out:
++      return ino;
++
++no_root:
++      printk(KERN_ERR "Alive (device %s): get root inode failed\n",
++              sb->s_id);
++      if (root_inode)
++              iput(root_inode);
++      return 0;
++}
++
++/*
++ * Check the alive file before the filesystem is mounted.
++ *
++ * The first data block of the ".alive" file in the root directory is read
++ * and its magic verified.  Unless its sequence number is 0, the block is
++ * read again after 8 seconds and the check fails if the sequence number
++ * changed.  Then a random sequence number is written, and after another 6
++ * seconds the block must still carry that number.
++ *
++ * On success the kalived thread is started on the block and stored in
++ * sbi->s_alive_tsk; the buffer reference is handed over to that thread.
++ *
++ * Returns 0 on success and 1 on failure.  On failure no buffer reference
++ * is kept and sbi->s_alive_tsk is not modified.
++ */
++static int check_alive(struct super_block *sb, struct ext3_sb_info *sbi)
++{
++      unsigned long           ino;
++      struct buffer_head      * bh;
++      struct ext3_inode_info  * ei;
++      struct inode            * alive_inode;
++      struct alive_struct     * alive;
++      struct task_struct      * tsk;
++      u32 alive_block;
++      u32 seq;
++
++      ino = get_alive_ino(sb);
++      if (!ino)
++              goto failed;
++
++      alive_inode = iget(sb, ino);
++      if (!alive_inode || is_bad_inode(alive_inode)) {
++              if (alive_inode)
++                      iput(alive_inode);
++              printk(KERN_ERR "Alive (device %s): get alive inode failed\n",
++                      sb->s_id);
++              goto failed;
++      }
++      if (!alive_inode->i_nlink) {
++              make_bad_inode(alive_inode);
++              iput(alive_inode);
++              printk(KERN_ERR "Alive (device %s): alive inode is deleted\n",
++                      sb->s_id);
++              goto failed;
++      }
++      if (!S_ISREG(alive_inode->i_mode)) {
++              iput(alive_inode);
++              printk(KERN_ERR "Alive (device %s): invalid alive inode\n",
++                      sb->s_id);
++              goto failed;
++      }
++      if (EXT3_I(alive_inode)->i_flags & EXT3_EXTENTS_FL) {
++              iput(alive_inode);
++              printk(KERN_ERR "Alive (device %s): invalid alive inode, "
++                      "in extents format\n", sb->s_id);
++              goto failed;
++      }
++
++      ei = EXT3_I(alive_inode);
++      /*
++       * ext3 keeps i_data[] in on-disk (little-endian) byte order, so the
++       * block number has to be converted before use.
++       */
++      alive_block = le32_to_cpu(ei->i_data[0]);
++      iput(alive_inode);
++
++      /* an alive file without a first data block cannot be checked */
++      if (!alive_block) {
++              printk(KERN_ERR "Alive (device %s): "
++                      "alive file has no data block\n", sb->s_id);
++              goto failed;
++      }
++
++      pr_debug("Alive (device %s): read in alive block #%u\n",
++                      sb->s_id, alive_block);
++
++      /* first read */
++      bh = sb_bread(sb, alive_block);
++      if (!bh) {
++              printk(KERN_ERR "Alive (device %s): "
++                      "can't read alive block #%u\n", sb->s_id, alive_block);
++              goto failed;
++      }
++
++      alive = (struct alive_struct *)(bh->b_data);
++      if (le32_to_cpu(alive->al_magic) != ALIVE_MAGIC) {
++              printk(KERN_ERR "Alive (device %s): "
++                      "magic mismatch\n", sb->s_id);
++              goto failed_brelse;
++      }
++
++      seq = le32_to_cpu(alive->al_seq);
++      pr_debug("Alive (device %s): seq=%u\n", sb->s_id, seq);
++      /*
++       * al_nodename comes straight from disk and need not be terminated,
++       * so bound every print of it to the size of the field.
++       */
++      pr_info ("Alive (device %s): last touched by node: %.*s, "
++              "%li seconds ago\n", sb->s_id,
++              (int)sizeof(alive->al_nodename), alive->al_nodename,
++              (long)(get_seconds() - le32_to_cpu(alive->al_time)));
++
++      /* seq 0 is what kalived writes when it stops: no need to wait */
++      if (seq == 0)
++              goto skip;
++
++      /* wait 8s */
++      pr_info("Alive (device %s): wait for 8 seconds...\n", sb->s_id);
++      schedule_timeout_uninterruptible(HZ * 8);
++
++      /* read again; on failure read_alive_again() has released bh */
++      if (read_alive_again(bh)) {
++              printk(KERN_ERR "Alive (device %s): "
++                      "can't read alive block #%u\n",
++                      sb->s_id, alive_block);
++              goto failed;
++      }
++
++      alive = (struct alive_struct *)(bh->b_data);
++      pr_debug("Alive (device %s): seq=%u\n",
++              sb->s_id, le32_to_cpu(alive->al_seq));
++
++      if (seq != le32_to_cpu(alive->al_seq)) {
++              printk(KERN_WARNING "Alive (device %s): "
++                      "still active on node %.*s\n", sb->s_id,
++                      (int)sizeof(alive->al_nodename), alive->al_nodename);
++              goto failed_brelse;
++      }
++skip:
++      /* write a new random seq */
++      get_random_bytes(&seq, sizeof(u32));
++      alive->al_seq = cpu_to_le32(seq);
++      if (unlikely(write_alive(bh))) {
++              printk(KERN_ERR "Alive (device %s): "
++                      "can't write alive block\n", sb->s_id);
++              /* write_alive() keeps our reference: drop it here */
++              goto failed_brelse;
++      }
++      pr_debug("Alive (device %s): write random seq=%u\n", sb->s_id, seq);
++
++      /* wait 6s */
++      pr_info("Alive (device %s): wait for 6 seconds...\n", sb->s_id);
++      schedule_timeout_uninterruptible(HZ * 6);
++
++      /* read again; on failure read_alive_again() has released bh */
++      if (read_alive_again(bh)) {
++              printk(KERN_ERR "Alive (device %s): "
++                      "can't read alive block #%u\n",
++                      sb->s_id, alive_block);
++              goto failed;
++      }
++
++      alive = (struct alive_struct *)(bh->b_data);
++      pr_debug("Alive (device %s): seq=%u\n",
++              sb->s_id, le32_to_cpu(alive->al_seq));
++
++      if (seq != le32_to_cpu(alive->al_seq)) {
++              printk(KERN_WARNING "Alive (device %s): "
++                      "still active on node %.*s\n", sb->s_id,
++                      (int)sizeof(alive->al_nodename), alive->al_nodename);
++              goto failed_brelse;
++      }
++
++      /*
++       * kthread_run() reports failure with an ERR_PTR.  Keep that out of
++       * sbi->s_alive_tsk, which is later passed to kthread_stop() whenever
++       * it is non-NULL.
++       */
++      tsk = kthread_run(kalived, bh, "kalived");
++      if (IS_ERR(tsk)) {
++              printk(KERN_ERR "Alive (device %s): "
++                      "can't start kalived (%ld)\n",
++                      sb->s_id, PTR_ERR(tsk));
++              goto failed_brelse;
++      }
++      sbi->s_alive_tsk = tsk;
++
++      /* succeed */
++      pr_info("Alive (device %s): alive check passed!\n", sb->s_id);
++      return 0;
++
++failed_brelse:
++      brelse(bh);
++failed:
++      printk(KERN_WARNING "Alive (device %s): alive check failed!\n",
++              sb->s_id);
++      return 1;
++}
++
+ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
+ {
+@@ -1688,6 +1950,10 @@ static int ext3_fill_super (struct super
+                         EXT3_HAS_INCOMPAT_FEATURE(sb,
+                                   EXT3_FEATURE_INCOMPAT_RECOVER));
++      if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_ALIVE))
++              if (check_alive(sb, sbi))
++                      goto failed_mount2;
++
+       /*
+        * The first inode we look at is the journal inode.  Don't try
+        * root first: it may be modified in the journal!
+@@ -1796,6 +2062,8 @@ failed_mount3:
+       percpu_counter_destroy(&sbi->s_freeblocks_counter);
+       percpu_counter_destroy(&sbi->s_freeinodes_counter);
+       percpu_counter_destroy(&sbi->s_dirs_counter);
++      if (sbi->s_alive_tsk)
++              kthread_stop(sbi->s_alive_tsk);
+ failed_mount2:
+       for (i = 0; i < db_count; i++)
+               brelse(sbi->s_group_desc[i]);
+Index: mmp/include/linux/ext3_fs.h
+===================================================================
+--- mmp.orig/include/linux/ext3_fs.h   2006-07-18 20:43:51.000000000 +0800
++++ mmp/include/linux/ext3_fs.h        2006-07-18 20:43:52.000000000 +0800
+@@ -579,12 +579,14 @@ static inline struct ext3_inode_info *EX
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV     0x0008 /* Journal device */
+ #define EXT3_FEATURE_INCOMPAT_META_BG         0x0010
+ #define EXT3_FEATURE_INCOMPAT_EXTENTS         0x0040 /* extents support */
++#define EXT3_FEATURE_INCOMPAT_ALIVE           0x0080 /* ext3_fill_super() runs check_alive() */
+ #define EXT3_FEATURE_COMPAT_SUPP      EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT3_FEATURE_INCOMPAT_SUPP    (EXT3_FEATURE_INCOMPAT_FILETYPE| \
+                                        EXT3_FEATURE_INCOMPAT_RECOVER| \
+                                        EXT3_FEATURE_INCOMPAT_META_BG| \
+-                                       EXT3_FEATURE_INCOMPAT_EXTENTS)
++                                       EXT3_FEATURE_INCOMPAT_EXTENTS| \
++                                       EXT3_FEATURE_INCOMPAT_ALIVE)
+ #define EXT3_FEATURE_RO_COMPAT_SUPP   (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+                                        EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
+                                        EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
+Index: mmp/include/linux/ext3_fs_sb.h
+===================================================================
+--- mmp.orig/include/linux/ext3_fs_sb.h        2006-07-18 20:43:51.000000000 +0800
++++ mmp/include/linux/ext3_fs_sb.h     2006-07-18 20:43:52.000000000 +0800
+@@ -86,6 +86,7 @@ struct ext3_sb_info {
+       char *s_qf_names[MAXQUOTAS];            /* Names of quota files with journalled quota */
+       int s_jquota_fmt;                       /* Format of quota to use */
+ #endif
++      struct task_struct * s_alive_tsk;
+       /* for buddy allocator */
+       struct ext3_group_info **s_group_info;