Whamcloud - gitweb
* Removed the (new) tcp zero-copy patches
[fs/lustre-release.git] / lustre / kernel_patches / patches / ext3-multi-mount-protection-2.6-fc5.patch
1 Index: mmp/fs/ext3/al.h
2 ===================================================================
3 --- /dev/null   1970-01-01 00:00:00.000000000 +0000
4 +++ mmp/fs/ext3/al.h    2006-07-24 10:39:26.000000000 +0800
5 @@ -0,0 +1,11 @@
6 +/*
7 + * (C) 2006  Qi Yong <qiyong@clusterfs.com>
8 + */
9 +
10 +#define        ALIVE_MAGIC     0xA1153C29
11 +struct alive_struct {
12 +       __le32  al_magic;       /* ALIVE_MAGIC */
13 +       __le32  al_seq;         /* bumped by kalived; 0 once it has stopped */
14 +       __le32  al_time;        /* get_seconds() at the last update */
15 +       char    al_nodename[65];        /* utsname nodename of the writer */
16 +};
17 Index: mmp/fs/ext3/namei.c
18 ===================================================================
19 --- mmp.orig/fs/ext3/namei.c    2006-07-24 10:34:41.000000000 +0800
20 +++ mmp/fs/ext3/namei.c 2006-07-24 10:39:26.000000000 +0800
21 @@ -805,7 +805,7 @@ static inline int search_dirblock(struct
22   * The returned buffer_head has ->b_count elevated.  The caller is expected
23   * to brelse() it when appropriate.
24   */
25 -static struct buffer_head * ext3_find_entry (struct dentry *dentry,
26 +struct buffer_head * ext3_find_entry (struct dentry *dentry,
27                                         struct ext3_dir_entry_2 ** res_dir)
28  {
29         struct super_block * sb;
30 Index: mmp/fs/ext3/super.c
31 ===================================================================
32 --- mmp.orig/fs/ext3/super.c    2006-07-24 10:34:41.000000000 +0800
33 +++ mmp/fs/ext3/super.c 2006-07-24 10:45:19.000000000 +0800
34 @@ -36,12 +36,14 @@
35  #include <linux/namei.h>
36  #include <linux/quotaops.h>
37  #include <linux/seq_file.h>
38 +#include <linux/kthread.h>
39  
40  #include <asm/uaccess.h>
41  
42  #include "xattr.h"
43  #include "acl.h"
44  #include "namei.h"
45 +#include "al.h"
46  
47  static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
48                              unsigned long journal_devnum);
49 @@ -62,6 +64,8 @@ static int ext3_statfs (struct super_blo
50  static void ext3_unlockfs(struct super_block *sb);
51  static void ext3_write_super (struct super_block * sb);
52  static void ext3_write_super_lockfs(struct super_block *sb);
53 +struct buffer_head * ext3_find_entry (struct dentry *dentry,
54 +                                       struct ext3_dir_entry_2 ** res_dir);
55  
56  /* 
57   * Wrappers for journal_start/end.
58 @@ -435,6 +439,9 @@ static void ext3_put_super (struct super
59                 invalidate_bdev(sbi->journal_bdev, 0);
60                 ext3_blkdev_remove(sbi);
61         }
62 +       if (sbi->s_alive_tsk)
63 +               kthread_stop(sbi->s_alive_tsk);
64 +
65         sb->s_fs_info = NULL;
66         kfree(sbi);
67         return;
68 @@ -1369,6 +1376,261 @@ static unsigned long descriptor_loc(stru
69         return (first_data_block + has_super + (bg * sbi->s_blocks_per_group));
70  }
71  
72 +/* Write bh synchronously: 0 if it is uptodate afterwards, 1 otherwise. */
73 +static int write_alive(struct buffer_head * bh)
74 +{
75 +       lock_buffer(bh);
76 +       bh->b_end_io = end_buffer_write_sync;
77 +       get_bh(bh);
78 +       submit_bh(WRITE, bh);
79 +       wait_on_buffer(bh);
80 +
81 +       return unlikely(!buffer_uptodate(bh)) ? 1 : 0;
82 +}
83 +
84 +/* Read bh again; 0 if uptodate, otherwise brelse() bh and return 1. */
85 +static int read_alive_again(struct buffer_head * bh)
86 +{
87 +       lock_buffer(bh);
88 +       bh->b_end_io = end_buffer_read_sync;
89 +       get_bh(bh);
90 +       submit_bh(READ, bh);
91 +       wait_on_buffer(bh);
92 +       if (buffer_uptodate(bh))
93 +               return 0;
94 +       brelse(bh);
95 +       return 1;
96 +}
97 +
98 +/*
99 + * Heartbeat thread: data is the buffer_head of the alive block.  The caller
100 + * must hold a ref on it; that ref is dropped here when the thread stops.
101 + * Rewrites the block with a new al_seq/al_time about every 5 seconds and
102 + * resets al_seq to 0 on the way out.
103 + */
104 +static int kalived(void *data)
105 +{
106 +       struct buffer_head * bh = data;
107 +       struct alive_struct * alive = (struct alive_struct *)(bh->b_data);
108 +       char b[BDEVNAME_SIZE];
109 +       u32 seq = 0;
110 +
111 +       bdevname(bh->b_bdev, b);
112 +       alive->al_magic = cpu_to_le32(ALIVE_MAGIC);
113 +       down_read(&uts_sem);
114 +       memcpy(alive->al_nodename, system_utsname.nodename,
115 +               sizeof(alive->al_nodename));
116 +       up_read(&uts_sem);
117 +
118 +       while (!kthread_should_stop()) {
119 +               if (++seq == 0)         /* 0 is reserved for "stopped" */
120 +                       ++seq;
121 +
122 +               alive->al_seq = cpu_to_le32(seq);
123 +               alive->al_time = cpu_to_le32(get_seconds());
124 +
125 +               if (unlikely(write_alive(bh))) {
126 +                       printk(KERN_ERR "Alive (device %s): "
127 +                               "can't write alive block\n", b);
128 +                       /* retry soon, but don't spin on a dead device */
129 +                       schedule_timeout_interruptible(HZ);
130 +                       continue;
131 +               }
132 +
133 +               schedule_timeout_interruptible(5 * HZ);
134 +       }
135 +
136 +       alive->al_seq = 0;
137 +       alive->al_time = cpu_to_le32(get_seconds());
138 +
139 +       if (unlikely(write_alive(bh)))
140 +               printk(KERN_ERR "Alive (device %s): "
141 +                       "can't reset alive block\n", b);
142 +       brelse(bh);
143 +       return 0;
144 +}
145 +
146 +/*
147 + * Look up ".alive" in the root directory; return its inode number or 0.
148 + */
149 +static unsigned long get_alive_ino(struct super_block *sb)
150 +{
151 +       struct inode * root_inode = iget(sb, EXT3_ROOT_INO);
152 +       struct dentry * root_dentry = d_alloc_root(root_inode);
153 +       struct ext3_dir_entry_2 * entry;
154 +       struct buffer_head * dir_bh;
155 +       struct dentry probe;
156 +       unsigned long alive_ino;
157 +
158 +       if (!root_dentry) {
159 +               printk(KERN_ERR "Alive (device %s): get root inode failed\n",
160 +                       sb->s_id);
161 +               iput(root_inode);
162 +               return 0;
163 +       }
164 +
165 +       /* only the name and parent of the on-stack dentry are filled in */
166 +       probe.d_name.name = ".alive";
167 +       probe.d_name.len = 6;
168 +       probe.d_parent = root_dentry;
169 +
170 +       dir_bh = ext3_find_entry(&probe, &entry);
171 +       dput(root_dentry);
172 +       if (!dir_bh) {
173 +               printk(KERN_WARNING "Alive (device %s): alive lookup failed\n",
174 +                       sb->s_id);
175 +               return 0;
176 +       }
177 +
178 +       alive_ino = le32_to_cpu(entry->inode);
179 +       brelse(dir_bh);
180 +       pr_debug("Alive (device %s): alive_ino=%lu\n", sb->s_id, alive_ino);
181 +       return alive_ino;
182 +}
183 +
184 +/*
185 + * Re-read the alive block and compare al_seq with seq.  Returns 0 if it is
186 + * unchanged; otherwise logs why, releases bh and returns 1.
187 + */
188 +static int alive_seq_changed(struct super_block *sb, struct buffer_head *bh,
189 +                            u32 alive_block, u32 seq)
190 +{
191 +       struct alive_struct * alive = (struct alive_struct *)(bh->b_data);
192 +
193 +       if (read_alive_again(bh)) {
194 +               printk(KERN_ERR "Alive (device %s): "
195 +                       "can't read alive block #%u\n", sb->s_id, alive_block);
196 +               return 1;
197 +       }
198 +
199 +       pr_debug("Alive (device %s): seq=%u\n",
200 +               sb->s_id, le32_to_cpu(alive->al_seq));
201 +
202 +       if (seq != le32_to_cpu(alive->al_seq)) {
203 +               printk(KERN_WARNING "Alive (device %s): "
204 +                       "still active on node %.65s\n",
205 +                       sb->s_id, alive->al_nodename);
206 +               brelse(bh);
207 +               return 1;
208 +       }
209 +       return 0;
210 +}
211 +
212 +/*
213 + * Check the alive file before mounting.  If the alive block has a non-zero
214 + * al_seq, wait 8 seconds and fail if it changed.  Then write a random al_seq,
215 + * wait 6 seconds and fail if it was overwritten.  On success kalived is
216 + * started and takes over the reference on the alive block's buffer_head.
217 + * Returns 0 on success, 1 on failure.
218 + */
219 +static int check_alive(struct super_block *sb, struct ext3_sb_info *sbi)
220 +{
221 +       unsigned long           ino;
222 +       struct buffer_head      * bh;
223 +       struct inode            * alive_inode;
224 +       struct alive_struct     * alive;
225 +       struct task_struct      * tsk;
226 +       u32 alive_block, seq;
227 +
228 +       ino = get_alive_ino(sb);
229 +       if (!ino)
230 +               goto failed;
231 +
232 +       alive_inode = iget(sb, ino);
233 +       if (!alive_inode) {
234 +               printk(KERN_ERR "Alive (device %s): get alive inode failed\n",
235 +                       sb->s_id);
236 +               goto failed;
237 +       }
238 +       if (!alive_inode->i_nlink) {
239 +               make_bad_inode(alive_inode);
240 +               iput(alive_inode);
241 +               printk(KERN_ERR "Alive (device %s): alive inode is deleted\n",
242 +                       sb->s_id);
243 +               goto failed;
244 +       }
245 +       if (!S_ISREG(alive_inode->i_mode)) {
246 +               iput(alive_inode);
247 +               printk(KERN_ERR "Alive (device %s): invalid alive inode\n",
248 +                       sb->s_id);
249 +               goto failed;
250 +       }
251 +       if (EXT3_I(alive_inode)->i_flags & EXT3_EXTENTS_FL) {
252 +               iput(alive_inode);
253 +               printk(KERN_ERR "Alive (device %s): invalid alive inode, "
254 +                       "in extents format\n", sb->s_id);
255 +               goto failed;
256 +       }
257 +
258 +       /* i_data[] holds the on-disk (little-endian) block pointers */
259 +       alive_block = le32_to_cpu(EXT3_I(alive_inode)->i_data[0]);
260 +       iput(alive_inode);
261 +
262 +       pr_debug("Alive (device %s): read in alive block #%u\n",
263 +                       sb->s_id, alive_block);
264 +
265 +       bh = sb_bread(sb, alive_block);
266 +       if (!bh) {
267 +               printk(KERN_ERR "Alive (device %s): "
268 +                       "can't read alive block #%u\n", sb->s_id, alive_block);
269 +               goto failed;
270 +       }
271 +
272 +       alive = (struct alive_struct *)(bh->b_data);
273 +       if (le32_to_cpu(alive->al_magic) != ALIVE_MAGIC) {
274 +               printk(KERN_ERR "Alive (device %s): "
275 +                       "magic mismatch\n", sb->s_id);
276 +               goto failed_brelse;
277 +       }
278 +
279 +       seq = le32_to_cpu(alive->al_seq);
280 +       pr_debug("Alive (device %s): seq=%u\n", sb->s_id, seq);
281 +       pr_info ("Alive (device %s): last touched by node: %.65s, "
282 +               "%li seconds ago\n", sb->s_id, alive->al_nodename,
283 +               get_seconds() - le32_to_cpu(alive->al_time));
284 +
285 +       /* al_seq == 0 is what kalived writes on stop: skip the first wait */
286 +       if (seq != 0) {
287 +               pr_info("Alive (device %s): wait for 8 seconds...\n", sb->s_id);
288 +               schedule_timeout_uninterruptible(HZ * 8);
289 +               if (alive_seq_changed(sb, bh, alive_block, seq))
290 +                       goto failed;
291 +       }
292 +
293 +       /* write a new random seq */
294 +       get_random_bytes(&seq, sizeof(u32));
295 +       alive->al_seq = cpu_to_le32(seq);
296 +       if (unlikely(write_alive(bh))) {
297 +               printk(KERN_ERR "Alive (device %s): "
298 +                       "can't write alive block\n", sb->s_id);
299 +               goto failed_brelse;
300 +       }
301 +       pr_debug("Alive (device %s): write random seq=%u\n", sb->s_id, seq);
302 +
303 +       pr_info("Alive (device %s): wait for 6 seconds...\n", sb->s_id);
304 +       schedule_timeout_uninterruptible(HZ * 6);
305 +       if (alive_seq_changed(sb, bh, alive_block, seq))
306 +               goto failed;
307 +
308 +       /* kalived takes over the bh ref; never store an ERR_PTR in sbi */
309 +       tsk = kthread_run(kalived, bh, "kalived");
310 +       if (IS_ERR(tsk)) {
311 +               printk(KERN_ERR "Alive (device %s): "
312 +                       "can't start kalived\n", sb->s_id);
313 +               goto failed_brelse;
314 +       }
315 +       sbi->s_alive_tsk = tsk;
316 +       pr_info("Alive (device %s): alive check passed!\n", sb->s_id);
317 +       return 0;
318 +
319 +failed_brelse:
320 +       brelse(bh);
321 +failed:
322 +       printk(KERN_WARNING "Alive (device %s): alive check failed!\n",
323 +               sb->s_id);
324 +       return 1;
325 +}
326 +
327  
328  static int ext3_fill_super (struct super_block *sb, void *data, int silent)
329  {
330 @@ -1668,6 +1930,10 @@ static int ext3_fill_super (struct super
331                           EXT3_HAS_INCOMPAT_FEATURE(sb,
332                                     EXT3_FEATURE_INCOMPAT_RECOVER));
333  
334 +       if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_ALIVE))
335 +               if (check_alive(sb, sbi))
336 +                       goto failed_mount2;
337 +
338         /*
339          * The first inode we look at is the journal inode.  Don't try
340          * root first: it may be modified in the journal!
341 @@ -1785,6 +2051,8 @@ cantfind_ext3:
342  
343  failed_mount3:
344         journal_destroy(sbi->s_journal);
345  failed_mount2:
346 +       if (sbi->s_alive_tsk)   /* kalived started by check_alive() */
347 +               kthread_stop(sbi->s_alive_tsk);
348         for (i = 0; i < db_count; i++)
349                 brelse(sbi->s_group_desc[i]);
350 Index: mmp/include/linux/ext3_fs.h
351 ===================================================================
352 --- mmp.orig/include/linux/ext3_fs.h    2006-07-24 10:34:41.000000000 +0800
353 +++ mmp/include/linux/ext3_fs.h 2006-07-24 10:39:26.000000000 +0800
354 @@ -581,12 +581,14 @@ static inline struct ext3_inode_info *EX
355  #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV      0x0008 /* Journal device */
356  #define EXT3_FEATURE_INCOMPAT_META_BG          0x0010
357  #define EXT3_FEATURE_INCOMPAT_EXTENTS          0x0040 /* extents support */
358 +#define EXT3_FEATURE_INCOMPAT_ALIVE            0x0080
359  
360  #define EXT3_FEATURE_COMPAT_SUPP       EXT2_FEATURE_COMPAT_EXT_ATTR
361  #define EXT3_FEATURE_INCOMPAT_SUPP     (EXT3_FEATURE_INCOMPAT_FILETYPE| \
362                                          EXT3_FEATURE_INCOMPAT_RECOVER| \
363                                          EXT3_FEATURE_INCOMPAT_META_BG| \
364 -                                        EXT3_FEATURE_INCOMPAT_EXTENTS)
365 +                                        EXT3_FEATURE_INCOMPAT_EXTENTS| \
366 +                                        EXT3_FEATURE_INCOMPAT_ALIVE)
367  #define EXT3_FEATURE_RO_COMPAT_SUPP    (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
368                                          EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
369                                          EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
370 Index: mmp/include/linux/ext3_fs_sb.h
371 ===================================================================
372 --- mmp.orig/include/linux/ext3_fs_sb.h 2006-07-24 10:34:41.000000000 +0800
373 +++ mmp/include/linux/ext3_fs_sb.h      2006-07-24 10:39:26.000000000 +0800
374 @@ -86,6 +86,7 @@ struct ext3_sb_info {
375         char *s_qf_names[MAXQUOTAS];            /* Names of quota files with journalled quota */
376         int s_jquota_fmt;                       /* Format of quota to use */
377  #endif
378 +       struct task_struct * s_alive_tsk;
379  
380         /* for buddy allocator */
381         struct ext3_group_info **s_group_info;