Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-+ Opt_extents, Opt_extdebug,
++ Opt_extents, Opt_noextents, Opt_extdebug,
};
static match_table_t tokens = {
-@@ -644,6 +647,8 @@
+@@ -644,6 +647,9 @@
{Opt_iopen, "iopen"},
{Opt_noiopen, "noiopen"},
{Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_extents, "extents"},
++ {Opt_noextents, "noextents"},
+ {Opt_extdebug, "extdebug"},
{Opt_barrier, "barrier=%u"},
{Opt_err, NULL},
{Opt_resize, "resize"},
-@@ -953,6 +958,12 @@
+@@ -953,6 +958,15 @@
case Opt_nobh:
set_opt(sbi->s_mount_opt, NOBH);
break;
+ case Opt_extents:
+ set_opt (sbi->s_mount_opt, EXTENTS);
+ break;
++ case Opt_noextents:
++ clear_opt (sbi->s_mount_opt, EXTENTS);
++ break;
+ case Opt_extdebug:
+ set_opt (sbi->s_mount_opt, EXTDEBUG);
+ break;
Opt_ignore, Opt_barrier,
Opt_err,
Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-+ Opt_extents, Opt_extdebug,
++ Opt_extents, Opt_noextents, Opt_extdebug,
};
static match_table_t tokens = {
-@@ -582,6 +585,8 @@
+@@ -582,6 +585,9 @@
{Opt_iopen, "iopen"},
{Opt_noiopen, "noiopen"},
{Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_extents, "extents"},
++ {Opt_noextents, "noextents"},
+ {Opt_extdebug, "extdebug"},
{Opt_barrier, "barrier=%u"},
{Opt_err, NULL}
};
-@@ -797,6 +802,12 @@
+@@ -797,6 +802,15 @@
break;
case Opt_ignore:
break;
+ case Opt_extents:
+ set_opt (sbi->s_mount_opt, EXTENTS);
+ break;
++ case Opt_noextents:
++ clear_opt (sbi->s_mount_opt, EXTENTS);
++ break;
+ case Opt_extdebug:
+ set_opt (sbi->s_mount_opt, EXTDEBUG);
+ break;
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-+ Opt_extents, Opt_extdebug,
++ Opt_extents, Opt_noextents, Opt_extdebug,
};
static match_table_t tokens = {
-@@ -639,6 +644,8 @@
+@@ -639,6 +644,9 @@
{Opt_iopen, "iopen"},
{Opt_noiopen, "noiopen"},
{Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_extents, "extents"},
++ {Opt_noextents, "noextents"},
+ {Opt_extdebug, "extdebug"},
{Opt_barrier, "barrier=%u"},
{Opt_err, NULL},
{Opt_resize, "resize"},
-@@ -943,6 +950,12 @@
+@@ -943,6 +950,15 @@
match_int(&args[0], &option);
*n_blocks_count = option;
break;
+ case Opt_extents:
+ set_opt (sbi->s_mount_opt, EXTENTS);
+ break;
++ case Opt_noextents:
++ clear_opt (sbi->s_mount_opt, EXTENTS);
++ break;
+ case Opt_extdebug:
+ set_opt (sbi->s_mount_opt, EXTDEBUG);
+ break;
extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
- unsigned long);
+ unsigned long, int);
-+extern void ext3_free_blocks_old (handle_t *, struct inode *, unsigned long,
-+ unsigned long);
++extern void ext3_free_blocks_old(handle_t *, struct inode *, unsigned long,
++ unsigned long);
extern unsigned long ext3_count_free_blocks (struct super_block *);
extern void ext3_check_blocks_bitmap (struct super_block *);
extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
/*
* third extended-fs super-block data in memory
-@@ -78,6 +84,38 @@ struct ext3_sb_info {
+@@ -78,6 +84,43 @@ struct ext3_sb_info {
struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
#endif
+
+ /* for buddy allocator */
-+ struct ext3_group_info **s_group_info;
++ struct ext3_group_info ***s_group_info;
+ struct inode *s_buddy_cache;
+ long s_blocks_reserved;
+ spinlock_t s_reserve_lock;
+ tid_t s_last_transaction;
+ int s_mb_factor;
+ unsigned short *s_mb_offsets, *s_mb_maxs;
++ unsigned long s_stripe;
+
+ /* history to debug policy */
+ struct ext3_mb_history *s_mb_history;
+ unsigned long s_mb_buddies_generated;
+ unsigned long long s_mb_generation_time;
};
++
++#define EXT3_GROUP_INFO(sb, group) \
++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \
++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)]
#endif /* _LINUX_EXT3_FS_SB */
Index: linux-2.6.5-7.252-full/fs/ext3/super.c
ext3_ext_release(sb);
ext3_xattr_put_super(sb);
journal_destroy(sbi->s_journal);
-@@ -545,7 +546,7 @@ enum {
- Opt_ignore, Opt_barrier,
+@@ -545,6 +546,7 @@ enum {
Opt_err,
Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-- Opt_extents, Opt_extdebug,
-+ Opt_extents, Opt_extdebug, Opt_mballoc,
+ Opt_extents, Opt_noextents, Opt_extdebug,
++ Opt_mballoc, Opt_nomballoc, Opt_stripe,
};
static match_table_t tokens = {
-@@ -591,6 +592,7 @@ static match_table_t tokens = {
- {Opt_iopen_nopriv, "iopen_nopriv"},
+@@ -591,6 +592,9 @@ static match_table_t tokens = {
{Opt_extents, "extents"},
+ {Opt_noextents, "noextents"},
{Opt_extdebug, "extdebug"},
+ {Opt_mballoc, "mballoc"},
++ {Opt_nomballoc, "nomballoc"},
++ {Opt_stripe, "stripe=%u"},
{Opt_barrier, "barrier=%u"},
{Opt_err, NULL}
};
-@@ -813,6 +815,9 @@ static int parse_options (char * options
+@@ -813,6 +815,19 @@ static int parse_options (char * options
case Opt_extdebug:
set_opt (sbi->s_mount_opt, EXTDEBUG);
break;
+ case Opt_mballoc:
-+ set_opt (sbi->s_mount_opt, MBALLOC);
++ set_opt(sbi->s_mount_opt, MBALLOC);
++ break;
++ case Opt_nomballoc:
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ break;
++ case Opt_stripe:
++ if (match_int(&args[0], &option))
++ return 0;
++ if (option < 0)
++ return 0;
++ sbi->s_stripe = option;
+ break;
default:
printk (KERN_ERR
===================================================================
--- linux-2.6.5-7.252-full.orig/fs/ext3/mballoc.c 2006-04-22 17:31:47.543334750 +0400
+++ linux-2.6.5-7.252-full/fs/ext3/mballoc.c 2006-04-26 23:42:45.000000000 +0400
-@@ -0,0 +1,2616 @@
+@@ -0,0 +1,2703 @@
+/*
+ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+
+ /* search goals */
+ struct ext3_free_extent ac_g_ex;
-+
++
+ /* the best found extent */
+ struct ext3_free_extent ac_b_ex;
-+
++
+ /* number of iterations done. we have to track to limit searching */
+ unsigned long ac_ex_scanned;
+ __u16 ac_groups_scanned;
+ if (mb_check_counter++ % 300 != 0)
+ return;
+ }
-+
++
+ while (order > 1) {
+ buddy = mb_find_buddy(e3b, order, &max);
+ J_ASSERT(buddy);
+ sb = inode->i_sb;
+ blocksize = 1 << inode->i_blkbits;
+ blocks_per_page = PAGE_CACHE_SIZE / blocksize;
-+
++
+ groups_per_page = blocks_per_page >> 1;
+ if (groups_per_page == 0)
+ groups_per_page = 1;
+ memset(bh, 0, i);
+ } else
+ bh = &bhs;
-+
++
+ first_group = page->index * blocks_per_page / 2;
-+
++
+ /* read all groups the page covers into the cache */
+ for (i = 0; i < groups_per_page; i++) {
+ struct ext3_group_desc * desc;
+ mb_debug("put buddy for group %u in page %lu/%x\n",
+ group, page->index, i * blocksize);
+ memset(data, 0xff, blocksize);
-+ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0;
-+ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0,
++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0;
++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0,
+ sizeof(unsigned short)*(sb->s_blocksize_bits+2));
+ ext3_mb_generate_buddy(sb, data, bitmap,
-+ EXT3_SB(sb)->s_group_info[group]);
++ EXT3_GROUP_INFO(sb, group));
+ } else {
+ /* this is block of bitmap */
+ mb_debug("put bitmap for group %u in page %lu/%x\n",
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+
+ e3b->bd_blkbits = sb->s_blocksize_bits;
-+ e3b->bd_info = sbi->s_group_info[group];
++ e3b->bd_info = EXT3_GROUP_INFO(sb, group);
+ e3b->bd_sb = sb;
+ e3b->bd_group = group;
+ e3b->bd_buddy_page = NULL;
+ext3_lock_group(struct super_block *sb, int group)
+{
+ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT,
-+ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++ &EXT3_GROUP_INFO(sb, group)->bb_state);
+}
+
+static inline void
+ext3_unlock_group(struct super_block *sb, int group)
+{
+ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT,
-+ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++ &EXT3_GROUP_INFO(sb, group)->bb_state);
+}
+
+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
+
+ ext3_lock_group(ac->ac_sb, group);
+ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex);
-+
++
+ if (max > 0) {
+ ac->ac_b_ex = ex;
+ ext3_mb_use_best_found(ac, e3b);
+ struct ext3_buddy *e3b)
+{
+ int group = ac->ac_g_ex.fe_group, max, err;
++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
++ struct ext3_super_block *es = sbi->s_es;
+ struct ext3_free_extent ex;
+
+ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
+
+ ext3_lock_group(ac->ac_sb, group);
+ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start,
-+ ac->ac_g_ex.fe_len, &ex);
-+
-+ if (max >= ac->ac_g_ex.fe_len) {
++ ac->ac_g_ex.fe_len, &ex);
++
++ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
++ unsigned long start;
++ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) +
++ ex.fe_start + le32_to_cpu(es->s_first_data_block));
++ if (start % sbi->s_stripe == 0) {
++ ac->ac_found++;
++ ac->ac_b_ex = ex;
++ ext3_mb_use_best_found(ac, e3b);
++ }
++ } else if (max >= ac->ac_g_ex.fe_len) {
+ J_ASSERT(ex.fe_len > 0);
+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group);
+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start);
+ }
+}
+
++/*
++ * This is a special case for storages like raid5
++ * we try to find stripe-aligned chunks for stripe-size requests
++ */
++static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ struct super_block *sb = ac->ac_sb;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ void *bitmap = EXT3_MB_BITMAP(e3b);
++ struct ext3_free_extent ex;
++ unsigned long i, max;
++
++ J_ASSERT(sbi->s_stripe != 0);
++
++ /* find first stripe-aligned block */
++ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb)
++ + le32_to_cpu(sbi->s_es->s_first_data_block);
++ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe;
++ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block))
++ % EXT3_BLOCKS_PER_GROUP(sb);
++
++ while (i < sb->s_blocksize * 8) {
++ if (!mb_test_bit(i, bitmap)) {
++ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex);
++ if (max >= sbi->s_stripe) {
++ ac->ac_found++;
++ ac->ac_b_ex = ex;
++ ext3_mb_use_best_found(ac, e3b);
++ break;
++ }
++ }
++ i += sbi->s_stripe;
++ }
++}
++
+static int ext3_mb_good_group(struct ext3_allocation_context *ac,
+ int group, int cr)
+{
-+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
-+ struct ext3_group_info *grp = sbi->s_group_info[group];
++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group);
+ unsigned free, fragments, i, bits;
+
+ J_ASSERT(cr >= 0 && cr < 4);
+ ac.ac_2order = 0;
+ ac.ac_criteria = 0;
+
++ if (*len == 1 && sbi->s_stripe) {
++ /* looks like a metadata, let's use a dirty hack for raid5
++ * move all metadata in first groups in hope to hit cached
++ * sectors and thus avoid read-modify cycles in raid5 */
++ ac.ac_g_ex.fe_group = group = 0;
++ }
++
+ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */
+ i = ffs(*len);
+ if (i >= ext3_mb_order2_reqs) {
+ if (group == EXT3_SB(sb)->s_groups_count)
+ group = 0;
+
-+ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) {
++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) {
+ /* we need full data about the group
+ * to make a good selection */
+ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
+ ac.ac_groups_scanned++;
+ if (cr == 0)
+ ext3_mb_simple_scan_group(&ac, &e3b);
++ else if (cr == 1 && *len == sbi->s_stripe)
++ ext3_mb_scan_aligned(&ac, &e3b);
+ else
+ ext3_mb_complex_scan_group(&ac, &e3b);
+
+ }
+
+ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND &&
-+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
+ /*
+ * We've been searching too long. Let's try to allocate
+ * the best chunk we've found so far
+ sbi->s_blocks_reserved, ac.ac_found);
+ printk("EXT3-fs: groups: ");
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
-+ printk("%d: %d ", i,
-+ sbi->s_group_info[i]->bb_free);
++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free);
+ printk("\n");
+#endif
+ goto out;
+ *errp = -EIO;
+ goto out_err;
+ }
-+
++
+ err = ext3_journal_get_write_access(handle, gdp_bh);
+ if (err)
+ goto out_err;
+ * path only, here is single block always */
+ ext3_mb_release_blocks(sb, 1);
+ }
-+
++
+ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) {
+ atomic_inc(&sbi->s_bal_reqs);
+ atomic_add(*len, &sbi->s_bal_allocated);
+ s->max = sbi->s_mb_history_max;
+ s->start = sbi->s_mb_history_cur % s->max;
+ spin_unlock(&sbi->s_mb_history_lock);
-+
++
+ rc = seq_open(file, &ext3_mb_seq_history_ops);
+ if (rc == 0) {
+ struct seq_file *m = (struct seq_file *)file->private_data;
+
+static struct file_operations ext3_mb_seq_history_fops = {
+ .owner = THIS_MODULE,
-+ .open = ext3_mb_seq_history_open,
-+ .read = seq_read,
-+ .llseek = seq_lseek,
-+ .release = ext3_mb_seq_history_release,
++ .open = ext3_mb_seq_history_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = ext3_mb_seq_history_release,
+};
+
+static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
+ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
+ sizeof(struct ext3_group_info);
+ ext3_lock_group(sb, group);
-+ memcpy(&sg, sbi->s_group_info[group], i);
++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i);
+ ext3_unlock_group(sb, group);
+
+ if (EXT3_MB_GRP_NEED_INIT(&sg.info))
+
+static struct file_operations ext3_mb_seq_groups_fops = {
+ .owner = THIS_MODULE,
-+ .open = ext3_mb_seq_groups_open,
-+ .read = seq_read,
-+ .llseek = seq_lseek,
-+ .release = seq_release,
++ .open = ext3_mb_seq_groups_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = seq_release,
+};
+
+static void ext3_mb_history_release(struct super_block *sb)
+int ext3_mb_init_backend(struct super_block *sb)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int i, len;
-+
-+ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count;
-+ sbi->s_group_info = kmalloc(len, GFP_KERNEL);
++ int i, j, len, metalen;
++ int num_meta_group_infos =
++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >>
++ EXT3_DESC_PER_BLOCK_BITS(sb);
++ struct ext3_group_info **meta_group_info;
++
++ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
++ * So a two level scheme suffices for now. */
++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
++ num_meta_group_infos, GFP_KERNEL);
+ if (sbi->s_group_info == NULL) {
-+ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n");
++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n");
+ return -ENOMEM;
+ }
-+ memset(sbi->s_group_info, 0, len);
-+
+ sbi->s_buddy_cache = new_inode(sb);
+ if (sbi->s_buddy_cache == NULL) {
+ printk(KERN_ERR "EXT3-fs: can't get new inode\n");
-+ kfree(sbi->s_group_info);
-+ return -ENOMEM;
++ goto err_freesgi;
++ }
++
++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb);
++ for (i = 0; i < num_meta_group_infos; i++) {
++ if ((i + 1) == num_meta_group_infos)
++ metalen = sizeof(*meta_group_info) *
++ (sbi->s_groups_count -
++ (i << EXT3_DESC_PER_BLOCK_BITS(sb)));
++ meta_group_info = kmalloc(metalen, GFP_KERNEL);
++ if (meta_group_info == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a "
++ "buddy group\n");
++ goto err_freemeta;
++ }
++ sbi->s_group_info[i] = meta_group_info;
+ }
+
+ /*
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ struct ext3_group_desc * desc;
+
-+ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL);
-+ if (sbi->s_group_info[i] == NULL) {
++ meta_group_info =
++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)];
++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1);
++
++ meta_group_info[j] = kmalloc(len, GFP_KERNEL);
++ if (meta_group_info[j] == NULL) {
+ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n");
-+ goto err_out;
++ i--;
++ goto err_freebuddy;
+ }
+ desc = ext3_get_group_desc(sb, i, NULL);
+ if (desc == NULL) {
+ printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i);
-+ goto err_out;
++ goto err_freebuddy;
+ }
-+ memset(sbi->s_group_info[i], 0, len);
++ memset(meta_group_info[j], 0, len);
+ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT,
-+ &sbi->s_group_info[i]->bb_state);
-+ sbi->s_group_info[i]->bb_free =
++ &meta_group_info[j]->bb_state);
++ meta_group_info[j]->bb_free =
+ le16_to_cpu(desc->bg_free_blocks_count);
+ }
+
+ return 0;
+
-+err_out:
++err_freebuddy:
++ while (i >= 0) {
++ kfree(EXT3_GROUP_INFO(sb, i));
++ i--;
++ }
++ i = num_meta_group_infos;
++err_freemeta:
+ while (--i >= 0)
+ kfree(sbi->s_group_info[i]);
+ iput(sbi->s_buddy_cache);
-+
++err_freesgi:
++ kfree(sbi->s_group_info);
+ return -ENOMEM;
+}
+
+ max = max >> 1;
+ i++;
+ } while (i <= sb->s_blocksize_bits + 1);
-+
++
+
+ /* init file for buddy data */
+ if ((i = ext3_mb_init_backend(sb))) {
+int ext3_mb_release(struct super_block *sb)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int i;
-+
++ int i, num_meta_group_infos;
++
+ if (!test_opt(sb, MBALLOC))
+ return 0;
+
+ ext3_mb_free_committed_blocks(sb);
+
+ if (sbi->s_group_info) {
-+ for (i = 0; i < sbi->s_groups_count; i++) {
-+ if (sbi->s_group_info[i] == NULL)
-+ continue;
++ for (i = 0; i < sbi->s_groups_count; i++)
++ kfree(EXT3_GROUP_INFO(sb, i));
++ num_meta_group_infos = (sbi->s_groups_count +
++ EXT3_DESC_PER_BLOCK(sb) - 1) >>
++ EXT3_DESC_PER_BLOCK_BITS(sb);
++ for (i = 0; i < num_meta_group_infos; i++)
+ kfree(sbi->s_group_info[i]);
-+ }
+ kfree(sbi->s_group_info);
+ }
+ if (sbi->s_mb_offsets)
+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
+ spin_unlock(sb_bgl_lock(sbi, block_group));
+ percpu_counter_mod(&sbi->s_freeblocks_counter, count);
-+
++
+ ext3_mb_release_desc(&e3b);
+
+ *freed = count;
+ return;
+}
+
-+#define EXT3_ROOT "ext3"
-+#define EXT3_MB_STATS_NAME "mb_stats"
++#define EXT3_ROOT "ext3"
++#define EXT3_MB_STATS_NAME "mb_stats"
+#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan"
+#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan"
-+#define EXT3_MB_ORDER2_REQ "mb_order2_req"
++#define EXT3_MB_ORDER2_REQ "mb_order2_req"
+
+static int ext3_mb_stats_read(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
/*
* third extended-fs super-block data in memory
-@@ -78,6 +84,38 @@ struct ext3_sb_info {
+@@ -78,6 +84,43 @@ struct ext3_sb_info {
char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
int s_jquota_fmt; /* Format of quota to use */
#endif
+
+ /* for buddy allocator */
-+ struct ext3_group_info **s_group_info;
++ struct ext3_group_info ***s_group_info;
+ struct inode *s_buddy_cache;
+ long s_blocks_reserved;
+ spinlock_t s_reserve_lock;
+ tid_t s_last_transaction;
+ int s_mb_factor;
+ unsigned short *s_mb_offsets, *s_mb_maxs;
++ unsigned long s_stripe;
+
+ /* history to debug policy */
+ struct ext3_mb_history *s_mb_history;
+ unsigned long s_mb_buddies_generated;
+ unsigned long long s_mb_generation_time;
};
++
++#define EXT3_GROUP_INFO(sb, group) \
++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \
++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)]
#endif /* _LINUX_EXT3_FS_SB */
Index: linux-2.6.12.6-bull/fs/ext3/super.c
ext3_ext_release(sb);
ext3_xattr_put_super(sb);
journal_destroy(sbi->s_journal);
-@@ -597,7 +598,7 @@ enum {
- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
+@@ -597,6 +598,7 @@ enum {
Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-- Opt_extents, Opt_extdebug,
-+ Opt_extents, Opt_extdebug, Opt_mballoc,
+ Opt_extents, Opt_noextents, Opt_extdebug,
++ Opt_mballoc, Opt_nomballoc, Opt_stripe,
};
static match_table_t tokens = {
-@@ -650,6 +651,7 @@ static match_table_t tokens = {
- {Opt_iopen_nopriv, "iopen_nopriv"},
+@@ -650,6 +651,9 @@ static match_table_t tokens = {
{Opt_extents, "extents"},
+ {Opt_noextents, "noextents"},
{Opt_extdebug, "extdebug"},
+ {Opt_mballoc, "mballoc"},
++ {Opt_nomballoc, "nomballoc"},
++ {Opt_stripe, "stripe=%u"},
{Opt_barrier, "barrier=%u"},
{Opt_err, NULL},
{Opt_resize, "resize"},
-@@ -965,6 +967,9 @@ clear_qf_name:
+@@ -965,6 +967,19 @@ clear_qf_name:
case Opt_extdebug:
set_opt (sbi->s_mount_opt, EXTDEBUG);
break;
+ case Opt_mballoc:
-+ set_opt (sbi->s_mount_opt, MBALLOC);
++ set_opt(sbi->s_mount_opt, MBALLOC);
++ break;
++ case Opt_nomballoc:
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ break;
++ case Opt_stripe:
++ if (match_int(&args[0], &option))
++ return 0;
++ if (option < 0)
++ return 0;
++ sbi->s_stripe = option;
+ break;
default:
printk (KERN_ERR
===================================================================
--- linux-2.6.12.6-bull.orig/fs/ext3/mballoc.c 2006-04-22 17:31:47.543334750 +0400
+++ linux-2.6.12.6-bull/fs/ext3/mballoc.c 2006-04-30 01:24:11.000000000 +0400
-@@ -0,0 +1,2615 @@
+@@ -0,0 +1,2702 @@
+/*
+ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+
+ /* search goals */
+ struct ext3_free_extent ac_g_ex;
-+
++
+ /* the best found extent */
+ struct ext3_free_extent ac_b_ex;
-+
++
+ /* number of iterations done. we have to track to limit searching */
+ unsigned long ac_ex_scanned;
+ __u16 ac_groups_scanned;
+ if (mb_check_counter++ % 300 != 0)
+ return;
+ }
-+
++
+ while (order > 1) {
+ buddy = mb_find_buddy(e3b, order, &max);
+ J_ASSERT(buddy);
+ sb = inode->i_sb;
+ blocksize = 1 << inode->i_blkbits;
+ blocks_per_page = PAGE_CACHE_SIZE / blocksize;
-+
++
+ groups_per_page = blocks_per_page >> 1;
+ if (groups_per_page == 0)
+ groups_per_page = 1;
+ memset(bh, 0, i);
+ } else
+ bh = &bhs;
-+
++
+ first_group = page->index * blocks_per_page / 2;
-+
++
+ /* read all groups the page covers into the cache */
+ for (i = 0; i < groups_per_page; i++) {
+ struct ext3_group_desc * desc;
+ mb_debug("put buddy for group %u in page %lu/%x\n",
+ group, page->index, i * blocksize);
+ memset(data, 0xff, blocksize);
-+ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0;
-+ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0,
++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0;
++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0,
+ sizeof(unsigned short)*(sb->s_blocksize_bits+2));
+ ext3_mb_generate_buddy(sb, data, bitmap,
-+ EXT3_SB(sb)->s_group_info[group]);
++ EXT3_GROUP_INFO(sb, group));
+ } else {
+ /* this is block of bitmap */
+ mb_debug("put bitmap for group %u in page %lu/%x\n",
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+
+ e3b->bd_blkbits = sb->s_blocksize_bits;
-+ e3b->bd_info = sbi->s_group_info[group];
++ e3b->bd_info = EXT3_GROUP_INFO(sb, group);
+ e3b->bd_sb = sb;
+ e3b->bd_group = group;
+ e3b->bd_buddy_page = NULL;
+ext3_lock_group(struct super_block *sb, int group)
+{
+ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT,
-+ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++ &EXT3_GROUP_INFO(sb, group)->bb_state);
+}
+
+static inline void
+ext3_unlock_group(struct super_block *sb, int group)
+{
+ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT,
-+ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++ &EXT3_GROUP_INFO(sb, group)->bb_state);
+}
+
+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
+
+ ext3_lock_group(ac->ac_sb, group);
+ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex);
-+
++
+ if (max > 0) {
+ ac->ac_b_ex = ex;
+ ext3_mb_use_best_found(ac, e3b);
+ struct ext3_buddy *e3b)
+{
+ int group = ac->ac_g_ex.fe_group, max, err;
++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
++ struct ext3_super_block *es = sbi->s_es;
+ struct ext3_free_extent ex;
+
+ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
+
+ ext3_lock_group(ac->ac_sb, group);
+ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start,
-+ ac->ac_g_ex.fe_len, &ex);
-+
-+ if (max >= ac->ac_g_ex.fe_len) {
++ ac->ac_g_ex.fe_len, &ex);
++
++ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
++ unsigned long start;
++ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) +
++ ex.fe_start + le32_to_cpu(es->s_first_data_block));
++ if (start % sbi->s_stripe == 0) {
++ ac->ac_found++;
++ ac->ac_b_ex = ex;
++ ext3_mb_use_best_found(ac, e3b);
++ }
++ } else if (max >= ac->ac_g_ex.fe_len) {
+ J_ASSERT(ex.fe_len > 0);
+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group);
+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start);
+ }
+}
+
++/*
++ * This is a special case for storages like raid5
++ * we try to find stripe-aligned chunks for stripe-size requests
++ */
++static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ struct super_block *sb = ac->ac_sb;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ void *bitmap = EXT3_MB_BITMAP(e3b);
++ struct ext3_free_extent ex;
++ unsigned long i, max;
++
++ J_ASSERT(sbi->s_stripe != 0);
++
++ /* find first stripe-aligned block */
++ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb)
++ + le32_to_cpu(sbi->s_es->s_first_data_block);
++ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe;
++ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block))
++ % EXT3_BLOCKS_PER_GROUP(sb);
++
++ while (i < sb->s_blocksize * 8) {
++ if (!mb_test_bit(i, bitmap)) {
++ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex);
++ if (max >= sbi->s_stripe) {
++ ac->ac_found++;
++ ac->ac_b_ex = ex;
++ ext3_mb_use_best_found(ac, e3b);
++ break;
++ }
++ }
++ i += sbi->s_stripe;
++ }
++}
++
+static int ext3_mb_good_group(struct ext3_allocation_context *ac,
+ int group, int cr)
+{
-+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
-+ struct ext3_group_info *grp = sbi->s_group_info[group];
++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group);
+ unsigned free, fragments, i, bits;
+
+ J_ASSERT(cr >= 0 && cr < 4);
+ ac.ac_2order = 0;
+ ac.ac_criteria = 0;
+
++ if (*len == 1 && sbi->s_stripe) {
++ /* looks like a metadata, let's use a dirty hack for raid5
++ * move all metadata in first groups in hope to hit cached
++ * sectors and thus avoid read-modify cycles in raid5 */
++ ac.ac_g_ex.fe_group = group = 0;
++ }
++
+ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */
+ i = ffs(*len);
+ if (i >= ext3_mb_order2_reqs) {
+ if (group == EXT3_SB(sb)->s_groups_count)
+ group = 0;
+
-+ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) {
++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) {
+ /* we need full data about the group
+ * to make a good selection */
+ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
+ ac.ac_groups_scanned++;
+ if (cr == 0)
+ ext3_mb_simple_scan_group(&ac, &e3b);
++ else if (cr == 1 && *len == sbi->s_stripe)
++ ext3_mb_scan_aligned(&ac, &e3b);
+ else
+ ext3_mb_complex_scan_group(&ac, &e3b);
+
+ }
+
+ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND &&
-+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
+ /*
+ * We've been searching too long. Let's try to allocate
+ * the best chunk we've found so far
+ sbi->s_blocks_reserved, ac.ac_found);
+ printk("EXT3-fs: groups: ");
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
-+ printk("%d: %d ", i,
-+ sbi->s_group_info[i]->bb_free);
++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free);
+ printk("\n");
+#endif
+ goto out;
+ *errp = -EIO;
+ goto out_err;
+ }
-+
++
+ err = ext3_journal_get_write_access(handle, gdp_bh);
+ if (err)
+ goto out_err;
+ * path only, here is single block always */
+ ext3_mb_release_blocks(sb, 1);
+ }
-+
++
+ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) {
+ atomic_inc(&sbi->s_bal_reqs);
+ atomic_add(*len, &sbi->s_bal_allocated);
+ s->max = sbi->s_mb_history_max;
+ s->start = sbi->s_mb_history_cur % s->max;
+ spin_unlock(&sbi->s_mb_history_lock);
-+
++
+ rc = seq_open(file, &ext3_mb_seq_history_ops);
+ if (rc == 0) {
+ struct seq_file *m = (struct seq_file *)file->private_data;
+
+static struct file_operations ext3_mb_seq_history_fops = {
+ .owner = THIS_MODULE,
-+ .open = ext3_mb_seq_history_open,
-+ .read = seq_read,
-+ .llseek = seq_lseek,
-+ .release = ext3_mb_seq_history_release,
++ .open = ext3_mb_seq_history_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = ext3_mb_seq_history_release,
+};
+
+static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
+ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
+ sizeof(struct ext3_group_info);
+ ext3_lock_group(sb, group);
-+ memcpy(&sg, sbi->s_group_info[group], i);
++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i);
+ ext3_unlock_group(sb, group);
+
+ if (EXT3_MB_GRP_NEED_INIT(&sg.info))
+
+static struct file_operations ext3_mb_seq_groups_fops = {
+ .owner = THIS_MODULE,
-+ .open = ext3_mb_seq_groups_open,
-+ .read = seq_read,
-+ .llseek = seq_lseek,
-+ .release = seq_release,
++ .open = ext3_mb_seq_groups_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = seq_release,
+};
+
+static void ext3_mb_history_release(struct super_block *sb)
+int ext3_mb_init_backend(struct super_block *sb)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int i, len;
-+
-+ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count;
-+ sbi->s_group_info = kmalloc(len, GFP_KERNEL);
++ int i, j, len, metalen;
++ int num_meta_group_infos =
++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >>
++ EXT3_DESC_PER_BLOCK_BITS(sb);
++ struct ext3_group_info **meta_group_info;
++
++ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
++ * So a two level scheme suffices for now. */
++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
++ num_meta_group_infos, GFP_KERNEL);
+ if (sbi->s_group_info == NULL) {
-+ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n");
++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n");
+ return -ENOMEM;
+ }
-+ memset(sbi->s_group_info, 0, len);
-+
+ sbi->s_buddy_cache = new_inode(sb);
+ if (sbi->s_buddy_cache == NULL) {
+ printk(KERN_ERR "EXT3-fs: can't get new inode\n");
-+ kfree(sbi->s_group_info);
-+ return -ENOMEM;
++ goto err_freesgi;
++ }
++
++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb);
++ for (i = 0; i < num_meta_group_infos; i++) {
++ if ((i + 1) == num_meta_group_infos)
++ metalen = sizeof(*meta_group_info) *
++ (sbi->s_groups_count -
++ (i << EXT3_DESC_PER_BLOCK_BITS(sb)));
++ meta_group_info = kmalloc(metalen, GFP_KERNEL);
++ if (meta_group_info == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a "
++ "buddy group\n");
++ goto err_freemeta;
++ }
++ sbi->s_group_info[i] = meta_group_info;
+ }
+
+ /*
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ struct ext3_group_desc * desc;
+
-+ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL);
-+ if (sbi->s_group_info[i] == NULL) {
++ meta_group_info =
++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)];
++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1);
++
++ meta_group_info[j] = kmalloc(len, GFP_KERNEL);
++ if (meta_group_info[j] == NULL) {
+ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n");
-+ goto err_out;
++ i--;
++ goto err_freebuddy;
+ }
+ desc = ext3_get_group_desc(sb, i, NULL);
+ if (desc == NULL) {
+ printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i);
-+ goto err_out;
++ goto err_freebuddy;
+ }
-+ memset(sbi->s_group_info[i], 0, len);
++ memset(meta_group_info[j], 0, len);
+ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT,
-+ &sbi->s_group_info[i]->bb_state);
-+ sbi->s_group_info[i]->bb_free =
++ &meta_group_info[j]->bb_state);
++ meta_group_info[j]->bb_free =
+ le16_to_cpu(desc->bg_free_blocks_count);
+ }
+
+ return 0;
+
-+err_out:
++err_freebuddy:
++ while (i >= 0) {
++ kfree(EXT3_GROUP_INFO(sb, i));
++ i--;
++ }
++ i = num_meta_group_infos;
++err_freemeta:
+ while (--i >= 0)
+ kfree(sbi->s_group_info[i]);
+ iput(sbi->s_buddy_cache);
-+
++err_freesgi:
++ kfree(sbi->s_group_info);
+ return -ENOMEM;
+}
+
+ max = max >> 1;
+ i++;
+ } while (i <= sb->s_blocksize_bits + 1);
-+
++
+
+ /* init file for buddy data */
+ if ((i = ext3_mb_init_backend(sb))) {
+int ext3_mb_release(struct super_block *sb)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int i;
-+
++ int i, num_meta_group_infos;
++
+ if (!test_opt(sb, MBALLOC))
+ return 0;
+
+ ext3_mb_free_committed_blocks(sb);
+
+ if (sbi->s_group_info) {
-+ for (i = 0; i < sbi->s_groups_count; i++) {
-+ if (sbi->s_group_info[i] == NULL)
-+ continue;
++ for (i = 0; i < sbi->s_groups_count; i++)
++ kfree(EXT3_GROUP_INFO(sb, i));
++ num_meta_group_infos = (sbi->s_groups_count +
++ EXT3_DESC_PER_BLOCK(sb) - 1) >>
++ EXT3_DESC_PER_BLOCK_BITS(sb);
++ for (i = 0; i < num_meta_group_infos; i++)
+ kfree(sbi->s_group_info[i]);
-+ }
+ kfree(sbi->s_group_info);
+ }
+ if (sbi->s_mb_offsets)
+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
+ spin_unlock(sb_bgl_lock(sbi, block_group));
+ percpu_counter_mod(&sbi->s_freeblocks_counter, count);
-+
++
+ ext3_mb_release_desc(&e3b);
+
+ *freed = count;
+ return;
+}
+
-+#define EXT3_ROOT "ext3"
-+#define EXT3_MB_STATS_NAME "mb_stats"
++#define EXT3_ROOT "ext3"
++#define EXT3_MB_STATS_NAME "mb_stats"
+#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan"
+#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan"
-+#define EXT3_MB_ORDER2_REQ "mb_order2_req"
++#define EXT3_MB_ORDER2_REQ "mb_order2_req"
+
+static int ext3_mb_stats_read(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
-Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h
+Index: linux-stage/include/linux/ext3_fs.h
===================================================================
---- linux-2.6.9-full.orig/include/linux/ext3_fs_sb.h 2006-05-18 23:57:04.000000000 +0400
-+++ linux-2.6.9-full/include/linux/ext3_fs_sb.h 2006-05-22 21:45:08.000000000 +0400
+--- linux-stage.orig/include/linux/ext3_fs.h 2006-05-25 10:36:04.000000000 -0600
++++ linux-stage/include/linux/ext3_fs.h 2006-05-25 10:36:04.000000000 -0600
+@@ -57,6 +57,14 @@ struct statfs;
+ #define ext3_debug(f, a...) do {} while (0)
+ #endif
+
++#define EXT3_MULTIBLOCK_ALLOCATOR 1
++
++#define EXT3_MB_HINT_MERGE 1
++#define EXT3_MB_HINT_RESERVED 2
++#define EXT3_MB_HINT_METADATA 4
++#define EXT3_MB_HINT_FIRST 8
++#define EXT3_MB_HINT_BEST 16
++
+ /*
+ * Special inodes numbers
+ */
+@@ -365,6 +373,7 @@ struct ext3_inode {
+ #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */
+ #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */
+ #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */
++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef clear_opt
+@@ -726,7 +735,7 @@ extern int ext3_bg_has_super(struct supe
+ extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
+ extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
+ extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
+- unsigned long);
++ unsigned long, int);
+ extern void ext3_free_blocks_sb (handle_t *, struct super_block *,
+ unsigned long, unsigned long, int *);
+ extern unsigned long ext3_count_free_blocks (struct super_block *);
+@@ -857,6 +866,17 @@ extern void ext3_extents_initialize_bloc
+ extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg);
+
++/* mballoc.c */
++extern long ext3_mb_stats;
++extern long ext3_mb_max_to_scan;
++extern int ext3_mb_init(struct super_block *, int);
++extern int ext3_mb_release(struct super_block *);
++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *);
++extern int ext3_mb_reserve_blocks(struct super_block *, int);
++extern void ext3_mb_release_blocks(struct super_block *, int);
++int __init init_ext3_proc(void);
++void exit_ext3_proc(void);
++
+ #endif /* __KERNEL__ */
+
+ /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */
+Index: linux-stage/include/linux/ext3_fs_sb.h
+===================================================================
+--- linux-stage.orig/include/linux/ext3_fs_sb.h 2006-05-25 10:36:04.000000000 -0600
++++ linux-stage/include/linux/ext3_fs_sb.h 2006-05-25 10:59:14.000000000 -0600
@@ -23,9 +23,15 @@
#define EXT_INCLUDE
#include <linux/blockgroup_lock.h>
/*
* third extended-fs super-block data in memory
-@@ -81,6 +87,39 @@ struct ext3_sb_info {
+@@ -81,6 +87,43 @@ struct ext3_sb_info {
char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
int s_jquota_fmt; /* Format of quota to use */
#endif
+
+ /* for buddy allocator */
-+ struct ext3_group_info **s_group_info;
++ struct ext3_group_info ***s_group_info;
+ struct inode *s_buddy_cache;
+ long s_blocks_reserved;
+ spinlock_t s_reserve_lock;
+ unsigned long s_mb_buddies_generated;
+ unsigned long long s_mb_generation_time;
};
-
- #endif /* _LINUX_EXT3_FS_SB */
-Index: linux-2.6.9-full/include/linux/ext3_fs.h
-===================================================================
---- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2006-05-18 23:57:04.000000000 +0400
-+++ linux-2.6.9-full/include/linux/ext3_fs.h 2006-05-22 21:44:37.000000000 +0400
-@@ -57,6 +57,14 @@ struct statfs;
- #define ext3_debug(f, a...) do {} while (0)
- #endif
-
-+#define EXT3_MULTIBLOCK_ALLOCATOR 1
-+
-+#define EXT3_MB_HINT_MERGE 1
-+#define EXT3_MB_HINT_RESERVED 2
-+#define EXT3_MB_HINT_METADATA 4
-+#define EXT3_MB_HINT_FIRST 8
-+#define EXT3_MB_HINT_BEST 16
+
- /*
- * Special inodes numbers
- */
-@@ -365,6 +373,7 @@ struct ext3_inode {
- #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */
- #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */
- #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */
-+#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */
++#define EXT3_GROUP_INFO(sb, group) \
++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \
++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)]
- /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
- #ifndef clear_opt
-@@ -726,7 +735,7 @@ extern int ext3_bg_has_super(struct supe
- extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
- extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
- extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
-- unsigned long);
-+ unsigned long, int);
- extern void ext3_free_blocks_sb (handle_t *, struct super_block *,
- unsigned long, unsigned long, int *);
- extern unsigned long ext3_count_free_blocks (struct super_block *);
-@@ -857,6 +866,17 @@ extern void ext3_extents_initialize_bloc
- extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
- unsigned int cmd, unsigned long arg);
-
-+/* mballoc.c */
-+extern long ext3_mb_stats;
-+extern long ext3_mb_max_to_scan;
-+extern int ext3_mb_init(struct super_block *, int);
-+extern int ext3_mb_release(struct super_block *);
-+extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *);
-+extern int ext3_mb_reserve_blocks(struct super_block *, int);
-+extern void ext3_mb_release_blocks(struct super_block *, int);
-+int __init init_ext3_proc(void);
-+void exit_ext3_proc(void);
-+
- #endif /* __KERNEL__ */
-
- /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */
-Index: linux-2.6.9-full/fs/ext3/super.c
+ #endif /* _LINUX_EXT3_FS_SB */
+Index: linux-stage/fs/ext3/super.c
===================================================================
---- linux-2.6.9-full.orig/fs/ext3/super.c 2006-05-18 23:57:04.000000000 +0400
-+++ linux-2.6.9-full/fs/ext3/super.c 2006-05-22 21:52:54.000000000 +0400
-@@ -394,6 +394,7 @@ void ext3_put_super (struct super_block
+--- linux-stage.orig/fs/ext3/super.c 2006-05-25 10:36:04.000000000 -0600
++++ linux-stage/fs/ext3/super.c 2006-05-25 10:36:04.000000000 -0600
+@@ -394,6 +394,7 @@ void ext3_put_super (struct super_block
struct ext3_super_block *es = sbi->s_es;
int i;
ext3_ext_release(sb);
ext3_xattr_put_super(sb);
journal_destroy(sbi->s_journal);
-@@ -596,7 +597,7 @@ enum {
- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
+@@ -597,6 +598,7 @@ enum {
Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-- Opt_extents, Opt_extdebug,
-+ Opt_extents, Opt_extdebug, Opt_mballoc, Opt_stripe
+ Opt_extents, Opt_noextents, Opt_extdebug,
++ Opt_mballoc, Opt_nomballoc, Opt_stripe,
};
static match_table_t tokens = {
-@@ -648,6 +649,8 @@ static match_table_t tokens = {
- {Opt_iopen_nopriv, "iopen_nopriv"},
+@@ -649,6 +651,9 @@ static match_table_t tokens = {
{Opt_extents, "extents"},
+ {Opt_noextents, "noextents"},
{Opt_extdebug, "extdebug"},
+ {Opt_mballoc, "mballoc"},
++ {Opt_nomballoc, "nomballoc"},
+ {Opt_stripe, "stripe=%u"},
{Opt_barrier, "barrier=%u"},
{Opt_err, NULL},
{Opt_resize, "resize"},
-@@ -958,6 +961,16 @@ clear_qf_name:
+@@ -962,6 +967,19 @@ static int parse_options (char * options
case Opt_extdebug:
set_opt (sbi->s_mount_opt, EXTDEBUG);
break;
+ case Opt_mballoc:
-+ set_opt (sbi->s_mount_opt, MBALLOC);
++ set_opt(sbi->s_mount_opt, MBALLOC);
++ break;
++ case Opt_nomballoc:
++ clear_opt(sbi->s_mount_opt, MBALLOC);
+ break;
+ case Opt_stripe:
+ if (match_int(&args[0], &option))
default:
printk (KERN_ERR
"EXT3-fs: Unrecognized mount option \"%s\" "
-@@ -1647,6 +1660,7 @@ static int ext3_fill_super (struct super
+@@ -1651,6 +1669,7 @@ static int ext3_fill_super (struct super
ext3_count_dirs(sb));
ext3_ext_init(sb);
return 0;
-@@ -2429,7 +2443,13 @@ static struct file_system_type ext3_fs_t
+@@ -2433,7 +2452,13 @@ static struct file_system_type ext3_fs_t
static int __init init_ext3_fs(void)
{
if (err)
return err;
err = init_inodecache();
-@@ -2451,6 +2471,7 @@ static void __exit exit_ext3_fs(void)
+@@ -2455,6 +2480,7 @@ static void __exit exit_ext3_fs(void)
unregister_filesystem(&ext3_fs_type);
destroy_inodecache();
exit_ext3_xattr();
}
int ext3_prep_san_write(struct inode *inode, long *blocks,
-Index: linux-2.6.9-full/fs/ext3/extents.c
+Index: linux-stage/fs/ext3/extents.c
===================================================================
---- linux-2.6.9-full.orig/fs/ext3/extents.c 2006-05-18 23:57:04.000000000 +0400
-+++ linux-2.6.9-full/fs/ext3/extents.c 2006-05-22 21:44:37.000000000 +0400
+--- linux-stage.orig/fs/ext3/extents.c 2006-05-25 10:36:04.000000000 -0600
++++ linux-stage/fs/ext3/extents.c 2006-05-25 10:36:04.000000000 -0600
@@ -777,7 +777,7 @@ cleanup:
for (i = 0; i < depth; i++) {
if (!ablocks[i])
} else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
printk("strange request: removal %lu-%lu from %u:%u\n",
from, to, ex->ee_block, ex->ee_len);
-Index: linux-2.6.9-full/fs/ext3/Makefile
+Index: linux-stage/fs/ext3/inode.c
===================================================================
---- linux-2.6.9-full.orig/fs/ext3/Makefile 2006-05-18 23:57:04.000000000 +0400
-+++ linux-2.6.9-full/fs/ext3/Makefile 2006-05-22 21:44:37.000000000 +0400
-@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o
+--- linux-stage.orig/fs/ext3/inode.c 2006-05-25 10:36:04.000000000 -0600
++++ linux-stage/fs/ext3/inode.c 2006-05-25 10:36:04.000000000 -0600
+@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h
+ ext3_journal_forget(handle, branch[i].bh);
+ }
+ for (i = 0; i < keys; i++)
+- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1);
+ return err;
+ }
- ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
- ioctl.o namei.o super.o symlink.o hash.o resize.o \
-- extents.o
-+ extents.o mballoc.o
+@@ -673,7 +673,7 @@ err_out:
+ if (err == -EAGAIN)
+ for (i = 0; i < num; i++)
+ ext3_free_blocks(handle, inode,
+- le32_to_cpu(where[i].key), 1);
++ le32_to_cpu(where[i].key), 1, 1);
+ return err;
+ }
- ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
- ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
-Index: linux-2.6.9-full/fs/ext3/xattr.c
+@@ -1831,7 +1831,7 @@ ext3_clear_blocks(handle_t *handle, stru
+ }
+ }
+
+- ext3_free_blocks(handle, inode, block_to_free, count);
++ ext3_free_blocks(handle, inode, block_to_free, count, 1);
+ }
+
+ /**
+@@ -2004,7 +2004,7 @@ static void ext3_free_branches(handle_t
+ ext3_journal_test_restart(handle, inode);
+ }
+
+- ext3_free_blocks(handle, inode, nr, 1);
++ ext3_free_blocks(handle, inode, nr, 1, 1);
+
+ if (parent_bh) {
+ /*
+Index: linux-stage/fs/ext3/balloc.c
===================================================================
---- linux-2.6.9-full.orig/fs/ext3/xattr.c 2006-05-18 23:57:04.000000000 +0400
-+++ linux-2.6.9-full/fs/ext3/xattr.c 2006-05-22 21:44:37.000000000 +0400
+--- linux-stage.orig/fs/ext3/balloc.c 2006-05-25 10:36:02.000000000 -0600
++++ linux-stage/fs/ext3/balloc.c 2006-05-25 10:36:04.000000000 -0600
+@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_
+ *
+ * Return buffer_head on success or NULL in case of failure.
+ */
+-static struct buffer_head *
++struct buffer_head *
+ read_block_bitmap(struct super_block *sb, unsigned int block_group)
+ {
+ struct ext3_group_desc * desc;
+@@ -451,24 +451,6 @@
+ return;
+ }
+
+-/* Free given blocks, update quota and i_blocks field */
+-void ext3_free_blocks(handle_t *handle, struct inode *inode,
+- unsigned long block, unsigned long count)
+-{
+- struct super_block * sb;
+- int dquot_freed_blocks;
+-
+- sb = inode->i_sb;
+- if (!sb) {
+- printk ("ext3_free_blocks: nonexistent device");
+- return;
+- }
+- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
+- if (dquot_freed_blocks)
+- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+- return;
+-}
+-
+ /*
+ * For ext3 allocations, we must not reuse any blocks which are
+ * allocated in the bitmap buffer's "last committed data" copy. This
+@@ -1131,7 +1113,7 @@
+ * bitmap, and then for any free bit if that fails.
+ * This function also updates quota and i_blocks field.
+ */
+-int ext3_new_block(handle_t *handle, struct inode *inode,
++int ext3_new_block_old(handle_t *handle, struct inode *inode,
+ unsigned long goal, int *errp)
+ {
+ struct buffer_head *bitmap_bh = NULL;
+Index: linux-stage/fs/ext3/xattr.c
+===================================================================
+--- linux-stage.orig/fs/ext3/xattr.c 2006-05-25 10:36:04.000000000 -0600
++++ linux-stage/fs/ext3/xattr.c 2006-05-25 10:36:04.000000000 -0600
@@ -1281,7 +1281,7 @@ ext3_xattr_set_handle2(handle_t *handle,
new_bh = sb_getblk(sb, block);
if (!new_bh) {
get_bh(bh);
ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl);
} else {
-Index: linux-2.6.9-full/fs/ext3/mballoc.c
+Index: linux-stage/fs/ext3/mballoc.c
===================================================================
---- linux-2.6.9-full.orig/fs/ext3/mballoc.c 2006-05-12 23:14:51.200000000 +0400
-+++ linux-2.6.9-full/fs/ext3/mballoc.c 2006-05-22 21:51:30.000000000 +0400
-@@ -0,0 +1,2671 @@
+--- linux-stage.orig/fs/ext3/mballoc.c 2006-05-23 17:33:37.579436680 -0600
++++ linux-stage/fs/ext3/mballoc.c 2006-05-25 10:59:14.000000000 -0600
+@@ -0,0 +1,2702 @@
+/*
+ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+
+ /* search goals */
+ struct ext3_free_extent ac_g_ex;
-+
++
+ /* the best found extent */
+ struct ext3_free_extent ac_b_ex;
-+
++
+ /* number of iterations done. we have to track to limit searching */
+ unsigned long ac_ex_scanned;
+ __u16 ac_groups_scanned;
+ if (mb_check_counter++ % 300 != 0)
+ return;
+ }
-+
++
+ while (order > 1) {
+ buddy = mb_find_buddy(e3b, order, &max);
+ J_ASSERT(buddy);
+ sb = inode->i_sb;
+ blocksize = 1 << inode->i_blkbits;
+ blocks_per_page = PAGE_CACHE_SIZE / blocksize;
-+
++
+ groups_per_page = blocks_per_page >> 1;
+ if (groups_per_page == 0)
+ groups_per_page = 1;
+ memset(bh, 0, i);
+ } else
+ bh = &bhs;
-+
++
+ first_group = page->index * blocks_per_page / 2;
-+
++
+ /* read all groups the page covers into the cache */
+ for (i = 0; i < groups_per_page; i++) {
+ struct ext3_group_desc * desc;
+ mb_debug("put buddy for group %u in page %lu/%x\n",
+ group, page->index, i * blocksize);
+ memset(data, 0xff, blocksize);
-+ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0;
-+ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0,
++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0;
++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0,
+ sizeof(unsigned short)*(sb->s_blocksize_bits+2));
+ ext3_mb_generate_buddy(sb, data, bitmap,
-+ EXT3_SB(sb)->s_group_info[group]);
++ EXT3_GROUP_INFO(sb, group));
+ } else {
+ /* this is block of bitmap */
+ mb_debug("put bitmap for group %u in page %lu/%x\n",
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+
+ e3b->bd_blkbits = sb->s_blocksize_bits;
-+ e3b->bd_info = sbi->s_group_info[group];
++ e3b->bd_info = EXT3_GROUP_INFO(sb, group);
+ e3b->bd_sb = sb;
+ e3b->bd_group = group;
+ e3b->bd_buddy_page = NULL;
+ext3_lock_group(struct super_block *sb, int group)
+{
+ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT,
-+ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++ &EXT3_GROUP_INFO(sb, group)->bb_state);
+}
+
+static inline void
+ext3_unlock_group(struct super_block *sb, int group)
+{
+ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT,
-+ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++ &EXT3_GROUP_INFO(sb, group)->bb_state);
+}
+
+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
+
+ ext3_lock_group(ac->ac_sb, group);
+ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex);
-+
++
+ if (max > 0) {
+ ac->ac_b_ex = ex;
+ ext3_mb_use_best_found(ac, e3b);
+
+ ext3_lock_group(ac->ac_sb, group);
+ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start,
-+ ac->ac_g_ex.fe_len, &ex);
++ ac->ac_g_ex.fe_len, &ex);
+
+ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
+ unsigned long start;
+ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) +
-+ ex.fe_start + le32_to_cpu(es->s_first_data_block));
++ ex.fe_start + le32_to_cpu(es->s_first_data_block));
+ if (start % sbi->s_stripe == 0) {
+ ac->ac_found++;
+ ac->ac_b_ex = ex;
+ * we try to find stripe-aligned chunks for stripe-size requests
+ */
+static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac,
-+ struct ext3_buddy *e3b)
++ struct ext3_buddy *e3b)
+{
+ struct super_block *sb = ac->ac_sb;
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+static int ext3_mb_good_group(struct ext3_allocation_context *ac,
+ int group, int cr)
+{
-+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
-+ struct ext3_group_info *grp = sbi->s_group_info[group];
++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group);
+ unsigned free, fragments, i, bits;
+
+ J_ASSERT(cr >= 0 && cr < 4);
+
+ if (*len == 1 && sbi->s_stripe) {
+ /* looks like a metadata, let's use a dirty hack for raid5
-+ * move all metadata in first groups in hope to hit cached
++ * move all metadata in first groups in hope to hit cached
+ * sectors and thus avoid read-modify cycles in raid5 */
+ ac.ac_g_ex.fe_group = group = 0;
+ }
+ if (group == EXT3_SB(sb)->s_groups_count)
+ group = 0;
+
-+ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) {
++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) {
+ /* we need full data about the group
+ * to make a good selection */
+ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
+ }
+
+ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND &&
-+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
+ /*
+ * We've been searching too long. Let's try to allocate
+ * the best chunk we've found so far
+ sbi->s_blocks_reserved, ac.ac_found);
+ printk("EXT3-fs: groups: ");
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
-+ printk("%d: %d ", i,
-+ sbi->s_group_info[i]->bb_free);
++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free);
+ printk("\n");
+#endif
+ goto out;
+ *errp = -EIO;
+ goto out_err;
+ }
-+
++
+ err = ext3_journal_get_write_access(handle, gdp_bh);
+ if (err)
+ goto out_err;
+ * path only, here is single block always */
+ ext3_mb_release_blocks(sb, 1);
+ }
-+
++
+ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) {
+ atomic_inc(&sbi->s_bal_reqs);
+ atomic_add(*len, &sbi->s_bal_allocated);
+ s->max = sbi->s_mb_history_max;
+ s->start = sbi->s_mb_history_cur % s->max;
+ spin_unlock(&sbi->s_mb_history_lock);
-+
++
+ rc = seq_open(file, &ext3_mb_seq_history_ops);
+ if (rc == 0) {
+ struct seq_file *m = (struct seq_file *)file->private_data;
+
+static struct file_operations ext3_mb_seq_history_fops = {
+ .owner = THIS_MODULE,
-+ .open = ext3_mb_seq_history_open,
-+ .read = seq_read,
-+ .llseek = seq_lseek,
-+ .release = ext3_mb_seq_history_release,
++ .open = ext3_mb_seq_history_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = ext3_mb_seq_history_release,
+};
+
+static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
+ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
+ sizeof(struct ext3_group_info);
+ ext3_lock_group(sb, group);
-+ memcpy(&sg, sbi->s_group_info[group], i);
++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i);
+ ext3_unlock_group(sb, group);
+
+ if (EXT3_MB_GRP_NEED_INIT(&sg.info))
+
+static struct file_operations ext3_mb_seq_groups_fops = {
+ .owner = THIS_MODULE,
-+ .open = ext3_mb_seq_groups_open,
-+ .read = seq_read,
-+ .llseek = seq_lseek,
-+ .release = seq_release,
++ .open = ext3_mb_seq_groups_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = seq_release,
+};
+
+static void ext3_mb_history_release(struct super_block *sb)
+int ext3_mb_init_backend(struct super_block *sb)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int i, len;
-+
-+ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count;
-+ sbi->s_group_info = kmalloc(len, GFP_KERNEL);
++ int i, j, len, metalen;
++ int num_meta_group_infos =
++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >>
++ EXT3_DESC_PER_BLOCK_BITS(sb);
++ struct ext3_group_info **meta_group_info;
++
++ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
++ * So a two level scheme suffices for now. */
++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
++ num_meta_group_infos, GFP_KERNEL);
+ if (sbi->s_group_info == NULL) {
-+ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n");
++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n");
+ return -ENOMEM;
+ }
-+ memset(sbi->s_group_info, 0, len);
-+
+ sbi->s_buddy_cache = new_inode(sb);
+ if (sbi->s_buddy_cache == NULL) {
+ printk(KERN_ERR "EXT3-fs: can't get new inode\n");
-+ kfree(sbi->s_group_info);
-+ return -ENOMEM;
++ goto err_freesgi;
++ }
++
++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb);
++ for (i = 0; i < num_meta_group_infos; i++) {
++ if ((i + 1) == num_meta_group_infos)
++ metalen = sizeof(*meta_group_info) *
++ (sbi->s_groups_count -
++ (i << EXT3_DESC_PER_BLOCK_BITS(sb)));
++ meta_group_info = kmalloc(metalen, GFP_KERNEL);
++ if (meta_group_info == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a "
++ "buddy group\n");
++ goto err_freemeta;
++ }
++ sbi->s_group_info[i] = meta_group_info;
+ }
+
+ /*
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ struct ext3_group_desc * desc;
+
-+ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL);
-+ if (sbi->s_group_info[i] == NULL) {
++ meta_group_info =
++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)];
++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1);
++
++ meta_group_info[j] = kmalloc(len, GFP_KERNEL);
++ if (meta_group_info[j] == NULL) {
+ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n");
-+ goto err_out;
++ i--;
++ goto err_freebuddy;
+ }
+ desc = ext3_get_group_desc(sb, i, NULL);
+ if (desc == NULL) {
+ printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i);
-+ goto err_out;
++ goto err_freebuddy;
+ }
-+ memset(sbi->s_group_info[i], 0, len);
++ memset(meta_group_info[j], 0, len);
+ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT,
-+ &sbi->s_group_info[i]->bb_state);
-+ sbi->s_group_info[i]->bb_free =
++ &meta_group_info[j]->bb_state);
++ meta_group_info[j]->bb_free =
+ le16_to_cpu(desc->bg_free_blocks_count);
+ }
+
+ return 0;
+
-+err_out:
++err_freebuddy:
++ while (i >= 0) {
++ kfree(EXT3_GROUP_INFO(sb, i));
++ i--;
++ }
++ i = num_meta_group_infos;
++err_freemeta:
+ while (--i >= 0)
+ kfree(sbi->s_group_info[i]);
+ iput(sbi->s_buddy_cache);
-+
++err_freesgi:
++ kfree(sbi->s_group_info);
+ return -ENOMEM;
+}
+
+ max = max >> 1;
+ i++;
+ } while (i <= sb->s_blocksize_bits + 1);
-+
++
+
+ /* init file for buddy data */
+ if ((i = ext3_mb_init_backend(sb))) {
+int ext3_mb_release(struct super_block *sb)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int i;
-+
++ int i, num_meta_group_infos;
++
+ if (!test_opt(sb, MBALLOC))
+ return 0;
+
+ ext3_mb_free_committed_blocks(sb);
+
+ if (sbi->s_group_info) {
-+ for (i = 0; i < sbi->s_groups_count; i++) {
-+ if (sbi->s_group_info[i] == NULL)
-+ continue;
++ for (i = 0; i < sbi->s_groups_count; i++)
++ kfree(EXT3_GROUP_INFO(sb, i));
++ num_meta_group_infos = (sbi->s_groups_count +
++ EXT3_DESC_PER_BLOCK(sb) - 1) >>
++ EXT3_DESC_PER_BLOCK_BITS(sb);
++ for (i = 0; i < num_meta_group_infos; i++)
+ kfree(sbi->s_group_info[i]);
-+ }
+ kfree(sbi->s_group_info);
+ }
+ if (sbi->s_mb_offsets)
+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
+ spin_unlock(sb_bgl_lock(sbi, block_group));
+ percpu_counter_mod(&sbi->s_freeblocks_counter, count);
-+
++
+ ext3_mb_release_desc(&e3b);
+
+ *freed = count;
+ return;
+}
+
-+#define EXT3_ROOT "ext3"
-+#define EXT3_MB_STATS_NAME "mb_stats"
++#define EXT3_ROOT "ext3"
++#define EXT3_MB_STATS_NAME "mb_stats"
+#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan"
+#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan"
-+#define EXT3_MB_ORDER2_REQ "mb_order2_req"
++#define EXT3_MB_ORDER2_REQ "mb_order2_req"
+
+static int ext3_mb_stats_read(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3);
+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
+}
-Index: linux-2.6.9-full/fs/ext3/balloc.c
-===================================================================
---- linux-2.6.9-full.orig/fs/ext3/balloc.c 2006-03-10 18:20:03.000000000 +0300
-+++ linux-2.6.9-full/fs/ext3/balloc.c 2006-05-22 21:44:37.000000000 +0400
-@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_
- *
- * Return buffer_head on success or NULL in case of failure.
- */
--static struct buffer_head *
-+struct buffer_head *
- read_block_bitmap(struct super_block *sb, unsigned int block_group)
- {
- struct ext3_group_desc * desc;
-@@ -451,24 +451,6 @@ error_return:
- return;
- }
-
--/* Free given blocks, update quota and i_blocks field */
--void ext3_free_blocks(handle_t *handle, struct inode *inode,
-- unsigned long block, unsigned long count)
--{
-- struct super_block * sb;
-- int dquot_freed_blocks;
--
-- sb = inode->i_sb;
-- if (!sb) {
-- printk ("ext3_free_blocks: nonexistent device");
-- return;
-- }
-- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
-- if (dquot_freed_blocks)
-- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
-- return;
--}
--
- /*
- * For ext3 allocations, we must not reuse any blocks which are
- * allocated in the bitmap buffer's "last committed data" copy. This
-@@ -1131,7 +1113,7 @@ int ext3_should_retry_alloc(struct super
- * bitmap, and then for any free bit if that fails.
- * This function also updates quota and i_blocks field.
- */
--int ext3_new_block(handle_t *handle, struct inode *inode,
-+int ext3_new_block_old(handle_t *handle, struct inode *inode,
- unsigned long goal, int *errp)
- {
- struct buffer_head *bitmap_bh = NULL;
-Index: linux-2.6.9-full/fs/ext3/inode.c
+Index: linux-stage/fs/ext3/Makefile
===================================================================
---- linux-2.6.9-full.orig/fs/ext3/inode.c 2006-05-18 23:57:04.000000000 +0400
-+++ linux-2.6.9-full/fs/ext3/inode.c 2006-05-22 21:44:37.000000000 +0400
-@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h
- ext3_journal_forget(handle, branch[i].bh);
- }
- for (i = 0; i < keys; i++)
-- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
-+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1);
- return err;
- }
-
-@@ -673,7 +673,7 @@ err_out:
- if (err == -EAGAIN)
- for (i = 0; i < num; i++)
- ext3_free_blocks(handle, inode,
-- le32_to_cpu(where[i].key), 1);
-+ le32_to_cpu(where[i].key), 1, 1);
- return err;
- }
-
-@@ -1831,7 +1831,7 @@ ext3_clear_blocks(handle_t *handle, stru
- }
- }
+--- linux-stage.orig/fs/ext3/Makefile 2006-05-25 10:36:04.000000000 -0600
++++ linux-stage/fs/ext3/Makefile 2006-05-25 10:36:04.000000000 -0600
+@@ -6,7 +6,7 @@
-- ext3_free_blocks(handle, inode, block_to_free, count);
-+ ext3_free_blocks(handle, inode, block_to_free, count, 1);
- }
-
- /**
-@@ -2004,7 +2004,7 @@ static void ext3_free_branches(handle_t
- ext3_journal_test_restart(handle, inode);
- }
-
-- ext3_free_blocks(handle, inode, nr, 1);
-+ ext3_free_blocks(handle, inode, nr, 1, 1);
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+ ioctl.o namei.o super.o symlink.o hash.o resize.o \
+- extents.o
++ extents.o mballoc.o
- if (parent_bh) {
- /*
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
--- /dev/null
+Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem
+From: Mingming Cao <cmm@us.ibm.com>
+
+
+If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e.
+CONFIG_LBD not defined in the kernel), the calculation of the disk sector
+will overflow. Add check at ext3_fill_super() and ext3_group_extend() to
+prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4
+bytes.
+
+Verified this patch on a 32 bit platform without CONFIG_LBD defined
+(sector_t is 32 bits long), mount refuses to mount a 10TB ext3.
+
+Signed-off-by: Mingming Cao<cmm@us.ibm.com>
+Acked-by: Andreas Dilger <adilger@clusterfs.com>
+Signed-off-by: Andrew Morton <akpm@osdl.org>
+---
+
+ fs/ext3/resize.c | 10 ++++++++++
+ fs/ext3/super.c | 10 ++++++++++
+ 2 files changed, 20 insertions(+)
+
+diff -puN fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/resize.c
+--- devel/fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700
++++ devel-akpm/fs/ext3/resize.c 2006-05-22 14:10:56.000000000 -0700
+@@ -926,6 +926,16 @@ int ext3_group_extend(struct super_block
+ if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
+ return 0;
+
++ if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
++ printk(KERN_ERR "EXT3-fs: filesystem on %s: "
++ "too large to resize to %lu blocks safely\n",
++ sb->s_id, n_blocks_count);
++ if (sizeof(sector_t) < 8)
++ ext3_warning(sb, __FUNCTION__,
++ "CONFIG_LBD not enabled\n");
++ return -EINVAL;
++ }
++
+ if (n_blocks_count < o_blocks_count) {
+ ext3_warning(sb, __FUNCTION__,
+ "can't shrink FS - resize aborted");
+diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c
+--- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700
++++ devel-akpm/fs/ext3/super.c 2006-05-22 14:11:10.000000000 -0700
+@@ -1565,6 +1565,17 @@ static int ext3_fill_super (struct super
+ goto failed_mount;
+ }
+
++ if (le32_to_cpu(es->s_blocks_count) >
++ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
++ printk(KERN_ERR "EXT3-fs: filesystem on %s: "
++ "too large to mount safely - %u blocks\n", sb->s_id,
++ le32_to_cpu(es->s_blocks_count));
++ if (sizeof(sector_t) < 8)
++ printk(KERN_WARNING
++ "EXT3-fs: CONFIG_LBD not enabled\n");
++ goto failed_mount;
++ }
++
+ if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
+ goto cantfind_ext3;
+ sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
+_
--- /dev/null
+Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem
+From: Mingming Cao <cmm@us.ibm.com>
+
+
+If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e.
+CONFIG_LBD not defined in the kernel), the calculation of the disk sector
+will overflow. Add check at ext3_fill_super() and ext3_group_extend() to
+prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4
+bytes.
+
+Verified this patch on a 32 bit platform without CONFIG_LBD defined
+(sector_t is 32 bits long), mount refuses to mount a 10TB ext3.
+
+Signed-off-by: Mingming Cao<cmm@us.ibm.com>
+Acked-by: Andreas Dilger <adilger@clusterfs.com>
+Signed-off-by: Andrew Morton <akpm@osdl.org>
+---
+
+ fs/ext3/resize.c | 10 ++++++++++
+ fs/ext3/super.c | 10 ++++++++++
+ 2 files changed, 20 insertions(+)
+
+diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c
+--- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700
++++ devel-akpm/fs/ext3/super.c 2006-05-22 14:11:10.000000000 -0700
+@@ -1565,6 +1565,17 @@ static int ext3_fill_super (struct super
+ goto failed_mount;
+ }
+
++ if (le32_to_cpu(es->s_blocks_count) >
++ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
++ printk(KERN_ERR "EXT3-fs: filesystem on %s: "
++ "too large to mount safely - %u blocks\n", sb->s_id,
++ le32_to_cpu(es->s_blocks_count));
++ if (sizeof(sector_t) < 8)
++ printk(KERN_WARNING
++ "EXT3-fs: CONFIG_LBD not enabled\n");
++ goto failed_mount;
++ }
++
+ sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
+ le32_to_cpu(es->s_first_data_block) +
+ EXT3_BLOCKS_PER_GROUP(sb) - 1) /
+_
--- /dev/null
+Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem
+From: Mingming Cao <cmm@us.ibm.com>
+
+
+If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e.
+CONFIG_LBD not defined in the kernel), the calculation of the disk sector
+will overflow. Add check at ext3_fill_super() and ext3_group_extend() to
+prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4
+bytes.
+
+Verified this patch on a 32 bit platform without CONFIG_LBD defined
+(sector_t is 32 bits long), mount refuses to mount a 10TB ext3.
+
+Signed-off-by: Mingming Cao<cmm@us.ibm.com>
+Acked-by: Andreas Dilger <adilger@clusterfs.com>
+Signed-off-by: Andrew Morton <akpm@osdl.org>
+---
+
+ fs/ext3/resize.c | 10 ++++++++++
+ fs/ext3/super.c | 10 ++++++++++
+ 2 files changed, 20 insertions(+)
+
+diff -puN fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/resize.c
+--- devel/fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700
++++ devel-akpm/fs/ext3/resize.c 2006-05-22 14:10:56.000000000 -0700
+@@ -926,6 +926,16 @@ int ext3_group_extend(struct super_block
+ if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
+ return 0;
+
++ if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
++ printk(KERN_ERR "EXT3-fs: filesystem on %s: "
++ "too large to resize to %lu blocks safely\n",
++ sb->s_id, n_blocks_count);
++ if (sizeof(sector_t) < 8)
++ ext3_warning(sb, __FUNCTION__,
++ "CONFIG_LBD not enabled\n");
++ return -EINVAL;
++ }
++
+ if (n_blocks_count < o_blocks_count) {
+ ext3_warning(sb, __FUNCTION__,
+ "can't shrink FS - resize aborted");
+diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c
+--- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700
++++ devel-akpm/fs/ext3/super.c 2006-05-22 14:11:10.000000000 -0700
+@@ -1565,6 +1565,17 @@ static int ext3_fill_super (struct super
+ goto failed_mount;
+ }
+
++ if (le32_to_cpu(es->s_blocks_count) >
++ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
++ printk(KERN_ERR "EXT3-fs: filesystem on %s: "
++ "too large to mount safely - %u blocks\n", sb->s_id,
++ le32_to_cpu(es->s_blocks_count));
++ if (sizeof(sector_t) < 8)
++ printk(KERN_WARNING
++ "EXT3-fs: CONFIG_LBD not enabled\n");
++ goto failed_mount;
++ }
++
+ sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
+ le32_to_cpu(es->s_first_data_block) +
+ EXT3_BLOCKS_PER_GROUP(sb) - 1) /
+_
ext3-nlinks-2.6.9.patch
ext3-ialloc-2.6.patch
ext3-lookup-dotdot-2.6.9.patch
+ext3-sector_t-overflow-2.6.9-rhel4.patch
ext3-htree-dot-2.6.5-suse.patch
ext3-ialloc-2.6.patch
ext3-lookup-dotdot-2.6.9.patch
+ext3-sector_t-overflow-2.6.5-suse.patch
ext3-htree-dot-2.6.patch
ext3-external-journal-2.6.12.patch
ext3-lookup-dotdot-2.6.9.patch
+ext3-sector_t-overflow-2.6.12.patch
tbd Cluster File Systems, Inc. <info@clusterfs.com>
* version 1.4.7
+ * Support for kernels:
+ 2.6.9-34.EL (RHEL 4)
+ 2.6.5-7.252 (SLES 9)
+ 2.6.12.6 vanilla (kernel.org)
* bug fixes
-Severity : enhancement
-Bugzilla : 9292
-Description: Getattr by fid
-Details : Getting a file attributes by its fid, obtaining UPDATE|LOOKUP
- locks, avoids extra getattr rpc requests to MDS, allows '/' to
- have locks and avoids getattr rpc requests for it on every stat.
-
Severity : major
Frequency : rare
Bugzilla : 5719, 9635, 9792, 9684,
Severity : minor
Frequency : Always
Bugzilla : 9486
-Description: extended inode attributes work improperly for the case of 2.4/2.6
- kernels used on client/server or the other way around.
+Description: extended inode attributes (immutable, append-only) work improperly
+ when 2.4 and 2.6 kernels are used on client/server or vice versa
Details : Introduce kernel-independent values for these flags.
+Severity : enhancement
+Frequency : Always
+Bugzilla : 10248
+Description: Allow fractional MB tunings for lustre in /proc/ filesystem.
+Details : Many of the /proc/ tunables can only be tuned at a megabyte
+ granularity. Now, fractional MB granularity is supported,
+ which is very useful for low-memory systems.
+
+Severity : enhancement
+Bugzilla : 9292
+Description: Getattr by fid
+Details : Getting a file attributes by its fid, obtaining UPDATE|LOOKUP
+ locks, avoids extra getattr rpc requests to MDS, allows '/' to
+ have locks and avoids getattr rpc requests for it on every stat.
+
+Severity : major
+Frequency : Always, for filesystems larger than 2TB
+Bugzilla : 6191
+Description: ldiskfs crash at mount for filesystem larger than 2TB with mballoc
+Details : Kernel kmalloc limits allocations to 128kB and this prevents
+ filesystems larger than 2TB from being mounted with mballoc
+
+Severity : critical
+Frequency : Always, for 32-bit kernel without CONFIG_LBD and filesystem > 2TB
+Bugzilla : 6191
+Description: ldiskfs crash at mount for filesystem larger than 2TB with mballoc
+Details : If a 32-bit kernel is compiled without CONFIG_LBD enabled and a
+ filesystem larger than 2TB is mounted, then the kernel will
+ silently corrupt the start of the filesystem. CONFIG_LBD is
+ enabled for all CFS-supported kernels, but the possibility of
+ this happening with a modified kernel config exists.
+Severity : enhancement
+Bugzilla : 10462
+Description: add client O_DIRECT support for 2.6 kernels
+Details : It is now possible to do O_DIRECT reads and writes to files
+ in the Lustre client mountpoint on 2.6 kernel clients.
+
+Severity : enhancement
+Bugzilla : 10446
+Description: parallel glimpse, setattr, statfs, punch, destroy requests
+Details : Sends glimpse, setattr, statfs, punch, destroy requests to OSTs in
+ parallel, not waiting for response from every OST before sending
+ a rpc to the next OST.
------------------------------------------------------------------------------
02-14-2006 Cluster File Systems, Inc. <info@clusterfs.com>
this release. See https://bugzilla.clusterfs.com/show_bug.cgi?id=10052
for details.
* bug fixes
- * Support for newer kernels:
- 2.6.9-22.0.2.EL (RHEL 4),
- 2.6.5-7.244 (SLES 9) - same as 1.4.5.2.
+ * Support for kernels:
+ 2.6.9-22.0.2.EL (RHEL 4)
+ 2.6.5-7.244 (SLES 9)
2.6.12.6 vanilla (kernel.org)
echo "#define LUSTRE_RELEASE @RELEASE@" >> tmpver
cmp -s $(BUILD_VER_H) tmpver > tmpdiff 2> /dev/null && \
$(RM) tmpver tmpdiff || \
- mv tmpver $(BUILD_VER_H)
-
-CSTK=/tmp/checkstack
-CSTKO=/tmp/checkstack.orig
-
-checkstack:
- [ -f ${CSTK} -a ! -s ${CSTKO} ] && mv ${CSTK} ${CSTKO} || true
- for i in ${SUBDIRS} lnet/klnds/*; do \
- MOD=$$i/`basename $$i`.o; \
- [ -f $$MOD ] && objdump -d $$MOD | perl tests/checkstack.pl; \
- done | sort -nr > ${CSTK}
- [ -f ${CSTKO} ] && ! diff -u ${CSTKO} ${CSTK} || head -30 ${CSTK}
-
-checkstack-update:
- [ -f ${CSTK} ] && mv ${CSTK} ${CSTKO}
-
-checkstack-clean:
- rm -f ${CSTK} ${CSTKO}
+ mv -f tmpver $(BUILD_VER_H)
# llite/xattr.c
AC_CHECK_HEADERS([linux/xattr_acl.h])
+# utils/llverfs.c
+AC_CHECK_HEADERS([ext2fs/ext2fs.h])
+
# use universal lustre headers
# i.e: include/obd.h instead of include/linux/obd.h
AC_CHECK_FILE($PWD/lustre/include/obd.h, [AC_DEFINE(UNIV_LUSTRE_HEADERS, 1, [Use universal lustre headers])])
AM_CONDITIONAL(SERVER, test x$enable_server = xyes)
AM_CONDITIONAL(QUOTA, test x$enable_quota = xyes)
AM_CONDITIONAL(BLKID, test x$ac_cv_header_blkid_blkid_h = xyes)
-AM_CONDITIONAL(EXT2FS, test x$ac_cv_header_ext2fs_ext2fs_h = xyes)
+AM_CONDITIONAL(EXT2FS_DEVEL, test x$ac_cv_header_ext2fs_ext2fs_h = xyes)
])
#
.B lfs setstripe <filename> <stripe-size> <start-ost> <stripe-cnt>
.br
.B lfs check <mds| osts| servers>
+.br
+.B lfs df [-i] [-h] [path]
.SH DESCRIPTION
.B lfs
can be used to create a new file with a specific striping pattern, determine the default striping pattern, gather the extended attributes (object numbers and
.B osts
List all the OSTs for the filesystem
.TP
+.B df
+Report filesystem disk space usage or inode usage of each MDS/OSD.
+.TP
.B help
Provides brief help on the various arguments
.TP
.TP
.B $lfs osts
List all the OSTs
+.TP
+.B $lfs df -i
+Lists inode consumption per OST and MDS
.SH BUGS
None are known.
\series bold
lfs\SpecialChar ~
quota [-o obd_uuid] [-u|-g] <name> <filesystem>
+\layout Standard
+
+\series bold
+lfs\SpecialChar ~
+df [-i] [-h] [path]
+\layout Standard
+
+\series bold
+lfs\SpecialChar ~
+help
\layout Subsection
DESCRIPTION
\layout List
\labelwidthstring 00.00.0000
+\series bold
+df
+\series default
+ Report filesystem disk space usage or inode usage of each MDS/OSD.
+\layout List
+\labelwidthstring 00.00.0000
\series bold
help
Optional arguement to specify the journal size for the ext3 file system. The size should be in the units expected by mkfs, so for ext3 it should be in MB. If this is option is not used, the ext3 filesystem will be configured with a journal size dependent upon how large the filesystem is.
.PP
.B --add mtpt
-Creates a mount-point on the specified node. Either an LOV or OSC name can be used.
+Creates a mount-point on the specified node for the given LOV.
.TP
--node node
Node that will use the mtpt.
\layout Description
--add\SpecialChar ~
-mtpt Creates a mount-point on the specified node.
- Either an LOV or OSC name can be used.
+mtpt Creates a mount-point on the specified node for the given LOV.
\begin_deeper
\layout Description
#define FSFILT_OP_JOIN 11
#define FSFILT_OP_NOOP 15
-#define fsfilt_check_slow(start, timeout, msg) \
+#define fsfilt_check_slow(obd, start, timeout, msg) \
do { \
if (time_before(jiffies, start + 15 * HZ)) \
break; \
else if (time_before(jiffies, start + 30 * HZ)) \
- CDEBUG(D_VFSTRACE,"slow %s %lus\n", msg,(jiffies-start)/HZ);\
+ CDEBUG(D_VFSTRACE, "%s: slow %s %lus\n", obd->obd_name, \
+ msg, (jiffies-start) / HZ); \
else if (time_before(jiffies, start + timeout / 2 * HZ)) \
- CWARN("slow %s %lus\n", msg, (jiffies - start) / HZ); \
+ CWARN("%s: slow %s %lus\n", obd->obd_name, msg, \
+ (jiffies - start) / HZ); \
else \
- CERROR("slow %s %lus\n", msg, (jiffies - start) / HZ); \
+ CERROR("%s: slow %s %lus\n", obd->obd_name, msg, \
+ (jiffies - start) / HZ); \
} while (0)
static inline void *fsfilt_start_log(struct obd_device *obd,
LBUG();
}
}
- fsfilt_check_slow(now, obd_timeout, "journal start");
+ fsfilt_check_slow(obd, now, obd_timeout, "journal start");
return handle;
}
LBUG();
}
}
- fsfilt_check_slow(now, obd_timeout, "journal start");
+ fsfilt_check_slow(obd, now, obd_timeout, "journal start");
return handle;
}
int rc = obd->obd_fsops->fs_commit(inode, handle, force_sync);
CDEBUG(D_INFO, "committing handle %p\n", handle);
- fsfilt_check_slow(now, obd_timeout, "journal start");
+ fsfilt_check_slow(obd, now, obd_timeout, "journal start");
return rc;
}
int rc = obd->obd_fsops->fs_commit_async(inode, handle, wait_handle);
CDEBUG(D_INFO, "committing handle %p (async)\n", *wait_handle);
- fsfilt_check_slow(now, obd_timeout, "journal start");
+ fsfilt_check_slow(obd, now, obd_timeout, "journal start");
return rc;
}
unsigned long now = jiffies;
int rc = obd->obd_fsops->fs_commit_wait(inode, handle);
CDEBUG(D_INFO, "waiting for completion %p\n", handle);
- fsfilt_check_slow(now, obd_timeout, "journal start");
+ fsfilt_check_slow(obd, now, obd_timeout, "journal start");
return rc;
}
unsigned long now = jiffies;
int rc;
rc = obd->obd_fsops->fs_setattr(dentry, handle, iattr, do_trunc);
- fsfilt_check_slow(now, obd_timeout, "setattr");
+ fsfilt_check_slow(obd, now, obd_timeout, "setattr");
return rc;
}
#endif
#endif
-#if (!defined(_LINUX_TYPES_H) && !defined(_BLKID_TYPES_H) && \
- !defined(_EXT2_TYPES_H) && !defined(_I386_TYPES_H)) && \
+#if !defined(_LINUX_TYPES_H) && !defined(_BLKID_TYPES_H) && \
+ !defined(_EXT2_TYPES_H) && !defined(_I386_TYPES_H) && \
!defined(_ASM_IA64_TYPES_H) && !defined(_X86_64_TYPES_H) && \
!defined(_PPC_TYPES_H) && !defined(_PPC64_TYPES_H)
/* yuck, would be nicer with _ASM_TYPES_H */
extern int lprocfs_write_helper(const char *buffer, unsigned long count,
int *val);
+extern int lprocfs_write_frac_helper(const char *buffer, unsigned long count,
+ int *val, int mult);
+extern int lprocfs_read_frac_helper(char *buffer, unsigned long count,
+ long val, int mult);
extern int lprocfs_write_u64_helper(const char *buffer, unsigned long count,
__u64 *val);
+extern int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count,
+ __u64 *val, int mult);
int lprocfs_obd_seq_create(struct obd_device *dev, char *name, mode_t mode,
struct file_operations *seq_fops, void *data);
void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value);
void *data, int flag);
int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp);
int ldlm_completion_ast(struct ldlm_lock *lock, int flags, void *data);
-int ldlm_cli_enqueue(struct obd_export *exp,
- struct ptlrpc_request *req,
- struct ldlm_namespace *ns,
- struct ldlm_res_id,
- ldlm_type_t type,
- ldlm_policy_data_t *,
- ldlm_mode_t mode,
- int *flags,
+int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **req,
+ struct ldlm_res_id res_id, ldlm_type_t type,
+ ldlm_policy_data_t *policy, ldlm_mode_t mode, int *flags,
ldlm_blocking_callback blocking,
ldlm_completion_callback completion,
ldlm_glimpse_callback glimpse,
- void *data,
- void *lvb,
- __u32 lvb_len,
- void *lvb_swabber,
- struct lustre_handle *lockh);
+ void *data, void *lvb, __u32 lvb_len, void *lvb_swabber,
+ struct lustre_handle *lockh, int async);
+int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
+ ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode,
+ int *flags, void *lvb, __u32 lvb_len,
+ void *lvb_swabber, struct lustre_handle *lockh,
+ int rc);
+int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, struct ldlm_res_id res_id,
+ ldlm_type_t type, ldlm_policy_data_t *policy,
+ ldlm_mode_t mode, int *flags,
+ ldlm_blocking_callback blocking,
+ ldlm_completion_callback completion,
+ ldlm_glimpse_callback glimpse,
+ void *data, __u32 lvb_len, void *lvb_swabber,
+ struct lustre_handle *lockh);
int ldlm_server_ast(struct lustre_handle *lockh, struct ldlm_lock_desc *new,
void *data, __u32 data_len);
int ldlm_cli_convert(struct lustre_handle *, int new_mode, int *flags);
int ldlm_cli_cancel(struct lustre_handle *lockh);
int ldlm_cli_cancel_unused(struct ldlm_namespace *, struct ldlm_res_id *,
int flags, void *opaque);
-int ldlm_cli_join_lru(struct ldlm_namespace *, struct ldlm_res_id *,
- int join);
+int ldlm_cli_join_lru(struct ldlm_namespace *, struct ldlm_res_id *, int join);
/* mds/handler.c */
/* This has to be here because recursive inclusion sucks. */
#define lsm_pattern lsm_wire.lw_pattern
#define lsm_stripe_count lsm_wire.lw_stripe_count
+struct obd_info;
+
+typedef int (*obd_enqueue_update_f)(struct obd_info *oinfo, int rc);
+
+/* obd_enqueue parameters common for all levels (lov, osc). */
+struct obd_enqueue_info {
+ /* Flags used while lock handling. */
+ int ei_flags;
+ /* Type of the lock being enqueued. */
+ __u32 ei_type;
+ /* Mode of the lock being enqueued. */
+ __u32 ei_mode;
+ /* Different callbacks for lock handling (blocking, completion,
+ glimpse */
+ void *ei_cb_bl;
+ void *ei_cb_cp;
+ void *ei_cb_gl;
+ /* Data to be passed into callbacks. */
+ void *ei_cbdata;
+ /* Request set for OSC async requests. */
+ struct ptlrpc_request_set *ei_rqset;
+};
+
+/* obd info for a particular level (lov, osc). */
+struct obd_info {
+ /* Lock policy. It keeps an extent which is specific for a particular
+ * OSC. (e.g. lov_prep_enqueue_set initialises extent of the policy,
+ * and osc_enqueue passes it into ldlm_lock_match & ldlm_cli_enqueue. */
+ ldlm_policy_data_t oi_policy;
+ /* Lock handle specific for every OSC lock. */
+ struct lustre_handle *oi_lockh;
+ /* lsm data specific for every OSC. */
+ struct lov_stripe_md *oi_md;
+ /* obdo data specific for every OSC, if needed at all. */
+ struct obdo *oi_oa;
+ /* statfs data specific for every OSC, if needed at all. */
+ struct obd_statfs *oi_osfs;
+ /* An update callback which is called to update some data on upper
+ * level. E.g. it is used for update lsm->lsm_oinfo at every recieved
+ * request in osc level for enqueue requests. It is also possible to
+ * update some caller data from LOV layer if needed. */
+ obd_enqueue_update_f oi_cb_up;
+};
+
/* compare all relevant fields. */
static inline int lov_stripe_md_cmp(struct lov_stripe_md *m1,
struct lov_stripe_md *m2)
int (*o_statfs)(struct obd_device *obd, struct obd_statfs *osfs,
cfs_time_t max_age);
+ int (*o_statfs_async)(struct obd_device *obd, struct obd_info *oinfo,
+ unsigned long max_age,
+ struct ptlrpc_request_set *set);
int (*o_packmd)(struct obd_export *exp, struct lov_mds_md **disk_tgt,
struct lov_stripe_md *mem_src);
int (*o_unpackmd)(struct obd_export *exp,struct lov_stripe_md **mem_tgt,
int (*o_destroy)(struct obd_export *exp, struct obdo *oa,
struct lov_stripe_md *ea, struct obd_trans_info *oti,
struct obd_export *md_exp);
- int (*o_setattr)(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *ea, struct obd_trans_info *oti);
- int (*o_setattr_async)(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *ea, struct obd_trans_info *oti);
- int (*o_getattr)(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *ea);
- int (*o_getattr_async)(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *ea,
+ int (*o_setattr)(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti);
+ int (*o_setattr_async)(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
+ struct ptlrpc_request_set *rqset);
+ int (*o_getattr)(struct obd_export *exp, struct obd_info *oinfo);
+ int (*o_getattr_async)(struct obd_export *exp, struct obd_info *oinfo,
struct ptlrpc_request_set *set);
- int (*o_brw)(int rw, struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *ea, obd_count oa_bufs,
- struct brw_page *pgarr, struct obd_trans_info *oti);
- int (*o_brw_async)(int rw, struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *ea, obd_count oa_bufs,
- struct brw_page *pgarr, struct ptlrpc_request_set *,
- struct obd_trans_info *oti);
+ int (*o_brw)(int rw, struct obd_export *exp, struct obd_info *oinfo,
+ obd_count oa_bufs, struct brw_page *pgarr,
+ struct obd_trans_info *oti);
+ int (*o_brw_async)(int rw, struct obd_export *exp,
+ struct obd_info *oinfo, obd_count oa_bufs,
+ struct brw_page *pgarr, struct obd_trans_info *oti,
+ struct ptlrpc_request_set *);
int (*o_prep_async_page)(struct obd_export *exp,
struct lov_stripe_md *lsm,
struct lov_oinfo *loi,
struct ost_lvb *lvb, int kms_only);
int (*o_adjust_kms)(struct obd_export *exp, struct lov_stripe_md *lsm,
obd_off size, int shrink);
- int (*o_punch)(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *ea, obd_size start,
- obd_size end, struct obd_trans_info *oti);
+ int (*o_punch)(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
+ struct ptlrpc_request_set *rqset);
int (*o_sync)(struct obd_export *exp, struct obdo *oa,
struct lov_stripe_md *ea, obd_size start, obd_size end);
int (*o_migrate)(struct lustre_handle *conn, struct lov_stripe_md *dst,
int objcount, struct obd_ioobj *obj,
int niocount, struct niobuf_local *local,
struct obd_trans_info *oti, int rc);
- int (*o_enqueue)(struct obd_export *, struct lov_stripe_md *,
- __u32 type, ldlm_policy_data_t *, __u32 mode,
- int *flags, void *bl_cb, void *cp_cb, void *gl_cb,
- void *data, __u32 lvb_len, void *lvb_swabber,
- struct lustre_handle *lockh);
+ int (*o_enqueue)(struct obd_export *, struct obd_info *oinfo,
+ struct obd_enqueue_info *einfo);
int (*o_match)(struct obd_export *, struct lov_stripe_md *, __u32 type,
ldlm_policy_data_t *, __u32 mode, int *flags, void *data,
struct lustre_handle *lockh);
RETURN(rc);
}
-static inline int obd_getattr(struct obd_export *exp, struct obdo *obdo,
- struct lov_stripe_md *ea)
+static inline int obd_getattr(struct obd_export *exp, struct obd_info *oinfo)
{
int rc;
ENTRY;
EXP_CHECK_OP(exp, getattr);
OBD_COUNTER_INCREMENT(exp->exp_obd, getattr);
- rc = OBP(exp->exp_obd, getattr)(exp, obdo, ea);
+ rc = OBP(exp->exp_obd, getattr)(exp, oinfo);
RETURN(rc);
}
static inline int obd_getattr_async(struct obd_export *exp,
- struct obdo *obdo, struct lov_stripe_md *ea,
+ struct obd_info *oinfo,
struct ptlrpc_request_set *set)
{
int rc;
ENTRY;
- EXP_CHECK_OP(exp, getattr);
- OBD_COUNTER_INCREMENT(exp->exp_obd, getattr);
+ EXP_CHECK_OP(exp, getattr_async);
+ OBD_COUNTER_INCREMENT(exp->exp_obd, getattr_async);
- rc = OBP(exp->exp_obd, getattr_async)(exp, obdo, ea, set);
+ rc = OBP(exp->exp_obd, getattr_async)(exp, oinfo, set);
RETURN(rc);
}
-static inline int obd_setattr(struct obd_export *exp, struct obdo *obdo,
- struct lov_stripe_md *ea,
+static inline int obd_setattr(struct obd_export *exp, struct obd_info *oinfo,
struct obd_trans_info *oti)
{
int rc;
EXP_CHECK_OP(exp, setattr);
OBD_COUNTER_INCREMENT(exp->exp_obd, setattr);
- rc = OBP(exp->exp_obd, setattr)(exp, obdo, ea, oti);
+ rc = OBP(exp->exp_obd, setattr)(exp, oinfo, oti);
RETURN(rc);
}
-static inline int obd_setattr_async(struct obd_export *exp,
- struct obdo *obdo,
- struct lov_stripe_md *ea,
+/* This performs all the requests set init/wait/destroy actions. */
+static inline int obd_setattr_rqset(struct obd_export *exp,
+ struct obd_info *oinfo,
struct obd_trans_info *oti)
{
+ struct ptlrpc_request_set *set = NULL;
int rc;
ENTRY;
EXP_CHECK_OP(exp, setattr_async);
OBD_COUNTER_INCREMENT(exp->exp_obd, setattr_async);
- rc = OBP(exp->exp_obd, setattr_async)(exp, obdo, ea, oti);
+ set = ptlrpc_prep_set();
+ if (set == NULL)
+ RETURN(-ENOMEM);
+
+ rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set);
+ if (rc == 0)
+ rc = ptlrpc_set_wait(set);
+ ptlrpc_set_destroy(set);
+ RETURN(rc);
+}
+
+/* This adds all the requests into @set if @set != NULL, otherwise
+ all requests are sent asynchronously without waiting for response. */
+static inline int obd_setattr_async(struct obd_export *exp,
+ struct obd_info *oinfo,
+ struct obd_trans_info *oti,
+ struct ptlrpc_request_set *set)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_OP(exp, setattr_async);
+ OBD_COUNTER_INCREMENT(exp->exp_obd, setattr_async);
+
+ rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set);
RETURN(rc);
}
/* @max_age is the oldest time in jiffies that we accept using a cached data.
* If the cache is older than @max_age we will get a new value from the
* target. Use a value of "jiffies + HZ" to guarantee freshness. */
+static inline int obd_statfs_async(struct obd_device *obd,
+ struct obd_info *oinfo,
+ unsigned long max_age,
+ struct ptlrpc_request_set *rqset)
+{
+ int rc = 0;
+ ENTRY;
+
+ if (obd == NULL)
+ RETURN(-EINVAL);
+
+ OBD_CHECK_OP(obd, statfs, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, statfs);
+
+ CDEBUG(D_SUPER, "osfs %lu, max_age %lu\n", obd->obd_osfs_age, max_age);
+ if (time_before(obd->obd_osfs_age, max_age)) {
+ rc = OBP(obd, statfs_async)(obd, oinfo, max_age, rqset);
+ } else {
+ CDEBUG(D_SUPER, "using cached obd_statfs data\n");
+ spin_lock(&obd->obd_osfs_lock);
+ memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs));
+ spin_unlock(&obd->obd_osfs_lock);
+ if (oinfo->oi_cb_up)
+ oinfo->oi_cb_up(oinfo, 0);
+ }
+ RETURN(rc);
+}
+
+static inline int obd_statfs_rqset(struct obd_device *obd,
+ struct obd_statfs *osfs,
+ unsigned long max_age)
+{
+ struct ptlrpc_request_set *set = NULL;
+ struct obd_info oinfo = { { { 0 } } };
+ int rc = 0;
+ ENTRY;
+
+ set = ptlrpc_prep_set();
+ if (set == NULL)
+ RETURN(-ENOMEM);
+
+ oinfo.oi_osfs = osfs;
+ rc = obd_statfs_async(obd, &oinfo, max_age, set);
+ if (rc == 0)
+ rc = ptlrpc_set_wait(set);
+ ptlrpc_set_destroy(set);
+ RETURN(rc);
+}
+
+/* @max_age is the oldest time in jiffies that we accept using a cached data.
+ * If the cache is older than @max_age we will get a new value from the
+ * target. Use a value of "jiffies + HZ" to guarantee freshness. */
static inline int obd_statfs(struct obd_device *obd, struct obd_statfs *osfs,
cfs_time_t max_age)
{
RETURN(rc);
}
-static inline int obd_punch(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *ea, obd_size start,
- obd_size end, struct obd_trans_info *oti)
+static inline int obd_punch_rqset(struct obd_export *exp,
+ struct obd_info *oinfo,
+ struct obd_trans_info *oti)
{
+ struct ptlrpc_request_set *set = NULL;
int rc;
ENTRY;
EXP_CHECK_OP(exp, punch);
OBD_COUNTER_INCREMENT(exp->exp_obd, punch);
- rc = OBP(exp->exp_obd, punch)(exp, oa, ea, start, end, oti);
+ set = ptlrpc_prep_set();
+ if (set == NULL)
+ RETURN(-ENOMEM);
+
+ rc = OBP(exp->exp_obd, punch)(exp, oinfo, oti, set);
+ if (rc == 0)
+ rc = ptlrpc_set_wait(set);
+ ptlrpc_set_destroy(set);
RETURN(rc);
}
-static inline int obd_brw(int cmd, struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *ea, obd_count oa_bufs,
+static inline int obd_punch(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
+ struct ptlrpc_request_set *rqset)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_OP(exp, punch);
+ OBD_COUNTER_INCREMENT(exp->exp_obd, punch);
+
+ rc = OBP(exp->exp_obd, punch)(exp, oinfo, oti, rqset);
+ RETURN(rc);
+}
+
+static inline int obd_brw(int cmd, struct obd_export *exp,
+ struct obd_info *oinfo, obd_count oa_bufs,
struct brw_page *pg, struct obd_trans_info *oti)
{
int rc;
LBUG();
}
- rc = OBP(exp->exp_obd, brw)(cmd, exp, oa, ea, oa_bufs, pg, oti);
+ rc = OBP(exp->exp_obd, brw)(cmd, exp, oinfo, oa_bufs, pg, oti);
RETURN(rc);
}
static inline int obd_brw_async(int cmd, struct obd_export *exp,
- struct obdo *oa, struct lov_stripe_md *ea,
- obd_count oa_bufs, struct brw_page *pg,
- struct ptlrpc_request_set *set,
- struct obd_trans_info *oti)
+ struct obd_info *oinfo, obd_count oa_bufs,
+ struct brw_page *pg, struct obd_trans_info *oti,
+ struct ptlrpc_request_set *set)
{
int rc;
ENTRY;
LBUG();
}
- rc = OBP(exp->exp_obd, brw_async)(cmd, exp, oa, ea, oa_bufs, pg, set,
- oti);
+ rc = OBP(exp->exp_obd, brw_async)(cmd, exp, oinfo, oa_bufs, pg,oti,set);
+ RETURN(rc);
+}
+
+static inline int obd_brw_rqset(int cmd, struct obd_export *exp,
+ struct obdo *oa, struct lov_stripe_md *lsm,
+ obd_count oa_bufs, struct brw_page *pg,
+ struct obd_trans_info *oti)
+{
+ struct ptlrpc_request_set *set = NULL;
+ struct obd_info oinfo = { { { 0 } } };
+ int rc = 0;
+ ENTRY;
+
+ set = ptlrpc_prep_set();
+ if (set == NULL)
+ RETURN(-ENOMEM);
+
+ oinfo.oi_oa = oa;
+ oinfo.oi_md = lsm;
+ rc = obd_brw_async(cmd, exp, &oinfo, oa_bufs, pg, oti, set);
+ if (rc == 0) {
+ rc = ptlrpc_set_wait(set);
+ if (rc)
+ CERROR("error from callback: rc = %d\n", rc);
+ } else {
+ CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR,
+ "error from obd_brw_async: rc = %d\n", rc);
+ }
+ ptlrpc_set_destroy(set);
RETURN(rc);
}
RETURN(rc);
}
-static inline int obd_enqueue(struct obd_export *exp, struct lov_stripe_md *ea,
- __u32 type, ldlm_policy_data_t *policy,
- __u32 mode, int *flags, void *bl_cb, void *cp_cb,
- void *gl_cb, void *data, __u32 lvb_len,
- void *lvb_swabber, struct lustre_handle *lockh)
+static inline int obd_enqueue_rqset(struct obd_export *exp,
+ struct obd_info *oinfo,
+ struct obd_enqueue_info *einfo)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_OP(exp, enqueue);
+ OBD_COUNTER_INCREMENT(exp->exp_obd, enqueue);
+
+ einfo->ei_rqset = ptlrpc_prep_set();
+ if (einfo->ei_rqset == NULL)
+ RETURN(-ENOMEM);
+
+ rc = OBP(exp->exp_obd, enqueue)(exp, oinfo, einfo);
+ if (rc == 0)
+ rc = ptlrpc_set_wait(einfo->ei_rqset);
+ ptlrpc_set_destroy(einfo->ei_rqset);
+ einfo->ei_rqset = NULL;
+
+ RETURN(rc);
+}
+
+static inline int obd_enqueue(struct obd_export *exp,
+ struct obd_info *oinfo,
+ struct obd_enqueue_info *einfo)
{
int rc;
ENTRY;
EXP_CHECK_OP(exp, enqueue);
OBD_COUNTER_INCREMENT(exp->exp_obd, enqueue);
- rc = OBP(exp->exp_obd, enqueue)(exp, ea, type, policy, mode, flags,
- bl_cb, cp_cb, gl_cb, data, lvb_len,
- lvb_swabber, lockh);
+ rc = OBP(exp->exp_obd, enqueue)(exp, oinfo, einfo);
RETURN(rc);
}
struct list_head aa_oaps;
};
-struct osc_getattr_async_args {
- struct obdo *aa_oa;
+struct osc_async_args {
+ struct obd_info *aa_oi;
+};
+
+struct osc_enqueue_args {
+ struct obd_export *oa_exp;
+ struct obd_info *oa_oi;
+ struct obd_enqueue_info *oa_ei;
};
#endif
ext3_xattr_put_super(sb);
journal_destroy(sbi->s_journal);
if (!(sb->s_flags & MS_RDONLY)) {
-@@ -755,6 +756,10 @@
+@@ -755,6 +756,12 @@
return 0;
}
}
+ else if (!strcmp (this_char, "extents"))
+ set_opt (*mount_options, EXTENTS);
++ else if (!strcmp (this_char, "noextents"))
++ clear_opt (*mount_options, EXTENTS);
+ else if (!strcmp (this_char, "extdebug"))
+ set_opt (*mount_options, EXTDEBUG);
else if (!strcmp (this_char, "grpid") ||
ext3_xattr_put_super(sb);
journal_destroy(sbi->s_journal);
if (!(sb->s_flags & MS_RDONLY)) {
-@@ -733,6 +734,10 @@
+@@ -733,6 +734,12 @@
return 0;
}
}
+ else if (!strcmp (this_char, "extents"))
+ set_opt (*mount_options, EXTENTS);
++ else if (!strcmp (this_char, "noextents"))
++ clear_opt (*mount_options, EXTENTS);
+ else if (!strcmp (this_char, "extdebug"))
+ set_opt (*mount_options, EXTDEBUG);
else if (!strcmp (this_char, "grpid") ||
ext3_xattr_put_super(sb);
journal_destroy(sbi->s_journal);
if (!(sb->s_flags & MS_RDONLY)) {
-@@ -704,6 +705,10 @@
+@@ -704,6 +705,12 @@
return 0;
}
}
+ else if (!strcmp (this_char, "extents"))
+ set_opt (*mount_options, EXTENTS);
++ else if (!strcmp (this_char, "noextents"))
++ clear_opt (*mount_options, EXTENTS);
+ else if (!strcmp (this_char, "extdebug"))
+ set_opt (*mount_options, EXTDEBUG);
else if (!strcmp (this_char, "grpid") ||
ext3_xattr_put_super(sb);
journal_destroy(sbi->s_journal);
if (!(sb->s_flags & MS_RDONLY)) {
-@@ -702,6 +703,10 @@
+@@ -702,6 +703,12 @@
return 0;
}
}
+ else if (!strcmp (this_char, "extents"))
+ set_opt (*mount_options, EXTENTS);
++ else if (!strcmp (this_char, "noextents"))
++ clear_opt (*mount_options, EXTENTS);
+ else if (!strcmp (this_char, "extdebug"))
+ set_opt (*mount_options, EXTDEBUG);
else if (!strcmp (this_char, "grpid") ||
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-+ Opt_extents, Opt_extdebug,
++ Opt_extents, Opt_noextents, Opt_extdebug,
};
static match_table_t tokens = {
-@@ -644,6 +647,8 @@
+@@ -644,6 +647,9 @@
{Opt_iopen, "iopen"},
{Opt_noiopen, "noiopen"},
{Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_extents, "extents"},
++ {Opt_noextents, "noextents"},
+ {Opt_extdebug, "extdebug"},
{Opt_barrier, "barrier=%u"},
{Opt_err, NULL},
{Opt_resize, "resize"},
-@@ -953,6 +958,12 @@
+@@ -953,6 +958,15 @@
case Opt_nobh:
set_opt(sbi->s_mount_opt, NOBH);
break;
+ case Opt_extents:
+ set_opt (sbi->s_mount_opt, EXTENTS);
+ break;
++ case Opt_noextents:
++ clear_opt (sbi->s_mount_opt, EXTENTS);
++ break;
+ case Opt_extdebug:
+ set_opt (sbi->s_mount_opt, EXTDEBUG);
+ break;
Opt_ignore, Opt_barrier,
Opt_err,
Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-+ Opt_extents, Opt_extdebug,
++ Opt_extents, Opt_noextents, Opt_extdebug,
};
static match_table_t tokens = {
-@@ -582,6 +585,8 @@
+@@ -582,6 +585,9 @@
{Opt_iopen, "iopen"},
{Opt_noiopen, "noiopen"},
{Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_extents, "extents"},
++ {Opt_noextents, "noextents"},
+ {Opt_extdebug, "extdebug"},
{Opt_barrier, "barrier=%u"},
{Opt_err, NULL}
};
-@@ -797,6 +802,12 @@
+@@ -797,6 +802,15 @@
break;
case Opt_ignore:
break;
+ case Opt_extents:
+ set_opt (sbi->s_mount_opt, EXTENTS);
+ break;
++ case Opt_noextents:
++ clear_opt (sbi->s_mount_opt, EXTENTS);
++ break;
+ case Opt_extdebug:
+ set_opt (sbi->s_mount_opt, EXTDEBUG);
+ break;
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-+ Opt_extents, Opt_extdebug,
++ Opt_extents, Opt_noextents, Opt_extdebug,
};
static match_table_t tokens = {
-@@ -639,6 +644,8 @@
+@@ -639,6 +644,9 @@
{Opt_iopen, "iopen"},
{Opt_noiopen, "noiopen"},
{Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_extents, "extents"},
++ {Opt_noextents, "noextents"},
+ {Opt_extdebug, "extdebug"},
{Opt_barrier, "barrier=%u"},
{Opt_err, NULL},
{Opt_resize, "resize"},
-@@ -943,6 +950,12 @@
+@@ -943,6 +950,15 @@
match_int(&args[0], &option);
*n_blocks_count = option;
break;
+ case Opt_extents:
+ set_opt (sbi->s_mount_opt, EXTENTS);
+ break;
++ case Opt_noextents:
++ clear_opt (sbi->s_mount_opt, EXTENTS);
++ break;
+ case Opt_extdebug:
+ set_opt (sbi->s_mount_opt, EXTDEBUG);
+ break;
extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
- unsigned long);
+ unsigned long, int);
-+extern void ext3_free_blocks_old (handle_t *, struct inode *, unsigned long,
-+ unsigned long);
++extern void ext3_free_blocks_old(handle_t *, struct inode *, unsigned long,
++ unsigned long);
extern unsigned long ext3_count_free_blocks (struct super_block *);
extern void ext3_check_blocks_bitmap (struct super_block *);
extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
/*
* third extended-fs super-block data in memory
-@@ -78,6 +84,38 @@ struct ext3_sb_info {
+@@ -78,6 +84,43 @@ struct ext3_sb_info {
struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
#endif
+
+ /* for buddy allocator */
-+ struct ext3_group_info **s_group_info;
++ struct ext3_group_info ***s_group_info;
+ struct inode *s_buddy_cache;
+ long s_blocks_reserved;
+ spinlock_t s_reserve_lock;
+ tid_t s_last_transaction;
+ int s_mb_factor;
+ unsigned short *s_mb_offsets, *s_mb_maxs;
++ unsigned long s_stripe;
+
+ /* history to debug policy */
+ struct ext3_mb_history *s_mb_history;
+ unsigned long s_mb_buddies_generated;
+ unsigned long long s_mb_generation_time;
};
++
++#define EXT3_GROUP_INFO(sb, group) \
++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \
++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)]
#endif /* _LINUX_EXT3_FS_SB */
Index: linux-2.6.5-7.252-full/fs/ext3/super.c
ext3_ext_release(sb);
ext3_xattr_put_super(sb);
journal_destroy(sbi->s_journal);
-@@ -545,7 +546,7 @@ enum {
- Opt_ignore, Opt_barrier,
+@@ -545,6 +546,7 @@ enum {
Opt_err,
Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-- Opt_extents, Opt_extdebug,
-+ Opt_extents, Opt_extdebug, Opt_mballoc,
+ Opt_extents, Opt_noextents, Opt_extdebug,
++ Opt_mballoc, Opt_nomballoc, Opt_stripe,
};
static match_table_t tokens = {
-@@ -591,6 +592,7 @@ static match_table_t tokens = {
- {Opt_iopen_nopriv, "iopen_nopriv"},
+@@ -591,6 +592,9 @@ static match_table_t tokens = {
{Opt_extents, "extents"},
+ {Opt_noextents, "noextents"},
{Opt_extdebug, "extdebug"},
+ {Opt_mballoc, "mballoc"},
++ {Opt_nomballoc, "nomballoc"},
++ {Opt_stripe, "stripe=%u"},
{Opt_barrier, "barrier=%u"},
{Opt_err, NULL}
};
-@@ -813,6 +815,9 @@ static int parse_options (char * options
+@@ -813,6 +815,19 @@ static int parse_options (char * options
case Opt_extdebug:
set_opt (sbi->s_mount_opt, EXTDEBUG);
break;
+ case Opt_mballoc:
-+ set_opt (sbi->s_mount_opt, MBALLOC);
++ set_opt(sbi->s_mount_opt, MBALLOC);
++ break;
++ case Opt_nomballoc:
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ break;
++ case Opt_stripe:
++ if (match_int(&args[0], &option))
++ return 0;
++ if (option < 0)
++ return 0;
++ sbi->s_stripe = option;
+ break;
default:
printk (KERN_ERR
===================================================================
--- linux-2.6.5-7.252-full.orig/fs/ext3/mballoc.c 2006-04-22 17:31:47.543334750 +0400
+++ linux-2.6.5-7.252-full/fs/ext3/mballoc.c 2006-04-26 23:42:45.000000000 +0400
-@@ -0,0 +1,2616 @@
+@@ -0,0 +1,2703 @@
+/*
+ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+
+ /* search goals */
+ struct ext3_free_extent ac_g_ex;
-+
++
+ /* the best found extent */
+ struct ext3_free_extent ac_b_ex;
-+
++
+ /* number of iterations done. we have to track to limit searching */
+ unsigned long ac_ex_scanned;
+ __u16 ac_groups_scanned;
+ if (mb_check_counter++ % 300 != 0)
+ return;
+ }
-+
++
+ while (order > 1) {
+ buddy = mb_find_buddy(e3b, order, &max);
+ J_ASSERT(buddy);
+ sb = inode->i_sb;
+ blocksize = 1 << inode->i_blkbits;
+ blocks_per_page = PAGE_CACHE_SIZE / blocksize;
-+
++
+ groups_per_page = blocks_per_page >> 1;
+ if (groups_per_page == 0)
+ groups_per_page = 1;
+ memset(bh, 0, i);
+ } else
+ bh = &bhs;
-+
++
+ first_group = page->index * blocks_per_page / 2;
-+
++
+ /* read all groups the page covers into the cache */
+ for (i = 0; i < groups_per_page; i++) {
+ struct ext3_group_desc * desc;
+ mb_debug("put buddy for group %u in page %lu/%x\n",
+ group, page->index, i * blocksize);
+ memset(data, 0xff, blocksize);
-+ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0;
-+ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0,
++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0;
++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0,
+ sizeof(unsigned short)*(sb->s_blocksize_bits+2));
+ ext3_mb_generate_buddy(sb, data, bitmap,
-+ EXT3_SB(sb)->s_group_info[group]);
++ EXT3_GROUP_INFO(sb, group));
+ } else {
+ /* this is block of bitmap */
+ mb_debug("put bitmap for group %u in page %lu/%x\n",
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+
+ e3b->bd_blkbits = sb->s_blocksize_bits;
-+ e3b->bd_info = sbi->s_group_info[group];
++ e3b->bd_info = EXT3_GROUP_INFO(sb, group);
+ e3b->bd_sb = sb;
+ e3b->bd_group = group;
+ e3b->bd_buddy_page = NULL;
+ext3_lock_group(struct super_block *sb, int group)
+{
+ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT,
-+ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++ &EXT3_GROUP_INFO(sb, group)->bb_state);
+}
+
+static inline void
+ext3_unlock_group(struct super_block *sb, int group)
+{
+ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT,
-+ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++ &EXT3_GROUP_INFO(sb, group)->bb_state);
+}
+
+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
+
+ ext3_lock_group(ac->ac_sb, group);
+ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex);
-+
++
+ if (max > 0) {
+ ac->ac_b_ex = ex;
+ ext3_mb_use_best_found(ac, e3b);
+ struct ext3_buddy *e3b)
+{
+ int group = ac->ac_g_ex.fe_group, max, err;
++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
++ struct ext3_super_block *es = sbi->s_es;
+ struct ext3_free_extent ex;
+
+ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
+
+ ext3_lock_group(ac->ac_sb, group);
+ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start,
-+ ac->ac_g_ex.fe_len, &ex);
-+
-+ if (max >= ac->ac_g_ex.fe_len) {
++ ac->ac_g_ex.fe_len, &ex);
++
++ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
++ unsigned long start;
++ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) +
++ ex.fe_start + le32_to_cpu(es->s_first_data_block));
++ if (start % sbi->s_stripe == 0) {
++ ac->ac_found++;
++ ac->ac_b_ex = ex;
++ ext3_mb_use_best_found(ac, e3b);
++ }
++ } else if (max >= ac->ac_g_ex.fe_len) {
+ J_ASSERT(ex.fe_len > 0);
+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group);
+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start);
+ }
+}
+
++/*
++ * This is a special case for storage like raid5:
++ * we try to find stripe-aligned chunks for stripe-sized requests.
++ */
++static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ struct super_block *sb = ac->ac_sb;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ void *bitmap = EXT3_MB_BITMAP(e3b);
++ struct ext3_free_extent ex;
++ unsigned long i, max;
++
++ J_ASSERT(sbi->s_stripe != 0);
++
++ /* find first stripe-aligned block */
++ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb)
++ + le32_to_cpu(sbi->s_es->s_first_data_block);
++ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe;
++ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block))
++ % EXT3_BLOCKS_PER_GROUP(sb);
++
++ while (i < sb->s_blocksize * 8) {
++ if (!mb_test_bit(i, bitmap)) {
++ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex);
++ if (max >= sbi->s_stripe) {
++ ac->ac_found++;
++ ac->ac_b_ex = ex;
++ ext3_mb_use_best_found(ac, e3b);
++ break;
++ }
++ }
++ i += sbi->s_stripe;
++ }
++}
++
+static int ext3_mb_good_group(struct ext3_allocation_context *ac,
+ int group, int cr)
+{
-+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
-+ struct ext3_group_info *grp = sbi->s_group_info[group];
++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group);
+ unsigned free, fragments, i, bits;
+
+ J_ASSERT(cr >= 0 && cr < 4);
+ ac.ac_2order = 0;
+ ac.ac_criteria = 0;
+
++ if (*len == 1 && sbi->s_stripe) {
++		/* looks like metadata; use a dirty hack for raid5:
++		 * place all metadata in the first groups in the hope of
++		 * hitting cached sectors and avoiding read-modify cycles */
++ ac.ac_g_ex.fe_group = group = 0;
++ }
++
+ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */
+ i = ffs(*len);
+ if (i >= ext3_mb_order2_reqs) {
+ if (group == EXT3_SB(sb)->s_groups_count)
+ group = 0;
+
-+ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) {
++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) {
+ /* we need full data about the group
+ * to make a good selection */
+ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
+ ac.ac_groups_scanned++;
+ if (cr == 0)
+ ext3_mb_simple_scan_group(&ac, &e3b);
++ else if (cr == 1 && *len == sbi->s_stripe)
++ ext3_mb_scan_aligned(&ac, &e3b);
+ else
+ ext3_mb_complex_scan_group(&ac, &e3b);
+
+ }
+
+ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND &&
-+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
+ /*
+ * We've been searching too long. Let's try to allocate
+ * the best chunk we've found so far
+ sbi->s_blocks_reserved, ac.ac_found);
+ printk("EXT3-fs: groups: ");
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
-+ printk("%d: %d ", i,
-+ sbi->s_group_info[i]->bb_free);
++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free);
+ printk("\n");
+#endif
+ goto out;
+ *errp = -EIO;
+ goto out_err;
+ }
-+
++
+ err = ext3_journal_get_write_access(handle, gdp_bh);
+ if (err)
+ goto out_err;
+ * path only, here is single block always */
+ ext3_mb_release_blocks(sb, 1);
+ }
-+
++
+ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) {
+ atomic_inc(&sbi->s_bal_reqs);
+ atomic_add(*len, &sbi->s_bal_allocated);
+ s->max = sbi->s_mb_history_max;
+ s->start = sbi->s_mb_history_cur % s->max;
+ spin_unlock(&sbi->s_mb_history_lock);
-+
++
+ rc = seq_open(file, &ext3_mb_seq_history_ops);
+ if (rc == 0) {
+ struct seq_file *m = (struct seq_file *)file->private_data;
+
+static struct file_operations ext3_mb_seq_history_fops = {
+ .owner = THIS_MODULE,
-+ .open = ext3_mb_seq_history_open,
-+ .read = seq_read,
-+ .llseek = seq_lseek,
-+ .release = ext3_mb_seq_history_release,
++ .open = ext3_mb_seq_history_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = ext3_mb_seq_history_release,
+};
+
+static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
+ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
+ sizeof(struct ext3_group_info);
+ ext3_lock_group(sb, group);
-+ memcpy(&sg, sbi->s_group_info[group], i);
++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i);
+ ext3_unlock_group(sb, group);
+
+ if (EXT3_MB_GRP_NEED_INIT(&sg.info))
+
+static struct file_operations ext3_mb_seq_groups_fops = {
+ .owner = THIS_MODULE,
-+ .open = ext3_mb_seq_groups_open,
-+ .read = seq_read,
-+ .llseek = seq_lseek,
-+ .release = seq_release,
++ .open = ext3_mb_seq_groups_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = seq_release,
+};
+
+static void ext3_mb_history_release(struct super_block *sb)
+int ext3_mb_init_backend(struct super_block *sb)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int i, len;
-+
-+ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count;
-+ sbi->s_group_info = kmalloc(len, GFP_KERNEL);
++ int i, j, len, metalen;
++ int num_meta_group_infos =
++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >>
++ EXT3_DESC_PER_BLOCK_BITS(sb);
++ struct ext3_group_info **meta_group_info;
++
++ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
++ * So a two level scheme suffices for now. */
++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
++ num_meta_group_infos, GFP_KERNEL);
+ if (sbi->s_group_info == NULL) {
-+ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n");
++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n");
+ return -ENOMEM;
+ }
-+ memset(sbi->s_group_info, 0, len);
-+
+ sbi->s_buddy_cache = new_inode(sb);
+ if (sbi->s_buddy_cache == NULL) {
+ printk(KERN_ERR "EXT3-fs: can't get new inode\n");
-+ kfree(sbi->s_group_info);
-+ return -ENOMEM;
++ goto err_freesgi;
++ }
++
++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb);
++ for (i = 0; i < num_meta_group_infos; i++) {
++ if ((i + 1) == num_meta_group_infos)
++ metalen = sizeof(*meta_group_info) *
++ (sbi->s_groups_count -
++ (i << EXT3_DESC_PER_BLOCK_BITS(sb)));
++ meta_group_info = kmalloc(metalen, GFP_KERNEL);
++ if (meta_group_info == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a "
++ "buddy group\n");
++ goto err_freemeta;
++ }
++ sbi->s_group_info[i] = meta_group_info;
+ }
+
+ /*
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ struct ext3_group_desc * desc;
+
-+ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL);
-+ if (sbi->s_group_info[i] == NULL) {
++ meta_group_info =
++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)];
++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1);
++
++ meta_group_info[j] = kmalloc(len, GFP_KERNEL);
++ if (meta_group_info[j] == NULL) {
+ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n");
-+ goto err_out;
++ i--;
++ goto err_freebuddy;
+ }
+ desc = ext3_get_group_desc(sb, i, NULL);
+ if (desc == NULL) {
+ printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i);
-+ goto err_out;
++ goto err_freebuddy;
+ }
-+ memset(sbi->s_group_info[i], 0, len);
++ memset(meta_group_info[j], 0, len);
+ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT,
-+ &sbi->s_group_info[i]->bb_state);
-+ sbi->s_group_info[i]->bb_free =
++ &meta_group_info[j]->bb_state);
++ meta_group_info[j]->bb_free =
+ le16_to_cpu(desc->bg_free_blocks_count);
+ }
+
+ return 0;
+
-+err_out:
++err_freebuddy:
++ while (i >= 0) {
++ kfree(EXT3_GROUP_INFO(sb, i));
++ i--;
++ }
++ i = num_meta_group_infos;
++err_freemeta:
+ while (--i >= 0)
+ kfree(sbi->s_group_info[i]);
+ iput(sbi->s_buddy_cache);
-+
++err_freesgi:
++ kfree(sbi->s_group_info);
+ return -ENOMEM;
+}
+
+ max = max >> 1;
+ i++;
+ } while (i <= sb->s_blocksize_bits + 1);
-+
++
+
+ /* init file for buddy data */
+ if ((i = ext3_mb_init_backend(sb))) {
+int ext3_mb_release(struct super_block *sb)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int i;
-+
++ int i, num_meta_group_infos;
++
+ if (!test_opt(sb, MBALLOC))
+ return 0;
+
+ ext3_mb_free_committed_blocks(sb);
+
+ if (sbi->s_group_info) {
-+ for (i = 0; i < sbi->s_groups_count; i++) {
-+ if (sbi->s_group_info[i] == NULL)
-+ continue;
++ for (i = 0; i < sbi->s_groups_count; i++)
++ kfree(EXT3_GROUP_INFO(sb, i));
++ num_meta_group_infos = (sbi->s_groups_count +
++ EXT3_DESC_PER_BLOCK(sb) - 1) >>
++ EXT3_DESC_PER_BLOCK_BITS(sb);
++ for (i = 0; i < num_meta_group_infos; i++)
+ kfree(sbi->s_group_info[i]);
-+ }
+ kfree(sbi->s_group_info);
+ }
+ if (sbi->s_mb_offsets)
+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
+ spin_unlock(sb_bgl_lock(sbi, block_group));
+ percpu_counter_mod(&sbi->s_freeblocks_counter, count);
-+
++
+ ext3_mb_release_desc(&e3b);
+
+ *freed = count;
+ return;
+}
+
-+#define EXT3_ROOT "ext3"
-+#define EXT3_MB_STATS_NAME "mb_stats"
++#define EXT3_ROOT "ext3"
++#define EXT3_MB_STATS_NAME "mb_stats"
+#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan"
+#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan"
-+#define EXT3_MB_ORDER2_REQ "mb_order2_req"
++#define EXT3_MB_ORDER2_REQ "mb_order2_req"
+
+static int ext3_mb_stats_read(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
/*
* third extended-fs super-block data in memory
-@@ -78,6 +84,38 @@ struct ext3_sb_info {
+@@ -78,6 +84,43 @@ struct ext3_sb_info {
char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
int s_jquota_fmt; /* Format of quota to use */
#endif
+
+ /* for buddy allocator */
-+ struct ext3_group_info **s_group_info;
++ struct ext3_group_info ***s_group_info;
+ struct inode *s_buddy_cache;
+ long s_blocks_reserved;
+ spinlock_t s_reserve_lock;
+ tid_t s_last_transaction;
+ int s_mb_factor;
+ unsigned short *s_mb_offsets, *s_mb_maxs;
++ unsigned long s_stripe;
+
+ /* history to debug policy */
+ struct ext3_mb_history *s_mb_history;
+ unsigned long s_mb_buddies_generated;
+ unsigned long long s_mb_generation_time;
};
++
++#define EXT3_GROUP_INFO(sb, group) \
++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \
++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)]
#endif /* _LINUX_EXT3_FS_SB */
Index: linux-2.6.12.6-bull/fs/ext3/super.c
ext3_ext_release(sb);
ext3_xattr_put_super(sb);
journal_destroy(sbi->s_journal);
-@@ -597,7 +598,7 @@ enum {
- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
+@@ -597,6 +598,7 @@ enum {
Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-- Opt_extents, Opt_extdebug,
-+ Opt_extents, Opt_extdebug, Opt_mballoc,
+ Opt_extents, Opt_noextents, Opt_extdebug,
++ Opt_mballoc, Opt_nomballoc, Opt_stripe,
};
static match_table_t tokens = {
-@@ -650,6 +651,7 @@ static match_table_t tokens = {
- {Opt_iopen_nopriv, "iopen_nopriv"},
+@@ -650,6 +651,9 @@ static match_table_t tokens = {
{Opt_extents, "extents"},
+ {Opt_noextents, "noextents"},
{Opt_extdebug, "extdebug"},
+ {Opt_mballoc, "mballoc"},
++ {Opt_nomballoc, "nomballoc"},
++ {Opt_stripe, "stripe=%u"},
{Opt_barrier, "barrier=%u"},
{Opt_err, NULL},
{Opt_resize, "resize"},
-@@ -965,6 +967,9 @@ clear_qf_name:
+@@ -965,6 +967,19 @@ clear_qf_name:
case Opt_extdebug:
set_opt (sbi->s_mount_opt, EXTDEBUG);
break;
+ case Opt_mballoc:
-+ set_opt (sbi->s_mount_opt, MBALLOC);
++ set_opt(sbi->s_mount_opt, MBALLOC);
++ break;
++ case Opt_nomballoc:
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ break;
++ case Opt_stripe:
++ if (match_int(&args[0], &option))
++ return 0;
++ if (option < 0)
++ return 0;
++ sbi->s_stripe = option;
+ break;
default:
printk (KERN_ERR
===================================================================
--- linux-2.6.12.6-bull.orig/fs/ext3/mballoc.c 2006-04-22 17:31:47.543334750 +0400
+++ linux-2.6.12.6-bull/fs/ext3/mballoc.c 2006-04-30 01:24:11.000000000 +0400
-@@ -0,0 +1,2615 @@
+@@ -0,0 +1,2702 @@
+/*
+ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+
+ /* search goals */
+ struct ext3_free_extent ac_g_ex;
-+
++
+ /* the best found extent */
+ struct ext3_free_extent ac_b_ex;
-+
++
+ /* number of iterations done. we have to track to limit searching */
+ unsigned long ac_ex_scanned;
+ __u16 ac_groups_scanned;
+ if (mb_check_counter++ % 300 != 0)
+ return;
+ }
-+
++
+ while (order > 1) {
+ buddy = mb_find_buddy(e3b, order, &max);
+ J_ASSERT(buddy);
+ sb = inode->i_sb;
+ blocksize = 1 << inode->i_blkbits;
+ blocks_per_page = PAGE_CACHE_SIZE / blocksize;
-+
++
+ groups_per_page = blocks_per_page >> 1;
+ if (groups_per_page == 0)
+ groups_per_page = 1;
+ memset(bh, 0, i);
+ } else
+ bh = &bhs;
-+
++
+ first_group = page->index * blocks_per_page / 2;
-+
++
+ /* read all groups the page covers into the cache */
+ for (i = 0; i < groups_per_page; i++) {
+ struct ext3_group_desc * desc;
+ mb_debug("put buddy for group %u in page %lu/%x\n",
+ group, page->index, i * blocksize);
+ memset(data, 0xff, blocksize);
-+ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0;
-+ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0,
++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0;
++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0,
+ sizeof(unsigned short)*(sb->s_blocksize_bits+2));
+ ext3_mb_generate_buddy(sb, data, bitmap,
-+ EXT3_SB(sb)->s_group_info[group]);
++ EXT3_GROUP_INFO(sb, group));
+ } else {
+ /* this is block of bitmap */
+ mb_debug("put bitmap for group %u in page %lu/%x\n",
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+
+ e3b->bd_blkbits = sb->s_blocksize_bits;
-+ e3b->bd_info = sbi->s_group_info[group];
++ e3b->bd_info = EXT3_GROUP_INFO(sb, group);
+ e3b->bd_sb = sb;
+ e3b->bd_group = group;
+ e3b->bd_buddy_page = NULL;
+ext3_lock_group(struct super_block *sb, int group)
+{
+ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT,
-+ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++ &EXT3_GROUP_INFO(sb, group)->bb_state);
+}
+
+static inline void
+ext3_unlock_group(struct super_block *sb, int group)
+{
+ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT,
-+ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++ &EXT3_GROUP_INFO(sb, group)->bb_state);
+}
+
+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
+
+ ext3_lock_group(ac->ac_sb, group);
+ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex);
-+
++
+ if (max > 0) {
+ ac->ac_b_ex = ex;
+ ext3_mb_use_best_found(ac, e3b);
+ struct ext3_buddy *e3b)
+{
+ int group = ac->ac_g_ex.fe_group, max, err;
++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
++ struct ext3_super_block *es = sbi->s_es;
+ struct ext3_free_extent ex;
+
+ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
+
+ ext3_lock_group(ac->ac_sb, group);
+ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start,
-+ ac->ac_g_ex.fe_len, &ex);
-+
-+ if (max >= ac->ac_g_ex.fe_len) {
++ ac->ac_g_ex.fe_len, &ex);
++
++ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
++ unsigned long start;
++ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) +
++ ex.fe_start + le32_to_cpu(es->s_first_data_block));
++ if (start % sbi->s_stripe == 0) {
++ ac->ac_found++;
++ ac->ac_b_ex = ex;
++ ext3_mb_use_best_found(ac, e3b);
++ }
++ } else if (max >= ac->ac_g_ex.fe_len) {
+ J_ASSERT(ex.fe_len > 0);
+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group);
+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start);
+ }
+}
+
++/*
++ * This is a special case for storage like raid5:
++ * we try to find stripe-aligned chunks for stripe-sized requests.
++ */
++static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ struct super_block *sb = ac->ac_sb;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ void *bitmap = EXT3_MB_BITMAP(e3b);
++ struct ext3_free_extent ex;
++ unsigned long i, max;
++
++ J_ASSERT(sbi->s_stripe != 0);
++
++ /* find first stripe-aligned block */
++ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb)
++ + le32_to_cpu(sbi->s_es->s_first_data_block);
++ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe;
++ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block))
++ % EXT3_BLOCKS_PER_GROUP(sb);
++
++ while (i < sb->s_blocksize * 8) {
++ if (!mb_test_bit(i, bitmap)) {
++ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex);
++ if (max >= sbi->s_stripe) {
++ ac->ac_found++;
++ ac->ac_b_ex = ex;
++ ext3_mb_use_best_found(ac, e3b);
++ break;
++ }
++ }
++ i += sbi->s_stripe;
++ }
++}
++
+static int ext3_mb_good_group(struct ext3_allocation_context *ac,
+ int group, int cr)
+{
-+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
-+ struct ext3_group_info *grp = sbi->s_group_info[group];
++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group);
+ unsigned free, fragments, i, bits;
+
+ J_ASSERT(cr >= 0 && cr < 4);
+ ac.ac_2order = 0;
+ ac.ac_criteria = 0;
+
++ if (*len == 1 && sbi->s_stripe) {
++		/* looks like metadata; use a dirty hack for raid5:
++		 * place all metadata in the first groups in the hope of
++		 * hitting cached sectors and avoiding read-modify cycles */
++ ac.ac_g_ex.fe_group = group = 0;
++ }
++
+ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */
+ i = ffs(*len);
+ if (i >= ext3_mb_order2_reqs) {
+ if (group == EXT3_SB(sb)->s_groups_count)
+ group = 0;
+
-+ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) {
++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) {
+ /* we need full data about the group
+ * to make a good selection */
+ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
+ ac.ac_groups_scanned++;
+ if (cr == 0)
+ ext3_mb_simple_scan_group(&ac, &e3b);
++ else if (cr == 1 && *len == sbi->s_stripe)
++ ext3_mb_scan_aligned(&ac, &e3b);
+ else
+ ext3_mb_complex_scan_group(&ac, &e3b);
+
+ }
+
+ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND &&
-+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
+ /*
+ * We've been searching too long. Let's try to allocate
+ * the best chunk we've found so far
+ sbi->s_blocks_reserved, ac.ac_found);
+ printk("EXT3-fs: groups: ");
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
-+ printk("%d: %d ", i,
-+ sbi->s_group_info[i]->bb_free);
++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free);
+ printk("\n");
+#endif
+ goto out;
+ *errp = -EIO;
+ goto out_err;
+ }
-+
++
+ err = ext3_journal_get_write_access(handle, gdp_bh);
+ if (err)
+ goto out_err;
+ * path only, here is single block always */
+ ext3_mb_release_blocks(sb, 1);
+ }
-+
++
+ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) {
+ atomic_inc(&sbi->s_bal_reqs);
+ atomic_add(*len, &sbi->s_bal_allocated);
+ s->max = sbi->s_mb_history_max;
+ s->start = sbi->s_mb_history_cur % s->max;
+ spin_unlock(&sbi->s_mb_history_lock);
-+
++
+ rc = seq_open(file, &ext3_mb_seq_history_ops);
+ if (rc == 0) {
+ struct seq_file *m = (struct seq_file *)file->private_data;
+
+static struct file_operations ext3_mb_seq_history_fops = {
+ .owner = THIS_MODULE,
-+ .open = ext3_mb_seq_history_open,
-+ .read = seq_read,
-+ .llseek = seq_lseek,
-+ .release = ext3_mb_seq_history_release,
++ .open = ext3_mb_seq_history_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = ext3_mb_seq_history_release,
+};
+
+static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
+ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
+ sizeof(struct ext3_group_info);
+ ext3_lock_group(sb, group);
-+ memcpy(&sg, sbi->s_group_info[group], i);
++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i);
+ ext3_unlock_group(sb, group);
+
+ if (EXT3_MB_GRP_NEED_INIT(&sg.info))
+
+static struct file_operations ext3_mb_seq_groups_fops = {
+ .owner = THIS_MODULE,
-+ .open = ext3_mb_seq_groups_open,
-+ .read = seq_read,
-+ .llseek = seq_lseek,
-+ .release = seq_release,
++ .open = ext3_mb_seq_groups_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = seq_release,
+};
+
+static void ext3_mb_history_release(struct super_block *sb)
+int ext3_mb_init_backend(struct super_block *sb)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int i, len;
-+
-+ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count;
-+ sbi->s_group_info = kmalloc(len, GFP_KERNEL);
++ int i, j, len, metalen;
++ int num_meta_group_infos =
++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >>
++ EXT3_DESC_PER_BLOCK_BITS(sb);
++ struct ext3_group_info **meta_group_info;
++
++ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
++ * So a two level scheme suffices for now. */
++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
++ num_meta_group_infos, GFP_KERNEL);
+ if (sbi->s_group_info == NULL) {
-+ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n");
++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n");
+ return -ENOMEM;
+ }
-+ memset(sbi->s_group_info, 0, len);
-+
+ sbi->s_buddy_cache = new_inode(sb);
+ if (sbi->s_buddy_cache == NULL) {
+ printk(KERN_ERR "EXT3-fs: can't get new inode\n");
-+ kfree(sbi->s_group_info);
-+ return -ENOMEM;
++ goto err_freesgi;
++ }
++
++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb);
++ for (i = 0; i < num_meta_group_infos; i++) {
++ if ((i + 1) == num_meta_group_infos)
++ metalen = sizeof(*meta_group_info) *
++ (sbi->s_groups_count -
++ (i << EXT3_DESC_PER_BLOCK_BITS(sb)));
++ meta_group_info = kmalloc(metalen, GFP_KERNEL);
++ if (meta_group_info == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a "
++ "buddy group\n");
++ goto err_freemeta;
++ }
++ sbi->s_group_info[i] = meta_group_info;
+ }
+
+ /*
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ struct ext3_group_desc * desc;
+
-+ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL);
-+ if (sbi->s_group_info[i] == NULL) {
++ meta_group_info =
++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)];
++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1);
++
++ meta_group_info[j] = kmalloc(len, GFP_KERNEL);
++ if (meta_group_info[j] == NULL) {
+ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n");
-+ goto err_out;
++ i--;
++ goto err_freebuddy;
+ }
+ desc = ext3_get_group_desc(sb, i, NULL);
+ if (desc == NULL) {
+ printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i);
-+ goto err_out;
++ goto err_freebuddy;
+ }
-+ memset(sbi->s_group_info[i], 0, len);
++ memset(meta_group_info[j], 0, len);
+ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT,
-+ &sbi->s_group_info[i]->bb_state);
-+ sbi->s_group_info[i]->bb_free =
++ &meta_group_info[j]->bb_state);
++ meta_group_info[j]->bb_free =
+ le16_to_cpu(desc->bg_free_blocks_count);
+ }
+
+ return 0;
+
-+err_out:
++err_freebuddy:
++ while (i >= 0) {
++ kfree(EXT3_GROUP_INFO(sb, i));
++ i--;
++ }
++ i = num_meta_group_infos;
++err_freemeta:
+ while (--i >= 0)
+ kfree(sbi->s_group_info[i]);
+ iput(sbi->s_buddy_cache);
-+
++err_freesgi:
++ kfree(sbi->s_group_info);
+ return -ENOMEM;
+}
+
+ max = max >> 1;
+ i++;
+ } while (i <= sb->s_blocksize_bits + 1);
-+
++
+
+ /* init file for buddy data */
+ if ((i = ext3_mb_init_backend(sb))) {
+int ext3_mb_release(struct super_block *sb)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int i;
-+
++ int i, num_meta_group_infos;
++
+ if (!test_opt(sb, MBALLOC))
+ return 0;
+
+ ext3_mb_free_committed_blocks(sb);
+
+ if (sbi->s_group_info) {
-+ for (i = 0; i < sbi->s_groups_count; i++) {
-+ if (sbi->s_group_info[i] == NULL)
-+ continue;
++ for (i = 0; i < sbi->s_groups_count; i++)
++ kfree(EXT3_GROUP_INFO(sb, i));
++ num_meta_group_infos = (sbi->s_groups_count +
++ EXT3_DESC_PER_BLOCK(sb) - 1) >>
++ EXT3_DESC_PER_BLOCK_BITS(sb);
++ for (i = 0; i < num_meta_group_infos; i++)
+ kfree(sbi->s_group_info[i]);
-+ }
+ kfree(sbi->s_group_info);
+ }
+ if (sbi->s_mb_offsets)
+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
+ spin_unlock(sb_bgl_lock(sbi, block_group));
+ percpu_counter_mod(&sbi->s_freeblocks_counter, count);
-+
++
+ ext3_mb_release_desc(&e3b);
+
+ *freed = count;
+ return;
+}
+
-+#define EXT3_ROOT "ext3"
-+#define EXT3_MB_STATS_NAME "mb_stats"
++#define EXT3_ROOT "ext3"
++#define EXT3_MB_STATS_NAME "mb_stats"
+#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan"
+#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan"
-+#define EXT3_MB_ORDER2_REQ "mb_order2_req"
++#define EXT3_MB_ORDER2_REQ "mb_order2_req"
+
+static int ext3_mb_stats_read(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
-Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h
+Index: linux-stage/include/linux/ext3_fs.h
===================================================================
---- linux-2.6.9-full.orig/include/linux/ext3_fs_sb.h 2006-05-18 23:57:04.000000000 +0400
-+++ linux-2.6.9-full/include/linux/ext3_fs_sb.h 2006-05-22 21:45:08.000000000 +0400
+--- linux-stage.orig/include/linux/ext3_fs.h 2006-05-25 10:36:04.000000000 -0600
++++ linux-stage/include/linux/ext3_fs.h 2006-05-25 10:36:04.000000000 -0600
+@@ -57,6 +57,14 @@ struct statfs;
+ #define ext3_debug(f, a...) do {} while (0)
+ #endif
+
++#define EXT3_MULTIBLOCK_ALLOCATOR 1
++
++#define EXT3_MB_HINT_MERGE 1
++#define EXT3_MB_HINT_RESERVED 2
++#define EXT3_MB_HINT_METADATA 4
++#define EXT3_MB_HINT_FIRST 8
++#define EXT3_MB_HINT_BEST 16
++
+ /*
+ * Special inodes numbers
+ */
+@@ -365,6 +373,7 @@ struct ext3_inode {
+ #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */
+ #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */
+ #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */
++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef clear_opt
+@@ -726,7 +735,7 @@ extern int ext3_bg_has_super(struct supe
+ extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
+ extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
+ extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
+- unsigned long);
++ unsigned long, int);
+ extern void ext3_free_blocks_sb (handle_t *, struct super_block *,
+ unsigned long, unsigned long, int *);
+ extern unsigned long ext3_count_free_blocks (struct super_block *);
+@@ -857,6 +866,17 @@ extern void ext3_extents_initialize_bloc
+ extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg);
+
++/* mballoc.c */
++extern long ext3_mb_stats;
++extern long ext3_mb_max_to_scan;
++extern int ext3_mb_init(struct super_block *, int);
++extern int ext3_mb_release(struct super_block *);
++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *);
++extern int ext3_mb_reserve_blocks(struct super_block *, int);
++extern void ext3_mb_release_blocks(struct super_block *, int);
++int __init init_ext3_proc(void);
++void exit_ext3_proc(void);
++
+ #endif /* __KERNEL__ */
+
+ /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */
+Index: linux-stage/include/linux/ext3_fs_sb.h
+===================================================================
+--- linux-stage.orig/include/linux/ext3_fs_sb.h 2006-05-25 10:36:04.000000000 -0600
++++ linux-stage/include/linux/ext3_fs_sb.h 2006-05-25 10:59:14.000000000 -0600
@@ -23,9 +23,15 @@
#define EXT_INCLUDE
#include <linux/blockgroup_lock.h>
/*
* third extended-fs super-block data in memory
-@@ -81,6 +87,39 @@ struct ext3_sb_info {
+@@ -81,6 +87,43 @@ struct ext3_sb_info {
char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
int s_jquota_fmt; /* Format of quota to use */
#endif
+
+ /* for buddy allocator */
-+ struct ext3_group_info **s_group_info;
++ struct ext3_group_info ***s_group_info;
+ struct inode *s_buddy_cache;
+ long s_blocks_reserved;
+ spinlock_t s_reserve_lock;
+ unsigned long s_mb_buddies_generated;
+ unsigned long long s_mb_generation_time;
};
-
- #endif /* _LINUX_EXT3_FS_SB */
-Index: linux-2.6.9-full/include/linux/ext3_fs.h
-===================================================================
---- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2006-05-18 23:57:04.000000000 +0400
-+++ linux-2.6.9-full/include/linux/ext3_fs.h 2006-05-22 21:44:37.000000000 +0400
-@@ -57,6 +57,14 @@ struct statfs;
- #define ext3_debug(f, a...) do {} while (0)
- #endif
-
-+#define EXT3_MULTIBLOCK_ALLOCATOR 1
-+
-+#define EXT3_MB_HINT_MERGE 1
-+#define EXT3_MB_HINT_RESERVED 2
-+#define EXT3_MB_HINT_METADATA 4
-+#define EXT3_MB_HINT_FIRST 8
-+#define EXT3_MB_HINT_BEST 16
+
- /*
- * Special inodes numbers
- */
-@@ -365,6 +373,7 @@ struct ext3_inode {
- #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */
- #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */
- #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */
-+#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */
++#define EXT3_GROUP_INFO(sb, group) \
++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \
++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)]
- /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
- #ifndef clear_opt
-@@ -726,7 +735,7 @@ extern int ext3_bg_has_super(struct supe
- extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
- extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
- extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
-- unsigned long);
-+ unsigned long, int);
- extern void ext3_free_blocks_sb (handle_t *, struct super_block *,
- unsigned long, unsigned long, int *);
- extern unsigned long ext3_count_free_blocks (struct super_block *);
-@@ -857,6 +866,17 @@ extern void ext3_extents_initialize_bloc
- extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
- unsigned int cmd, unsigned long arg);
-
-+/* mballoc.c */
-+extern long ext3_mb_stats;
-+extern long ext3_mb_max_to_scan;
-+extern int ext3_mb_init(struct super_block *, int);
-+extern int ext3_mb_release(struct super_block *);
-+extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *);
-+extern int ext3_mb_reserve_blocks(struct super_block *, int);
-+extern void ext3_mb_release_blocks(struct super_block *, int);
-+int __init init_ext3_proc(void);
-+void exit_ext3_proc(void);
-+
- #endif /* __KERNEL__ */
-
- /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */
-Index: linux-2.6.9-full/fs/ext3/super.c
+ #endif /* _LINUX_EXT3_FS_SB */
+Index: linux-stage/fs/ext3/super.c
===================================================================
---- linux-2.6.9-full.orig/fs/ext3/super.c 2006-05-18 23:57:04.000000000 +0400
-+++ linux-2.6.9-full/fs/ext3/super.c 2006-05-22 21:52:54.000000000 +0400
-@@ -394,6 +394,7 @@ void ext3_put_super (struct super_block
+--- linux-stage.orig/fs/ext3/super.c 2006-05-25 10:36:04.000000000 -0600
++++ linux-stage/fs/ext3/super.c 2006-05-25 10:36:04.000000000 -0600
+@@ -394,6 +394,7 @@ void ext3_put_super (struct super_block
struct ext3_super_block *es = sbi->s_es;
int i;
ext3_ext_release(sb);
ext3_xattr_put_super(sb);
journal_destroy(sbi->s_journal);
-@@ -596,7 +597,7 @@ enum {
- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
+@@ -597,6 +598,7 @@ enum {
Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-- Opt_extents, Opt_extdebug,
-+ Opt_extents, Opt_extdebug, Opt_mballoc, Opt_stripe
+ Opt_extents, Opt_noextents, Opt_extdebug,
++ Opt_mballoc, Opt_nomballoc, Opt_stripe,
};
static match_table_t tokens = {
-@@ -648,6 +649,8 @@ static match_table_t tokens = {
- {Opt_iopen_nopriv, "iopen_nopriv"},
+@@ -649,6 +651,9 @@ static match_table_t tokens = {
{Opt_extents, "extents"},
+ {Opt_noextents, "noextents"},
{Opt_extdebug, "extdebug"},
+ {Opt_mballoc, "mballoc"},
++ {Opt_nomballoc, "nomballoc"},
+ {Opt_stripe, "stripe=%u"},
{Opt_barrier, "barrier=%u"},
{Opt_err, NULL},
{Opt_resize, "resize"},
-@@ -958,6 +961,16 @@ clear_qf_name:
+@@ -962,6 +967,19 @@ static int parse_options (char * options
case Opt_extdebug:
set_opt (sbi->s_mount_opt, EXTDEBUG);
break;
+ case Opt_mballoc:
-+ set_opt (sbi->s_mount_opt, MBALLOC);
++ set_opt(sbi->s_mount_opt, MBALLOC);
++ break;
++ case Opt_nomballoc:
++ clear_opt(sbi->s_mount_opt, MBALLOC);
+ break;
+ case Opt_stripe:
+ if (match_int(&args[0], &option))
default:
printk (KERN_ERR
"EXT3-fs: Unrecognized mount option \"%s\" "
-@@ -1647,6 +1660,7 @@ static int ext3_fill_super (struct super
+@@ -1651,6 +1669,7 @@ static int ext3_fill_super (struct super
ext3_count_dirs(sb));
ext3_ext_init(sb);
return 0;
-@@ -2429,7 +2443,13 @@ static struct file_system_type ext3_fs_t
+@@ -2433,7 +2452,13 @@ static struct file_system_type ext3_fs_t
static int __init init_ext3_fs(void)
{
if (err)
return err;
err = init_inodecache();
-@@ -2451,6 +2471,7 @@ static void __exit exit_ext3_fs(void)
+@@ -2455,6 +2480,7 @@ static void __exit exit_ext3_fs(void)
unregister_filesystem(&ext3_fs_type);
destroy_inodecache();
exit_ext3_xattr();
}
int ext3_prep_san_write(struct inode *inode, long *blocks,
-Index: linux-2.6.9-full/fs/ext3/extents.c
+Index: linux-stage/fs/ext3/extents.c
===================================================================
---- linux-2.6.9-full.orig/fs/ext3/extents.c 2006-05-18 23:57:04.000000000 +0400
-+++ linux-2.6.9-full/fs/ext3/extents.c 2006-05-22 21:44:37.000000000 +0400
+--- linux-stage.orig/fs/ext3/extents.c 2006-05-25 10:36:04.000000000 -0600
++++ linux-stage/fs/ext3/extents.c 2006-05-25 10:36:04.000000000 -0600
@@ -777,7 +777,7 @@ cleanup:
for (i = 0; i < depth; i++) {
if (!ablocks[i])
} else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
printk("strange request: removal %lu-%lu from %u:%u\n",
from, to, ex->ee_block, ex->ee_len);
-Index: linux-2.6.9-full/fs/ext3/Makefile
+Index: linux-stage/fs/ext3/inode.c
===================================================================
---- linux-2.6.9-full.orig/fs/ext3/Makefile 2006-05-18 23:57:04.000000000 +0400
-+++ linux-2.6.9-full/fs/ext3/Makefile 2006-05-22 21:44:37.000000000 +0400
-@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o
+--- linux-stage.orig/fs/ext3/inode.c 2006-05-25 10:36:04.000000000 -0600
++++ linux-stage/fs/ext3/inode.c 2006-05-25 10:36:04.000000000 -0600
+@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h
+ ext3_journal_forget(handle, branch[i].bh);
+ }
+ for (i = 0; i < keys; i++)
+- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1);
+ return err;
+ }
- ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
- ioctl.o namei.o super.o symlink.o hash.o resize.o \
-- extents.o
-+ extents.o mballoc.o
+@@ -673,7 +673,7 @@ err_out:
+ if (err == -EAGAIN)
+ for (i = 0; i < num; i++)
+ ext3_free_blocks(handle, inode,
+- le32_to_cpu(where[i].key), 1);
++ le32_to_cpu(where[i].key), 1, 1);
+ return err;
+ }
- ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
- ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
-Index: linux-2.6.9-full/fs/ext3/xattr.c
+@@ -1831,7 +1831,7 @@ ext3_clear_blocks(handle_t *handle, stru
+ }
+ }
+
+- ext3_free_blocks(handle, inode, block_to_free, count);
++ ext3_free_blocks(handle, inode, block_to_free, count, 1);
+ }
+
+ /**
+@@ -2004,7 +2004,7 @@ static void ext3_free_branches(handle_t
+ ext3_journal_test_restart(handle, inode);
+ }
+
+- ext3_free_blocks(handle, inode, nr, 1);
++ ext3_free_blocks(handle, inode, nr, 1, 1);
+
+ if (parent_bh) {
+ /*
+Index: linux-stage/fs/ext3/balloc.c
===================================================================
---- linux-2.6.9-full.orig/fs/ext3/xattr.c 2006-05-18 23:57:04.000000000 +0400
-+++ linux-2.6.9-full/fs/ext3/xattr.c 2006-05-22 21:44:37.000000000 +0400
+--- linux-stage.orig/fs/ext3/balloc.c 2006-05-25 10:36:02.000000000 -0600
++++ linux-stage/fs/ext3/balloc.c 2006-05-25 10:36:04.000000000 -0600
+@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_
+ *
+ * Return buffer_head on success or NULL in case of failure.
+ */
+-static struct buffer_head *
++struct buffer_head *
+ read_block_bitmap(struct super_block *sb, unsigned int block_group)
+ {
+ struct ext3_group_desc * desc;
+@@ -451,24 +451,6 @@
+ return;
+ }
+
+-/* Free given blocks, update quota and i_blocks field */
+-void ext3_free_blocks(handle_t *handle, struct inode *inode,
+- unsigned long block, unsigned long count)
+-{
+- struct super_block * sb;
+- int dquot_freed_blocks;
+-
+- sb = inode->i_sb;
+- if (!sb) {
+- printk ("ext3_free_blocks: nonexistent device");
+- return;
+- }
+- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
+- if (dquot_freed_blocks)
+- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+- return;
+-}
+-
+ /*
+ * For ext3 allocations, we must not reuse any blocks which are
+ * allocated in the bitmap buffer's "last committed data" copy. This
+@@ -1131,7 +1113,7 @@
+ * bitmap, and then for any free bit if that fails.
+ * This function also updates quota and i_blocks field.
+ */
+-int ext3_new_block(handle_t *handle, struct inode *inode,
++int ext3_new_block_old(handle_t *handle, struct inode *inode,
+ unsigned long goal, int *errp)
+ {
+ struct buffer_head *bitmap_bh = NULL;
+Index: linux-stage/fs/ext3/xattr.c
+===================================================================
+--- linux-stage.orig/fs/ext3/xattr.c 2006-05-25 10:36:04.000000000 -0600
++++ linux-stage/fs/ext3/xattr.c 2006-05-25 10:36:04.000000000 -0600
@@ -1281,7 +1281,7 @@ ext3_xattr_set_handle2(handle_t *handle,
new_bh = sb_getblk(sb, block);
if (!new_bh) {
get_bh(bh);
ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl);
} else {
-Index: linux-2.6.9-full/fs/ext3/mballoc.c
+Index: linux-stage/fs/ext3/mballoc.c
===================================================================
---- linux-2.6.9-full.orig/fs/ext3/mballoc.c 2006-05-12 23:14:51.200000000 +0400
-+++ linux-2.6.9-full/fs/ext3/mballoc.c 2006-05-22 21:51:30.000000000 +0400
-@@ -0,0 +1,2671 @@
+--- linux-stage.orig/fs/ext3/mballoc.c 2006-05-23 17:33:37.579436680 -0600
++++ linux-stage/fs/ext3/mballoc.c 2006-05-25 10:59:14.000000000 -0600
+@@ -0,0 +1,2702 @@
+/*
+ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+
+ /* search goals */
+ struct ext3_free_extent ac_g_ex;
-+
++
+ /* the best found extent */
+ struct ext3_free_extent ac_b_ex;
-+
++
+ /* number of iterations done. we have to track to limit searching */
+ unsigned long ac_ex_scanned;
+ __u16 ac_groups_scanned;
+ if (mb_check_counter++ % 300 != 0)
+ return;
+ }
-+
++
+ while (order > 1) {
+ buddy = mb_find_buddy(e3b, order, &max);
+ J_ASSERT(buddy);
+ sb = inode->i_sb;
+ blocksize = 1 << inode->i_blkbits;
+ blocks_per_page = PAGE_CACHE_SIZE / blocksize;
-+
++
+ groups_per_page = blocks_per_page >> 1;
+ if (groups_per_page == 0)
+ groups_per_page = 1;
+ memset(bh, 0, i);
+ } else
+ bh = &bhs;
-+
++
+ first_group = page->index * blocks_per_page / 2;
-+
++
+ /* read all groups the page covers into the cache */
+ for (i = 0; i < groups_per_page; i++) {
+ struct ext3_group_desc * desc;
+ mb_debug("put buddy for group %u in page %lu/%x\n",
+ group, page->index, i * blocksize);
+ memset(data, 0xff, blocksize);
-+ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0;
-+ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0,
++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0;
++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0,
+ sizeof(unsigned short)*(sb->s_blocksize_bits+2));
+ ext3_mb_generate_buddy(sb, data, bitmap,
-+ EXT3_SB(sb)->s_group_info[group]);
++ EXT3_GROUP_INFO(sb, group));
+ } else {
+ /* this is block of bitmap */
+ mb_debug("put bitmap for group %u in page %lu/%x\n",
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+
+ e3b->bd_blkbits = sb->s_blocksize_bits;
-+ e3b->bd_info = sbi->s_group_info[group];
++ e3b->bd_info = EXT3_GROUP_INFO(sb, group);
+ e3b->bd_sb = sb;
+ e3b->bd_group = group;
+ e3b->bd_buddy_page = NULL;
+ext3_lock_group(struct super_block *sb, int group)
+{
+ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT,
-+ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++ &EXT3_GROUP_INFO(sb, group)->bb_state);
+}
+
+static inline void
+ext3_unlock_group(struct super_block *sb, int group)
+{
+ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT,
-+ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++ &EXT3_GROUP_INFO(sb, group)->bb_state);
+}
+
+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
+
+ ext3_lock_group(ac->ac_sb, group);
+ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex);
-+
++
+ if (max > 0) {
+ ac->ac_b_ex = ex;
+ ext3_mb_use_best_found(ac, e3b);
+
+ ext3_lock_group(ac->ac_sb, group);
+ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start,
-+ ac->ac_g_ex.fe_len, &ex);
++ ac->ac_g_ex.fe_len, &ex);
+
+ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
+ unsigned long start;
+ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) +
-+ ex.fe_start + le32_to_cpu(es->s_first_data_block));
++ ex.fe_start + le32_to_cpu(es->s_first_data_block));
+ if (start % sbi->s_stripe == 0) {
+ ac->ac_found++;
+ ac->ac_b_ex = ex;
+ * we try to find stripe-aligned chunks for stripe-size requests
+ */
+static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac,
-+ struct ext3_buddy *e3b)
++ struct ext3_buddy *e3b)
+{
+ struct super_block *sb = ac->ac_sb;
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+static int ext3_mb_good_group(struct ext3_allocation_context *ac,
+ int group, int cr)
+{
-+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
-+ struct ext3_group_info *grp = sbi->s_group_info[group];
++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group);
+ unsigned free, fragments, i, bits;
+
+ J_ASSERT(cr >= 0 && cr < 4);
+
+ if (*len == 1 && sbi->s_stripe) {
+ /* looks like a metadata, let's use a dirty hack for raid5
-+ * move all metadata in first groups in hope to hit cached
++ * move all metadata in first groups in hope to hit cached
+ * sectors and thus avoid read-modify cycles in raid5 */
+ ac.ac_g_ex.fe_group = group = 0;
+ }
+ if (group == EXT3_SB(sb)->s_groups_count)
+ group = 0;
+
-+ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) {
++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) {
+ /* we need full data about the group
+ * to make a good selection */
+ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
+ }
+
+ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND &&
-+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
+ /*
+ * We've been searching too long. Let's try to allocate
+ * the best chunk we've found so far
+ sbi->s_blocks_reserved, ac.ac_found);
+ printk("EXT3-fs: groups: ");
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
-+ printk("%d: %d ", i,
-+ sbi->s_group_info[i]->bb_free);
++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free);
+ printk("\n");
+#endif
+ goto out;
+ *errp = -EIO;
+ goto out_err;
+ }
-+
++
+ err = ext3_journal_get_write_access(handle, gdp_bh);
+ if (err)
+ goto out_err;
+ * path only, here is single block always */
+ ext3_mb_release_blocks(sb, 1);
+ }
-+
++
+ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) {
+ atomic_inc(&sbi->s_bal_reqs);
+ atomic_add(*len, &sbi->s_bal_allocated);
+ s->max = sbi->s_mb_history_max;
+ s->start = sbi->s_mb_history_cur % s->max;
+ spin_unlock(&sbi->s_mb_history_lock);
-+
++
+ rc = seq_open(file, &ext3_mb_seq_history_ops);
+ if (rc == 0) {
+ struct seq_file *m = (struct seq_file *)file->private_data;
+
+static struct file_operations ext3_mb_seq_history_fops = {
+ .owner = THIS_MODULE,
-+ .open = ext3_mb_seq_history_open,
-+ .read = seq_read,
-+ .llseek = seq_lseek,
-+ .release = ext3_mb_seq_history_release,
++ .open = ext3_mb_seq_history_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = ext3_mb_seq_history_release,
+};
+
+static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
+ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
+ sizeof(struct ext3_group_info);
+ ext3_lock_group(sb, group);
-+ memcpy(&sg, sbi->s_group_info[group], i);
++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i);
+ ext3_unlock_group(sb, group);
+
+ if (EXT3_MB_GRP_NEED_INIT(&sg.info))
+
+static struct file_operations ext3_mb_seq_groups_fops = {
+ .owner = THIS_MODULE,
-+ .open = ext3_mb_seq_groups_open,
-+ .read = seq_read,
-+ .llseek = seq_lseek,
-+ .release = seq_release,
++ .open = ext3_mb_seq_groups_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = seq_release,
+};
+
+static void ext3_mb_history_release(struct super_block *sb)
+int ext3_mb_init_backend(struct super_block *sb)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int i, len;
-+
-+ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count;
-+ sbi->s_group_info = kmalloc(len, GFP_KERNEL);
++ int i, j, len, metalen;
++ int num_meta_group_infos =
++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >>
++ EXT3_DESC_PER_BLOCK_BITS(sb);
++ struct ext3_group_info **meta_group_info;
++
++ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
++ * So a two level scheme suffices for now. */
++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
++ num_meta_group_infos, GFP_KERNEL);
+ if (sbi->s_group_info == NULL) {
-+ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n");
++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n");
+ return -ENOMEM;
+ }
-+ memset(sbi->s_group_info, 0, len);
-+
+ sbi->s_buddy_cache = new_inode(sb);
+ if (sbi->s_buddy_cache == NULL) {
+ printk(KERN_ERR "EXT3-fs: can't get new inode\n");
-+ kfree(sbi->s_group_info);
-+ return -ENOMEM;
++ goto err_freesgi;
++ }
++
++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb);
++ for (i = 0; i < num_meta_group_infos; i++) {
++ if ((i + 1) == num_meta_group_infos)
++ metalen = sizeof(*meta_group_info) *
++ (sbi->s_groups_count -
++ (i << EXT3_DESC_PER_BLOCK_BITS(sb)));
++ meta_group_info = kmalloc(metalen, GFP_KERNEL);
++ if (meta_group_info == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a "
++ "buddy group\n");
++ goto err_freemeta;
++ }
++ sbi->s_group_info[i] = meta_group_info;
+ }
+
+ /*
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ struct ext3_group_desc * desc;
+
-+ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL);
-+ if (sbi->s_group_info[i] == NULL) {
++ meta_group_info =
++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)];
++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1);
++
++ meta_group_info[j] = kmalloc(len, GFP_KERNEL);
++ if (meta_group_info[j] == NULL) {
+ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n");
-+ goto err_out;
++ i--;
++ goto err_freebuddy;
+ }
+ desc = ext3_get_group_desc(sb, i, NULL);
+ if (desc == NULL) {
+ printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i);
-+ goto err_out;
++ goto err_freebuddy;
+ }
-+ memset(sbi->s_group_info[i], 0, len);
++ memset(meta_group_info[j], 0, len);
+ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT,
-+ &sbi->s_group_info[i]->bb_state);
-+ sbi->s_group_info[i]->bb_free =
++ &meta_group_info[j]->bb_state);
++ meta_group_info[j]->bb_free =
+ le16_to_cpu(desc->bg_free_blocks_count);
+ }
+
+ return 0;
+
-+err_out:
++err_freebuddy:
++ while (i >= 0) {
++ kfree(EXT3_GROUP_INFO(sb, i));
++ i--;
++ }
++ i = num_meta_group_infos;
++err_freemeta:
+ while (--i >= 0)
+ kfree(sbi->s_group_info[i]);
+ iput(sbi->s_buddy_cache);
-+
++err_freesgi:
++ kfree(sbi->s_group_info);
+ return -ENOMEM;
+}
+
+ max = max >> 1;
+ i++;
+ } while (i <= sb->s_blocksize_bits + 1);
-+
++
+
+ /* init file for buddy data */
+ if ((i = ext3_mb_init_backend(sb))) {
+int ext3_mb_release(struct super_block *sb)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int i;
-+
++ int i, num_meta_group_infos;
++
+ if (!test_opt(sb, MBALLOC))
+ return 0;
+
+ ext3_mb_free_committed_blocks(sb);
+
+ if (sbi->s_group_info) {
-+ for (i = 0; i < sbi->s_groups_count; i++) {
-+ if (sbi->s_group_info[i] == NULL)
-+ continue;
++ for (i = 0; i < sbi->s_groups_count; i++)
++ kfree(EXT3_GROUP_INFO(sb, i));
++ num_meta_group_infos = (sbi->s_groups_count +
++ EXT3_DESC_PER_BLOCK(sb) - 1) >>
++ EXT3_DESC_PER_BLOCK_BITS(sb);
++ for (i = 0; i < num_meta_group_infos; i++)
+ kfree(sbi->s_group_info[i]);
-+ }
+ kfree(sbi->s_group_info);
+ }
+ if (sbi->s_mb_offsets)
+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
+ spin_unlock(sb_bgl_lock(sbi, block_group));
+ percpu_counter_mod(&sbi->s_freeblocks_counter, count);
-+
++
+ ext3_mb_release_desc(&e3b);
+
+ *freed = count;
+ return;
+}
+
-+#define EXT3_ROOT "ext3"
-+#define EXT3_MB_STATS_NAME "mb_stats"
++#define EXT3_ROOT "ext3"
++#define EXT3_MB_STATS_NAME "mb_stats"
+#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan"
+#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan"
-+#define EXT3_MB_ORDER2_REQ "mb_order2_req"
++#define EXT3_MB_ORDER2_REQ "mb_order2_req"
+
+static int ext3_mb_stats_read(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3);
+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
+}
-Index: linux-2.6.9-full/fs/ext3/balloc.c
-===================================================================
---- linux-2.6.9-full.orig/fs/ext3/balloc.c 2006-03-10 18:20:03.000000000 +0300
-+++ linux-2.6.9-full/fs/ext3/balloc.c 2006-05-22 21:44:37.000000000 +0400
-@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_
- *
- * Return buffer_head on success or NULL in case of failure.
- */
--static struct buffer_head *
-+struct buffer_head *
- read_block_bitmap(struct super_block *sb, unsigned int block_group)
- {
- struct ext3_group_desc * desc;
-@@ -451,24 +451,6 @@ error_return:
- return;
- }
-
--/* Free given blocks, update quota and i_blocks field */
--void ext3_free_blocks(handle_t *handle, struct inode *inode,
-- unsigned long block, unsigned long count)
--{
-- struct super_block * sb;
-- int dquot_freed_blocks;
--
-- sb = inode->i_sb;
-- if (!sb) {
-- printk ("ext3_free_blocks: nonexistent device");
-- return;
-- }
-- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
-- if (dquot_freed_blocks)
-- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
-- return;
--}
--
- /*
- * For ext3 allocations, we must not reuse any blocks which are
- * allocated in the bitmap buffer's "last committed data" copy. This
-@@ -1131,7 +1113,7 @@ int ext3_should_retry_alloc(struct super
- * bitmap, and then for any free bit if that fails.
- * This function also updates quota and i_blocks field.
- */
--int ext3_new_block(handle_t *handle, struct inode *inode,
-+int ext3_new_block_old(handle_t *handle, struct inode *inode,
- unsigned long goal, int *errp)
- {
- struct buffer_head *bitmap_bh = NULL;
-Index: linux-2.6.9-full/fs/ext3/inode.c
+Index: linux-stage/fs/ext3/Makefile
===================================================================
---- linux-2.6.9-full.orig/fs/ext3/inode.c 2006-05-18 23:57:04.000000000 +0400
-+++ linux-2.6.9-full/fs/ext3/inode.c 2006-05-22 21:44:37.000000000 +0400
-@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h
- ext3_journal_forget(handle, branch[i].bh);
- }
- for (i = 0; i < keys; i++)
-- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
-+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1);
- return err;
- }
-
-@@ -673,7 +673,7 @@ err_out:
- if (err == -EAGAIN)
- for (i = 0; i < num; i++)
- ext3_free_blocks(handle, inode,
-- le32_to_cpu(where[i].key), 1);
-+ le32_to_cpu(where[i].key), 1, 1);
- return err;
- }
-
-@@ -1831,7 +1831,7 @@ ext3_clear_blocks(handle_t *handle, stru
- }
- }
+--- linux-stage.orig/fs/ext3/Makefile 2006-05-25 10:36:04.000000000 -0600
++++ linux-stage/fs/ext3/Makefile 2006-05-25 10:36:04.000000000 -0600
+@@ -6,7 +6,7 @@
-- ext3_free_blocks(handle, inode, block_to_free, count);
-+ ext3_free_blocks(handle, inode, block_to_free, count, 1);
- }
-
- /**
-@@ -2004,7 +2004,7 @@ static void ext3_free_branches(handle_t
- ext3_journal_test_restart(handle, inode);
- }
-
-- ext3_free_blocks(handle, inode, nr, 1);
-+ ext3_free_blocks(handle, inode, nr, 1, 1);
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+ ioctl.o namei.o super.o symlink.o hash.o resize.o \
+- extents.o
++ extents.o mballoc.o
- if (parent_bh) {
- /*
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
--- /dev/null
+Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem
+From: Mingming Cao <cmm@us.ibm.com>
+
+
+If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e.
+CONFIG_LBD not defined in the kernel), the calculation of the disk sector
+will overflow. Add check at ext3_fill_super() and ext3_group_extend() to
+prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4
+bytes.
+
+Verified this patch on a 32 bit platform without CONFIG_LBD defined
+(sector_t is 32 bits long), mount refuse to mount a 10TB ext3.
+
+Signed-off-by: Mingming Cao <cmm@us.ibm.com>
+Acked-by: Andreas Dilger <adilger@clusterfs.com>
+Signed-off-by: Andrew Morton <akpm@osdl.org>
+---
+
+ fs/ext3/resize.c | 10 ++++++++++
+ fs/ext3/super.c | 10 ++++++++++
+ 2 files changed, 20 insertions(+)
+
+diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c
+--- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700
++++ devel-akpm/fs/ext3/super.c 2006-05-22 14:11:10.000000000 -0700
+@@ -1565,6 +1565,14 @@ static int ext3_fill_super (struct super
+ goto failed_mount;
+ }
+
++ if (le32_to_cpu(es->s_blocks_count) >
++ (unsigned long)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
++ printk(KERN_ERR "EXT3-fs: filesystem on %s: "
++ "too large to mount safely - %u blocks\n",
++ bdevname(sb->s_dev), le32_to_cpu(es->s_blocks_count));
++ goto failed_mount;
++ }
++
+ sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
+ le32_to_cpu(es->s_first_data_block) +
+ EXT3_BLOCKS_PER_GROUP(sb) - 1) /
+_
--- /dev/null
+Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem
+From: Mingming Cao <cmm@us.ibm.com>
+
+
+If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e.
+CONFIG_LBD not defined in the kernel), the calculation of the disk sector
+will overflow. Add check at ext3_fill_super() and ext3_group_extend() to
+prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4
+bytes.
+
+Verified this patch on a 32 bit platform without CONFIG_LBD defined
+(sector_t is 32 bits long), mount refuse to mount a 10TB ext3.
+
+Signed-off-by: Mingming Cao <cmm@us.ibm.com>
+Acked-by: Andreas Dilger <adilger@clusterfs.com>
+Signed-off-by: Andrew Morton <akpm@osdl.org>
+---
+
+ fs/ext3/resize.c | 10 ++++++++++
+ fs/ext3/super.c | 10 ++++++++++
+ 2 files changed, 20 insertions(+)
+
+diff -puN fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/resize.c
+--- devel/fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700
++++ devel-akpm/fs/ext3/resize.c 2006-05-22 14:10:56.000000000 -0700
+@@ -926,6 +926,16 @@ int ext3_group_extend(struct super_block
+ if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
+ return 0;
+
++ if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
++ printk(KERN_ERR "EXT3-fs: filesystem on %s: "
++ "too large to resize to %lu blocks safely\n",
++ sb->s_id, n_blocks_count);
++ if (sizeof(sector_t) < 8)
++ ext3_warning(sb, __FUNCTION__,
++ "CONFIG_LBD not enabled\n");
++ return -EINVAL;
++ }
++
+ if (n_blocks_count < o_blocks_count) {
+ ext3_warning(sb, __FUNCTION__,
+ "can't shrink FS - resize aborted");
+diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c
+--- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700
++++ devel-akpm/fs/ext3/super.c 2006-05-22 14:11:10.000000000 -0700
+@@ -1565,6 +1565,17 @@ static int ext3_fill_super (struct super
+ goto failed_mount;
+ }
+
++ if (le32_to_cpu(es->s_blocks_count) >
++ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
++ printk(KERN_ERR "EXT3-fs: filesystem on %s: "
++ "too large to mount safely - %u blocks\n", sb->s_id,
++ le32_to_cpu(es->s_blocks_count));
++ if (sizeof(sector_t) < 8)
++ printk(KERN_WARNING
++ "EXT3-fs: CONFIG_LBD not enabled\n");
++ goto failed_mount;
++ }
++
+ if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
+ goto cantfind_ext3;
+ sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
+_
--- /dev/null
+Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem
+From: Mingming Cao <cmm@us.ibm.com>
+
+
+If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e.
+CONFIG_LBD not defined in the kernel), the calculation of the disk sector
+will overflow. Add check at ext3_fill_super() and ext3_group_extend() to
+prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4
+bytes.
+
+Verified this patch on a 32 bit platform without CONFIG_LBD defined
+(sector_t is 32 bits long), mount refuse to mount a 10TB ext3.
+
+Signed-off-by: Mingming Cao <cmm@us.ibm.com>
+Acked-by: Andreas Dilger <adilger@clusterfs.com>
+Signed-off-by: Andrew Morton <akpm@osdl.org>
+---
+
+ fs/ext3/resize.c | 10 ++++++++++
+ fs/ext3/super.c | 10 ++++++++++
+ 2 files changed, 20 insertions(+)
+
+diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c
+--- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700
++++ devel-akpm/fs/ext3/super.c 2006-05-22 14:11:10.000000000 -0700
+@@ -1565,6 +1565,17 @@ static int ext3_fill_super (struct super
+ goto failed_mount;
+ }
+
++ if (le32_to_cpu(es->s_blocks_count) >
++ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
++ printk(KERN_ERR "EXT3-fs: filesystem on %s: "
++ "too large to mount safely - %u blocks\n", sb->s_id,
++ le32_to_cpu(es->s_blocks_count));
++ if (sizeof(sector_t) < 8)
++ printk(KERN_WARNING
++ "EXT3-fs: CONFIG_LBD not enabled\n");
++ goto failed_mount;
++ }
++
+ sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
+ le32_to_cpu(es->s_first_data_block) +
+ EXT3_BLOCKS_PER_GROUP(sb) - 1) /
+_
--- /dev/null
+Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem
+From: Mingming Cao <cmm@us.ibm.com>
+
+
+If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e.
+CONFIG_LBD not defined in the kernel), the calculation of the disk sector
+will overflow. Add check at ext3_fill_super() and ext3_group_extend() to
+prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4
+bytes.
+
+Verified this patch on a 32 bit platform without CONFIG_LBD defined
+(sector_t is 32 bits long), mount refuse to mount a 10TB ext3.
+
+Signed-off-by: Mingming Cao <cmm@us.ibm.com>
+Acked-by: Andreas Dilger <adilger@clusterfs.com>
+Signed-off-by: Andrew Morton <akpm@osdl.org>
+---
+
+ fs/ext3/resize.c | 10 ++++++++++
+ fs/ext3/super.c | 10 ++++++++++
+ 2 files changed, 20 insertions(+)
+
+diff -puN fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/resize.c
+--- devel/fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700
++++ devel-akpm/fs/ext3/resize.c 2006-05-22 14:10:56.000000000 -0700
+@@ -926,6 +926,16 @@ int ext3_group_extend(struct super_block
+ if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
+ return 0;
+
++ if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
++ printk(KERN_ERR "EXT3-fs: filesystem on %s: "
++ "too large to resize to %lu blocks safely\n",
++ sb->s_id, n_blocks_count);
++ if (sizeof(sector_t) < 8)
++ ext3_warning(sb, __FUNCTION__,
++ "CONFIG_LBD not enabled\n");
++ return -EINVAL;
++ }
++
+ if (n_blocks_count < o_blocks_count) {
+ ext3_warning(sb, __FUNCTION__,
+ "can't shrink FS - resize aborted");
+diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c
+--- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700
++++ devel-akpm/fs/ext3/super.c 2006-05-22 14:11:10.000000000 -0700
+@@ -1565,6 +1565,17 @@ static int ext3_fill_super (struct super
+ goto failed_mount;
+ }
+
++ if (le32_to_cpu(es->s_blocks_count) >
++ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
++ printk(KERN_ERR "EXT3-fs: filesystem on %s: "
++ "too large to mount safely - %u blocks\n", sb->s_id,
++ le32_to_cpu(es->s_blocks_count));
++ if (sizeof(sector_t) < 8)
++ printk(KERN_WARNING
++ "EXT3-fs: CONFIG_LBD not enabled\n");
++ goto failed_mount;
++ }
++
+ sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
+ le32_to_cpu(es->s_first_data_block) +
+ EXT3_BLOCKS_PER_GROUP(sb) - 1) /
+_
ext3-extents-asyncdel-2.4.24.patch
ext3-nlinks-2.4.20-hp_pnnl.patch
export-zap-page-range.patch
+ext3-sector_t-overflow-2.4.patch
ext3-nlinks-2.6.9.patch
ext3-ialloc-2.6.patch
ext3-lookup-dotdot-2.6.9.patch
+ext3-sector_t-overflow-2.6.9-rhel4.patch
ext3-htree-dot-2.6.5-suse.patch
ext3-ialloc-2.6.patch
ext3-lookup-dotdot-2.6.9.patch
+ext3-sector_t-overflow-2.6.5-suse.patch
ext3-htree-dot-2.6.patch
ext3-external-journal-2.6.12.patch
ext3-lookup-dotdot-2.6.9.patch
+ext3-sector_t-overflow-2.6.12.patch
nfsd_iallocsem.patch
linux-2.4.24-jbd-handle-EIO-rhel3.patch
ext3-lookup-dotdot-2.4.20.patch
+ext3-sector_t-overflow-2.4.patch
nfsd_iallocsem.patch
linux-2.4.24-jbd-handle-EIO.patch
ext3-ialloc-2.4.21-suse2.patch
+ext3-sector_t-overflow-2.4.patch
+++ /dev/null
-configurable-x86-stack-2.4.21-suse-171.patch
-configurable-x86_64-2.4.21.patch
-dev_read_only_2.4.20-rh.patch
-exports_2.4.20-rh-hp.patch
-lustre_version.patch
-vfs_intent-2.4.21-suse-171.patch
-invalidate_show.patch
-export-truncate.patch
-iod-stock-24-exports_hp.patch
-ext3-htree-2.4.21-chaos.patch
-linux-2.4.21-xattr-0.8.54-suse-171.patch
-ext3-orphan_lock-2.4.22-rh.patch
-ext3-noread-2.4.21-suse2.patch
-ext3-delete_thread-2.4.21-suse-171.patch
-extN-wantedi-2.4.21-suse2.patch
-ext3-san-2.4.20.patch
-ext3-map_inode_page-2.4.21-suse2.patch
-ext3-error-export.patch
-iopen-2.4.21-chaos.patch
-jbd-dont-account-blocks-twice.patch
-jbd-commit-tricks.patch
-ext3-no-write-super-chaos.patch
-add_page_private.patch
-nfs_export_kernel-2.4.21-suse2.patch
-ext3-raw-lookup.patch
-ext3-ea-in-inode-2.4.21-sles.patch
-listman-2.4.20.patch
-ext3-truncate-buffer-head.patch
-lookup-stack-symbols-2.4.21-suse-171.patch
-fsprivate-2.4-suse.patch
-nfsd_iallocsem.patch
fsprivate-2.4.patch
nfsd_iallocsem.patch
linux-2.4.24-jbd-handle-EIO.patch
+ext3-sector_t-overflow-2.4.patch
fsprivate-2.4.patch
nfsd_iallocsem.patch
linux-2.4.24-jbd-handle-EIO.patch
+ext3-sector_t-overflow-2.4.patch
kallsyms-2.4.29.patch
fsprivate-2.4.patch
nfsd_iallocsem.patch
+ext3-sector_t-overflow-2.4.patch
UNSUPPORTED KERNELS; BEING PHASED OUT; MAY BE MISSING CRITICAL BUG FIXES:
hp-pnnl-2.4.20 linux-2.4.20-hp4_pnnl1 same as vanilla but no uml ia64
vanilla-2.4.24 linux-2.4.24 patch with uml-2.4.24-6 um
-suse-2.4.21-jvn linux-2.4.21-241 sles8 2.4 kernel i386
return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1);
}
+/* timeout for initial callback (AST) reply */
+static inline unsigned int ldlm_get_rq_timeout(unsigned int ldlm_timeout, unsigned int obd_timeout)
+{
+ return max(min(ldlm_timeout, obd_timeout / 3), 1U);
+}
+
#ifdef __KERNEL__
/* w_l_spinlock protects both waiting_locks_list and expired_lock_thread */
static spinlock_t waiting_locks_spinlock;
l_unlock(&lock->l_resource->lr_namespace->ns_lock);
req->rq_send_state = LUSTRE_IMP_FULL;
- req->rq_timeout = ldlm_timeout; /* timeout for initial AST reply */
+ req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout); /* timeout for initial AST reply */
if (unlikely(instant_cancel)) {
rc = ptl_send_rpc(req, 1);
} else {
ptlrpc_req_set_repsize(req, 1, NULL);
req->rq_send_state = LUSTRE_IMP_FULL;
- req->rq_timeout = ldlm_timeout; /* timeout for initial AST reply */
+ req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout); /* timeout for initial AST reply */
/* We only send real blocking ASTs after the lock is granted */
l_lock(&lock->l_resource->lr_namespace->ns_lock);
ptlrpc_req_set_repsize(req, 2, size);
req->rq_send_state = LUSTRE_IMP_FULL;
- req->rq_timeout = ldlm_timeout; /* timeout for initial AST reply */
+ req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout); /* timeout for initial AST reply */
rc = ptlrpc_queue_wait(req);
if (rc == -ELDLM_NO_LOCK_DATA)
EXPORT_SYMBOL(ldlm_expired_completion_wait);
EXPORT_SYMBOL(ldlm_cli_convert);
EXPORT_SYMBOL(ldlm_cli_enqueue);
+EXPORT_SYMBOL(ldlm_cli_enqueue_fini);
+EXPORT_SYMBOL(ldlm_cli_enqueue_local);
EXPORT_SYMBOL(ldlm_cli_cancel);
EXPORT_SYMBOL(ldlm_cli_cancel_unused);
EXPORT_SYMBOL(ldlm_cli_join_lru);
return -ELDLM_NO_LOCK_DATA;
}
-static int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
- struct ldlm_res_id res_id,
- __u32 type,
- ldlm_policy_data_t *policy,
- ldlm_mode_t mode,
- int *flags,
- ldlm_blocking_callback blocking,
- ldlm_completion_callback completion,
- ldlm_glimpse_callback glimpse,
- void *data, __u32 lvb_len,
- void *lvb_swabber,
- struct lustre_handle *lockh)
+int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, struct ldlm_res_id res_id,
+ ldlm_type_t type, ldlm_policy_data_t *policy,
+ ldlm_mode_t mode, int *flags,
+ ldlm_blocking_callback blocking,
+ ldlm_completion_callback completion,
+ ldlm_glimpse_callback glimpse,
+ void *data, __u32 lvb_len, void *lvb_swabber,
+ struct lustre_handle *lockh)
{
struct ldlm_lock *lock;
int err;
ENTRY;
+ LASSERT(!(*flags & LDLM_FL_REPLAY));
if (ns->ns_client) {
CERROR("Trying to enqueue local lock in a shadow namespace\n");
LBUG();
}
}
-int ldlm_cli_enqueue(struct obd_export *exp,
- struct ptlrpc_request *req,
- struct ldlm_namespace *ns,
- struct ldlm_res_id res_id,
- __u32 type,
- ldlm_policy_data_t *policy,
- ldlm_mode_t mode,
- int *flags,
- ldlm_blocking_callback blocking,
- ldlm_completion_callback completion,
- ldlm_glimpse_callback glimpse,
- void *data,
- void *lvb,
- __u32 lvb_len,
- void *lvb_swabber,
- struct lustre_handle *lockh)
+int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
+ ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode,
+ int *flags, void *lvb, __u32 lvb_len,
+ void *lvb_swabber, struct lustre_handle *lockh,int rc)
{
+ struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+ int is_replay = *flags & LDLM_FL_REPLAY;
struct ldlm_lock *lock;
- struct ldlm_request *body;
struct ldlm_reply *reply;
- int size[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
- [DLM_LOCKREQ_OFF] = sizeof(*body),
- [DLM_REPLY_REC_OFF] = lvb_len };
- int is_replay = *flags & LDLM_FL_REPLAY;
- int req_passed_in = 1, cleanup_phase = 0, rc;
+ int cleanup_phase = 1;
ENTRY;
- if (exp == NULL) {
- LASSERT(!is_replay);
- rc = ldlm_cli_enqueue_local(ns, res_id, type, policy, mode,
- flags, blocking, completion,
- glimpse, data, lvb_len, lvb_swabber,
- lockh);
- RETURN(rc);
- }
-
- /* If we're replaying this lock, just check some invariants.
- * If we're creating a new lock, get everything all setup nice. */
- if (is_replay) {
- lock = ldlm_handle2lock(lockh);
- LDLM_DEBUG(lock, "client-side enqueue START");
- LASSERT(exp == lock->l_conn_export);
- } else {
- lock = ldlm_lock_create(ns, NULL, res_id, type, mode, blocking,
- completion, glimpse, data, lvb_len);
- if (lock == NULL)
- RETURN(-ENOMEM);
- /* for the local lock, add the reference */
- ldlm_lock_addref_internal(lock, mode);
- ldlm_lock2handle(lock, lockh);
- lock->l_lvb_swabber = lvb_swabber;
- if (policy != NULL) {
- /* INODEBITS_INTEROP: If the server does not support
- * inodebits, we will request a plain lock in the
- * descriptor (ldlm_lock2desc() below) but use an
- * inodebits lock internally with both bits set.
- */
- if (type == LDLM_IBITS && !(exp->exp_connect_flags &
- OBD_CONNECT_IBITS))
- lock->l_policy_data.l_inodebits.bits =
- MDS_INODELOCK_LOOKUP |
- MDS_INODELOCK_UPDATE;
- else
- lock->l_policy_data = *policy;
- }
-
- if (type == LDLM_EXTENT)
- lock->l_req_extent = policy->l_extent;
- LDLM_DEBUG(lock, "client-side enqueue START");
- }
-
- /* lock not sent to server yet */
- cleanup_phase = 2;
-
- if (req == NULL) {
- req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_DLM_VERSION,
- LDLM_ENQUEUE, 2, size, NULL);
- if (req == NULL)
- GOTO(cleanup, rc = -ENOMEM);
- req_passed_in = 0;
- } else {
- LASSERTF(lustre_msg_buflen(req->rq_reqmsg, DLM_LOCKREQ_OFF) ==
- sizeof(*body), "buflen[%d] = %d, not %d\n",
- DLM_LOCKREQ_OFF,
- lustre_msg_buflen(req->rq_reqmsg, DLM_LOCKREQ_OFF),
- sizeof(*body));
- }
-
- lock->l_conn_export = exp;
- lock->l_export = NULL;
- lock->l_blocking_ast = blocking;
-
- /* Dump lock data into the request buffer */
- body = lustre_msg_buf(req->rq_reqmsg, DLM_LOCKREQ_OFF, sizeof(*body));
- ldlm_lock2desc(lock, &body->lock_desc);
- body->lock_flags = *flags;
- body->lock_handle1 = *lockh;
-
- /* Continue as normal. */
- if (!req_passed_in) {
- size[DLM_LOCKREPLY_OFF] = sizeof(*reply);
- ptlrpc_req_set_repsize(req, 2 + (lvb_len > 0), size);
-
- }
- LDLM_DEBUG(lock, "sending request");
- rc = ptlrpc_queue_wait(req);
-
+ lock = ldlm_handle2lock(lockh);
+ /* ldlm_cli_enqueue is holding a reference on this lock. */
+ LASSERT(lock != NULL);
if (rc != ELDLM_OK) {
LASSERT(!is_replay);
LDLM_DEBUG(lock, "client-side enqueue END (%s)",
GOTO(cleanup, rc);
}
- /*
- * Liblustre client doesn't get extent locks, except for O_APPEND case
- * where [0, OBD_OBJECT_EOF] lock is taken, or truncate, where
- * [i_size, OBD_OBJECT_EOF] lock is taken.
- */
- LASSERT(ergo(LIBLUSTRE_CLIENT, type != LDLM_EXTENT ||
- policy->l_extent.end == OBD_OBJECT_EOF));
-
reply = lustre_swab_repbuf(req, DLM_LOCKREPLY_OFF, sizeof(*reply),
lustre_swab_ldlm_reply);
if (reply == NULL) {
}
/* lock enqueued on the server */
- cleanup_phase = 1;
+ cleanup_phase = 0;
l_lock(&ns->ns_lock);
lock->l_remote_handle = reply->lock_handle;
}
LDLM_DEBUG(lock, "client-side enqueue, new resource");
}
- if (policy != NULL)
+ if (with_policy)
if (!(type == LDLM_IBITS && !(exp->exp_connect_flags &
OBD_CONNECT_IBITS)))
lock->l_policy_data =
if (!rc)
rc = err;
if (rc)
- cleanup_phase = 2;
+ cleanup_phase = 1;
}
}
LDLM_DEBUG(lock, "client-side enqueue END");
EXIT;
cleanup:
- switch (cleanup_phase) {
- case 2:
- if (rc)
+ if (cleanup_phase == 1 && rc)
+ failed_lock_cleanup(ns, lock, lockh, mode);
+ /* Put lock 2 times, the second reference is held by ldlm_cli_enqueue */
+ LDLM_LOCK_PUT(lock);
+ LDLM_LOCK_PUT(lock);
+ return rc;
+}
+
+/* If a request has some specific initialisation it is passed in @reqp,
+ * otherwise it is created in ldlm_cli_enqueue.
+ *
+ * Supports sync and async requests, pass @async flag accordingly. If a
+ * request was created in ldlm_cli_enqueue and it is the async request,
+ * pass it to the caller in @reqp. */
+int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
+ struct ldlm_res_id res_id, ldlm_type_t type,
+ ldlm_policy_data_t *policy, ldlm_mode_t mode, int *flags,
+ ldlm_blocking_callback blocking,
+ ldlm_completion_callback completion,
+ ldlm_glimpse_callback glimpse,
+ void *data, void *lvb, __u32 lvb_len, void *lvb_swabber,
+ struct lustre_handle *lockh, int async)
+{
+ struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+ struct ldlm_lock *lock;
+ struct ldlm_request *body;
+ struct ldlm_reply *reply;
+ int size[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+ [DLM_LOCKREQ_OFF] = sizeof(*body),
+ [DLM_REPLY_REC_OFF] = lvb_len };
+ int is_replay = *flags & LDLM_FL_REPLAY;
+ int req_passed_in = 1, rc;
+ struct ptlrpc_request *req;
+ ENTRY;
+
+ LASSERT(exp != NULL);
+
+ /* If we're replaying this lock, just check some invariants.
+ * If we're creating a new lock, get everything all setup nice. */
+ if (is_replay) {
+ lock = ldlm_handle2lock(lockh);
+ LDLM_DEBUG(lock, "client-side enqueue START");
+ LASSERT(exp == lock->l_conn_export);
+ } else {
+ lock = ldlm_lock_create(ns, NULL, res_id, type, mode, blocking,
+ completion, glimpse, data, lvb_len);
+ if (lock == NULL)
+ RETURN(-ENOMEM);
+ /* for the local lock, add the reference */
+ ldlm_lock_addref_internal(lock, mode);
+ ldlm_lock2handle(lock, lockh);
+ lock->l_lvb_swabber = lvb_swabber;
+ if (policy != NULL) {
+ /* INODEBITS_INTEROP: If the server does not support
+ * inodebits, we will request a plain lock in the
+ * descriptor (ldlm_lock2desc() below) but use an
+ * inodebits lock internally with both bits set.
+ */
+ if (type == LDLM_IBITS && !(exp->exp_connect_flags &
+ OBD_CONNECT_IBITS))
+ lock->l_policy_data.l_inodebits.bits =
+ MDS_INODELOCK_LOOKUP |
+ MDS_INODELOCK_UPDATE;
+ else
+ lock->l_policy_data = *policy;
+ }
+
+ if (type == LDLM_EXTENT)
+ lock->l_req_extent = policy->l_extent;
+ LDLM_DEBUG(lock, "client-side enqueue START");
+ }
+
+ /* lock not sent to server yet */
+
+ if (reqp == NULL || *reqp == NULL) {
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_DLM_VERSION,
+ LDLM_ENQUEUE, 2, size, NULL);
+ if (req == NULL) {
failed_lock_cleanup(ns, lock, lockh, mode);
- case 1:
- if (!req_passed_in && req != NULL)
- ptlrpc_req_finished(req);
+ LDLM_LOCK_PUT(lock);
+ RETURN(-ENOMEM);
+ }
+ req_passed_in = 0;
+ if (reqp)
+ *reqp = req;
+ } else {
+ req = *reqp;
+ LASSERTF(lustre_msg_buflen(req->rq_reqmsg, DLM_LOCKREQ_OFF) ==
+ sizeof(*body), "buflen[%d] = %d, not %d\n",
+ DLM_LOCKREQ_OFF,
+ lustre_msg_buflen(req->rq_reqmsg, DLM_LOCKREQ_OFF),
+ sizeof(*body));
}
- LDLM_LOCK_PUT(lock);
- return rc;
+ lock->l_conn_export = exp;
+ lock->l_export = NULL;
+ lock->l_blocking_ast = blocking;
+
+ /* Dump lock data into the request buffer */
+ body = lustre_msg_buf(req->rq_reqmsg, DLM_LOCKREQ_OFF, sizeof(*body));
+ ldlm_lock2desc(lock, &body->lock_desc);
+ body->lock_flags = *flags;
+ body->lock_handle1 = *lockh;
+
+ /* Continue as normal. */
+ if (!req_passed_in) {
+ size[DLM_LOCKREPLY_OFF] = sizeof(*reply);
+ ptlrpc_req_set_repsize(req, 2 + (lvb_len > 0), size);
+ }
+
+ /*
+ * Liblustre client doesn't get extent locks, except for O_APPEND case
+ * where [0, OBD_OBJECT_EOF] lock is taken, or truncate, where
+ * [i_size, OBD_OBJECT_EOF] lock is taken.
+ */
+ LASSERT(ergo(LIBLUSTRE_CLIENT, type != LDLM_EXTENT ||
+ policy->l_extent.end == OBD_OBJECT_EOF));
+
+ if (async) {
+ LASSERT(reqp != NULL);
+ RETURN(0);
+ }
+
+ LDLM_DEBUG(lock, "sending request");
+ rc = ptlrpc_queue_wait(req);
+ rc = ldlm_cli_enqueue_fini(exp, req, type, policy ? 1 : 0,
+ mode, flags, lvb, lvb_len, lvb_swabber,
+ lockh, rc);
+
+ if (!req_passed_in && req != NULL) {
+ ptlrpc_req_finished(req);
+ if (reqp)
+ *reqp = NULL;
+ }
+
+ RETURN(rc);
}
static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode,
{
struct llu_inode_info *lli = llu_i2info(inode);
struct intnl_stat *st = llu_i2stat(inode);
- struct lov_stripe_md *lsm = lli->lli_smd;
- struct obdo oa = {0};
+ struct obd_info oinfo = { { { 0 } } };
+ struct obdo oa = { 0 };
int rc;
ENTRY;
CDEBUG(D_VFSTRACE, "VFS Op:inode=%llu/%lu(%p) to %llu\n",
(long long)st->st_ino, lli->lli_st_generation, inode,
(long long)st->st_size);
- if (!lsm) {
+ if (!lli->lli_smd) {
CDEBUG(D_INODE, "truncate on inode %llu with no objects\n",
(long long)st->st_ino);
EXIT;
return;
}
- oa.o_id = lsm->lsm_object_id;
+ oinfo.oi_md = lli->lli_smd;
+ oinfo.oi_policy.l_extent.start = st->st_size;
+ oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
+ oinfo.oi_oa = &oa;
+ oa.o_id = lli->lli_smd->lsm_object_id;
oa.o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
oa.o_flags = flags; /* We don't actually want to copy inode flags */
-
+
obdo_from_inode(&oa, inode,
OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLATIME |
OBD_MD_FLMTIME | OBD_MD_FLCTIME);
- obd_adjust_kms(llu_i2obdexp(inode), lsm, st->st_size, 1);
+ obd_adjust_kms(llu_i2obdexp(inode), lli->lli_smd, st->st_size, 1);
CDEBUG(D_INFO, "calling punch for "LPX64" (all bytes after %Lu)\n",
oa.o_id, (long long)st->st_size);
/* truncate == punch from new size to absolute end of file */
- rc = obd_punch(llu_i2obdexp(inode), &oa, lsm, st->st_size,
- OBD_OBJECT_EOF, NULL);
+ rc = obd_punch_rqset(llu_i2obdexp(inode), &oinfo, NULL);
if (rc)
CERROR("obd_truncate fails (%d) ino %llu\n",
rc, (long long)st->st_ino);
struct llu_inode_info *lli = llu_i2info(inode);
struct intnl_stat *st = llu_i2stat(inode);
struct llu_sb_info *sbi = llu_i2sbi(inode);
- ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
struct lustre_handle lockh = { 0 };
+ struct obd_enqueue_info einfo = { 0 };
+ struct obd_info oinfo = { { { 0 } } };
struct ost_lvb lvb;
- int rc, flags = LDLM_FL_HAS_INTENT;
+ int rc;
ENTRY;
CDEBUG(D_DLMTRACE, "Glimpsing inode %llu\n", (long long)st->st_ino);
- rc = obd_enqueue(sbi->ll_osc_exp, lli->lli_smd, LDLM_EXTENT, &policy,
- LCK_PR, &flags, llu_extent_lock_callback,
- ldlm_completion_ast, llu_glimpse_callback, inode,
- sizeof(struct ost_lvb), lustre_swab_ost_lvb, &lockh);
+ einfo.ei_type = LDLM_EXTENT;
+ einfo.ei_mode = LCK_PR;
+ einfo.ei_flags = LDLM_FL_HAS_INTENT;
+ einfo.ei_cb_bl = llu_extent_lock_callback;
+ einfo.ei_cb_cp = ldlm_completion_ast;
+ einfo.ei_cb_gl = llu_glimpse_callback;
+ einfo.ei_cbdata = inode;
+
+ oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
+ oinfo.oi_lockh = &lockh;
+ oinfo.oi_md = lli->lli_smd;
+
+ rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
if (rc) {
CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
RETURN(rc > 0 ? -EIO : rc);
CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
(long long)st->st_size, (long long)st->st_blocks);
- obd_cancel(sbi->ll_osc_exp, lli->lli_smd, LCK_PR, &lockh);
-
RETURN(rc);
}
{
struct llu_sb_info *sbi = llu_i2sbi(inode);
struct intnl_stat *st = llu_i2stat(inode);
+ struct obd_enqueue_info einfo = { 0 };
+ struct obd_info oinfo = { { { 0 } } };
struct ost_lvb lvb;
int rc;
ENTRY;
(long long)st->st_ino, policy->l_extent.start,
policy->l_extent.end);
- rc = obd_enqueue(sbi->ll_osc_exp, lsm, LDLM_EXTENT, policy, mode,
- &ast_flags, llu_extent_lock_callback,
- ldlm_completion_ast, llu_glimpse_callback, inode,
- sizeof(struct ost_lvb), lustre_swab_ost_lvb, lockh);
+ einfo.ei_type = LDLM_EXTENT;
+ einfo.ei_mode = mode;
+ einfo.ei_flags = ast_flags;
+ einfo.ei_cb_bl = llu_extent_lock_callback;
+ einfo.ei_cb_cp = ldlm_completion_ast;
+ einfo.ei_cb_gl = llu_glimpse_callback;
+ einfo.ei_cbdata = inode;
+
+ oinfo.oi_policy = *policy;
+ oinfo.oi_lockh = lockh;
+ oinfo.oi_md = lsm;
+
+ rc = obd_enqueue(sbi->ll_osc_exp, &oinfo, &einfo);
+ *policy = oinfo.oi_policy;
if (rc > 0)
rc = -EIO;
struct llu_inode_info *lli = llu_i2info(inode);
struct obd_export *exp = llu_i2obdexp(inode);
struct ptlrpc_request_set *set;
- struct obdo oa;
+ struct obd_info oinfo = { { { 0 } } };
+ struct obdo oa = { 0 };
obd_flag refresh_valid;
int rc;
ENTRY;
LASSERT(lsm);
LASSERT(lli);
- memset(&oa, 0, sizeof oa);
+ oinfo.oi_md = lsm;
+ oinfo.oi_oa = &oa;
oa.o_id = lsm->lsm_object_id;
oa.o_mode = S_IFREG;
oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
CERROR ("ENOMEM allocing request set\n");
rc = -ENOMEM;
} else {
- rc = obd_getattr_async(exp, &oa, lsm, set);
+ rc = obd_getattr_async(exp, &oinfo, set);
if (rc == 0)
rc = ptlrpc_set_wait(set);
ptlrpc_set_destroy(set);
rc = err;
}
} else if (ia_valid & (ATTR_MTIME | ATTR_MTIME_SET)) {
+ struct obd_info oinfo = { { { 0 } } };
struct obdo oa;
CDEBUG(D_INODE, "set mtime on OST inode %llu to %lu\n",
(long long)st->st_ino, LTIME_S(attr->ia_mtime));
oa.o_id = lsm->lsm_object_id;
oa.o_valid = OBD_MD_FLID;
+
obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
OBD_MD_FLMTIME | OBD_MD_FLCTIME);
- rc = obd_setattr(sbi->ll_osc_exp, &oa, lsm, NULL);
+
+ oinfo.oi_oa = &oa;
+ oinfo.oi_md = lsm;
+
+ rc = obd_setattr_rqset(sbi->ll_osc_exp, &oinfo, NULL);
if (rc)
- CERROR("obd_setattr fails: rc=%d\n", rc);
+ CERROR("obd_setattr_async fails: rc=%d\n", rc);
}
RETURN(rc);
}
CDEBUG(D_SUPER, "MDC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n",
osfs->os_bavail, osfs->os_blocks, osfs->os_ffree,osfs->os_files);
- rc = obd_statfs(class_exp2obd(sbi->ll_osc_exp), &obd_osfs, max_age);
+ rc = obd_statfs_rqset(class_exp2obd(sbi->ll_osc_exp),
+ &obd_statfs, max_age);
if (rc) {
CERROR("obd_statfs fails: rc = %d\n", rc);
RETURN(rc);
int cmd,
struct file_lock *file_lock)
{
- struct obd_device *obddev;
struct llu_inode_info *lli = llu_i2info(ino);
struct intnl_stat *st = llu_i2stat(ino);
struct ldlm_res_id res_id =
"start="LPU64", end="LPU64"\n", st->st_ino, flock.l_flock.pid,
flags, mode, flock.l_flock.start, flock.l_flock.end);
- obddev = llu_i2mdcexp(ino)->exp_obd;
- rc = ldlm_cli_enqueue(llu_i2mdcexp(ino), NULL, obddev->obd_namespace,
- res_id, LDLM_FLOCK, &flock, mode, &flags,
- NULL, ldlm_flock_completion_ast, NULL, file_lock,
- NULL, 0, NULL, &lockh);
+ rc = ldlm_cli_enqueue(llu_i2mdcexp(ino), NULL, res_id,
+ LDLM_FLOCK, &flock, mode, &flags, NULL,
+ ldlm_flock_completion_ast, NULL,
+ file_lock, NULL, 0, NULL, &lockh, 0);
RETURN(rc);
}
// error = "inode out of bounds";
bad_entry:
CERROR("%s: bad entry in directory %lu/%u: %s - "
- "offset=%lu+%u, inode=%lu, rec_len=%d, name_len=%d",
- ll_i2mdcexp(dir)->exp_obd->obd_name, dir->i_ino,
- dir->i_generation, error, (page->index<<PAGE_CACHE_SHIFT), offs,
- (unsigned long)le32_to_cpu(p->inode),
- rec_len, p->name_len);
+ "offset=%lu+%u, inode=%lu, rec_len=%d, name_len=%d\n",
+ ll_i2mdcexp(dir)->exp_obd->obd_name, dir->i_ino,
+ dir->i_generation, error, page->index << PAGE_CACHE_SHIFT, offs,
+ (unsigned long)le32_to_cpu(p->inode),
+ rec_len, p->name_len);
goto fail;
Eend:
p = (ext2_dirent *)(kaddr + offs);
- CERROR("ext2_check_page"
- "entry in directory #%lu spans the page boundary"
- "offset=%lu, inode=%lu",
- dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
- (unsigned long) le32_to_cpu(p->inode));
+ CERROR("%s: entry in directory %lu/%u spans the page boundary "
+ "offset=%lu+%u, inode=%lu\n",ll_i2mdcexp(dir)->exp_obd->obd_name,
+ dir->i_ino, dir->i_generation, page->index << PAGE_CACHE_SHIFT,
+ offs, (unsigned long)le32_to_cpu(p->inode));
fail:
SetPageChecked(page);
SetPageError(page);
struct lov_stripe_md *lsm;
struct lov_user_md_join *lmj;
int lmj_size, i, aindex = 0, rc;
-
+
rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize);
if (rc < 0)
GOTO(out_req, rc = -ENOMEM);
out_free_memmd:
obd_free_memmd(sbi->ll_osc_exp, &lsm);
if (rc)
- GOTO(out_req, rc);
+ GOTO(out_lmm, rc);
}
if (cmd == IOC_MDC_GETFILEINFO) {
struct lov_user_mds_data *lmdp;
lmdp = (struct lov_user_mds_data *)arg;
rc = copy_to_user(&lmdp->lmd_st, &st, sizeof(st));
if (rc)
- GOTO(out_req, rc = -EFAULT);
+ GOTO(out_lmm, rc = -EFAULT);
lump = &lmdp->lmd_lmm;
} else {
lump = (struct lov_user_md *)arg;
}
rc = copy_to_user(lump, lmm, lmmsize);
- if (lmm->lmm_magic == LOV_MAGIC_JOIN)
- OBD_FREE(lmm, lmmsize);
if (rc)
- GOTO(out_req, rc = -EFAULT);
+ GOTO(out_lmm, rc = -EFAULT);
EXIT;
+ out_lmm:
+ if (lmm->lmm_magic == LOV_MAGIC_JOIN)
+ OBD_FREE(lmm, lmmsize);
out_req:
ptlrpc_req_finished(request);
out_name:
struct obdo *oa)
{
struct ptlrpc_request_set *set;
+ struct obd_info oinfo = { { { 0 } } };
int rc;
ENTRY;
LASSERT(lsm != NULL);
memset(oa, 0, sizeof *oa);
+ oinfo.oi_md = lsm;
+ oinfo.oi_oa = oa;
oa->o_id = lsm->lsm_object_id;
oa->o_mode = S_IFREG;
oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
if (set == NULL) {
rc = -ENOMEM;
} else {
- rc = obd_getattr_async(exp, oa, lsm, set);
+ rc = obd_getattr_async(exp, &oinfo, set);
if (rc == 0)
rc = ptlrpc_set_wait(set);
ptlrpc_set_destroy(set);
{
struct ll_inode_info *lli = ll_i2info(inode);
struct ll_sb_info *sbi = ll_i2sbi(inode);
- ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
struct lustre_handle lockh = { 0 };
+ struct obd_enqueue_info einfo = { 0 };
+ struct obd_info oinfo = { { { 0 } } };
struct ost_lvb lvb;
int rc;
ENTRY;
RETURN(0);
}
- ast_flags |= LDLM_FL_HAS_INTENT;
-
/* NOTE: this looks like DLM lock request, but it may not be one. Due
* to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
* won't revoke any conflicting DLM locks held. Instead,
* holding a DLM lock against this file, and resulting size
* will be returned for each stripe. DLM lock on [0, EOF] is
* acquired only if there were no conflicting locks. */
- rc = obd_enqueue(sbi->ll_osc_exp, lli->lli_smd, LDLM_EXTENT, &policy,
- LCK_PR, &ast_flags, ll_extent_lock_callback,
- ldlm_completion_ast, ll_glimpse_callback, inode,
- sizeof(struct ost_lvb), lustre_swab_ost_lvb, &lockh);
+ einfo.ei_type = LDLM_EXTENT;
+ einfo.ei_mode = LCK_PR;
+ einfo.ei_flags = ast_flags | LDLM_FL_HAS_INTENT;
+ einfo.ei_cb_bl = ll_extent_lock_callback;
+ einfo.ei_cb_cp = ldlm_completion_ast;
+ einfo.ei_cb_gl = ll_glimpse_callback;
+ einfo.ei_cbdata = inode;
+
+ oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
+ oinfo.oi_lockh = &lockh;
+ oinfo.oi_md = lli->lli_smd;
+
+ rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
if (rc == -ENOENT)
RETURN(rc);
if (rc != 0) {
CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %lu\n",
inode->i_size, inode->i_blocks);
- obd_cancel(sbi->ll_osc_exp, lli->lli_smd, LCK_PR, &lockh);
-
RETURN(rc);
}
{
struct ll_sb_info *sbi = ll_i2sbi(inode);
struct ost_lvb lvb;
+ struct obd_enqueue_info einfo = { 0 };
+ struct obd_info oinfo = { { { 0 } } };
int rc;
ENTRY;
CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
inode->i_ino, policy->l_extent.start, policy->l_extent.end);
- rc = obd_enqueue(sbi->ll_osc_exp, lsm, LDLM_EXTENT, policy, mode,
- &ast_flags, ll_extent_lock_callback,
- ldlm_completion_ast, ll_glimpse_callback, inode,
- sizeof(struct ost_lvb), lustre_swab_ost_lvb, lockh);
+ einfo.ei_type = LDLM_EXTENT;
+ einfo.ei_mode = mode;
+ einfo.ei_flags = ast_flags;
+ einfo.ei_cb_bl = ll_extent_lock_callback;
+ einfo.ei_cb_cp = ldlm_completion_ast;
+ einfo.ei_cb_gl = ll_glimpse_callback;
+ einfo.ei_cbdata = inode;
+
+ oinfo.oi_policy = *policy;
+ oinfo.oi_lockh = lockh;
+ oinfo.oi_md = lsm;
+
+ rc = obd_enqueue(sbi->ll_osc_exp, &oinfo, &einfo);
+ *policy = oinfo.oi_policy;
if (rc > 0)
rc = -EIO;
{
struct inode *inode = file->f_dentry->d_inode;
struct ll_sb_info *sbi = ll_i2sbi(inode);
- struct obd_device *obddev;
struct ldlm_res_id res_id =
{ .name = {inode->i_ino, inode->i_generation, LDLM_FLOCK} };
struct lustre_handle lockh = {0};
"start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
flags, mode, flock.l_flock.start, flock.l_flock.end);
- obddev = sbi->ll_mdc_exp->exp_obd;
- rc = ldlm_cli_enqueue(sbi->ll_mdc_exp, NULL, obddev->obd_namespace,
- res_id, LDLM_FLOCK, &flock, mode, &flags,
- NULL, ldlm_flock_completion_ast, NULL, file_lock,
- NULL, 0, NULL, &lockh);
+ rc = ldlm_cli_enqueue(sbi->ll_mdc_exp, NULL, res_id,
+ LDLM_FLOCK, &flock, mode, &flags, NULL,
+ ldlm_flock_completion_ast, NULL, file_lock,
+ NULL, 0, NULL, &lockh, 0);
RETURN(rc);
}
}
} else if (ia_valid & (ATTR_MTIME | ATTR_MTIME_SET)) {
obd_flag flags;
+ struct obd_info oinfo = { { { 0 } } };
struct obdo oa;
CDEBUG(D_INODE, "set mtime on OST inode %lu to %lu\n",
inode->i_ino, LTIME_S(attr->ia_mtime));
-
+
oa.o_id = lsm->lsm_object_id;
oa.o_valid = OBD_MD_FLID;
flags = OBD_MD_FLTYPE | OBD_MD_FLATIME |
OBD_MD_FLMTIME | OBD_MD_FLCTIME |
OBD_MD_FLFID | OBD_MD_FLGENER;
-
+
obdo_from_inode(&oa, inode, flags);
- rc = obd_setattr(sbi->ll_osc_exp, &oa, lsm, NULL);
+
+ oinfo.oi_oa = &oa;
+ oinfo.oi_md = lsm;
+
+ rc = obd_setattr_rqset(sbi->ll_osc_exp, &oinfo, NULL);
if (rc)
- CERROR("obd_setattr fails: rc=%d\n", rc);
+ CERROR("obd_setattr_async fails: rc=%d\n", rc);
}
RETURN(rc);
}
CDEBUG(D_SUPER, "MDC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n",
osfs->os_bavail, osfs->os_blocks, osfs->os_ffree,osfs->os_files);
- rc = obd_statfs(class_exp2obd(sbi->ll_osc_exp), &obd_osfs, max_age);
+ rc = obd_statfs_rqset(class_exp2obd(sbi->ll_osc_exp),
+ &obd_osfs, max_age);
if (rc) {
CERROR("obd_statfs fails: rc = %d\n", rc);
RETURN(rc);
}
case EXT3_IOC_SETFLAGS: {
struct mdc_op_data op_data;
- struct ll_iattr_struct attr;
- struct obdo *oa;
+ struct ll_iattr_struct attr = { 0 };
+ struct obd_info oinfo = { { { 0 } } };
struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
if (get_user(flags, (int *)arg))
RETURN(-EFAULT);
- oa = obdo_alloc();
- if (!oa)
+ oinfo.oi_md = lsm;
+ oinfo.oi_oa = obdo_alloc();
+ if (!oinfo.oi_oa)
RETURN(-ENOMEM);
ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
- memset(&attr, 0x0, sizeof(attr));
attr.ia_attr_flags = flags;
((struct iattr *)&attr)->ia_valid |= ATTR_ATTR_FLAG;
rc = mdc_setattr(sbi->ll_mdc_exp, &op_data,
(struct iattr *)&attr, NULL, 0, NULL, 0, &req);
+ ptlrpc_req_finished(req);
if (rc || lsm == NULL) {
- ptlrpc_req_finished(req);
- obdo_free(oa);
+ obdo_free(oinfo.oi_oa);
RETURN(rc);
}
- ptlrpc_req_finished(req);
- oa->o_id = lsm->lsm_object_id;
- oa->o_flags = flags;
- oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
+ oinfo.oi_oa->o_id = lsm->lsm_object_id;
+ oinfo.oi_oa->o_flags = flags;
+ oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
- obdo_from_inode(oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER);
- rc = obd_setattr(sbi->ll_osc_exp, oa, lsm, NULL);
- obdo_free(oa);
+ obdo_from_inode(oinfo.oi_oa, inode,
+ OBD_MD_FLFID | OBD_MD_FLGENER);
+ rc = obd_setattr_rqset(sbi->ll_osc_exp, &oinfo, NULL);
+ obdo_free(oinfo.oi_oa);
if (rc) {
if (rc != -EPERM && rc != -EACCES)
- CERROR("mdc_setattr fails: rc = %d\n", rc);
+ CERROR("mdc_setattr_async fails: rc = %d\n", rc);
RETURN(rc);
}
{
struct super_block *sb = data;
struct ll_sb_info *sbi = ll_s2sbi(sb);
- unsigned val;
+ long pages_number;
+ int mult;
spin_lock(&sbi->ll_lock);
- val = sbi->ll_ra_info.ra_max_pages >> (20 - PAGE_CACHE_SHIFT);
+ pages_number = sbi->ll_ra_info.ra_max_pages;
spin_unlock(&sbi->ll_lock);
- return snprintf(page, count, "%u\n", val);
+ mult = 1 << (20 - PAGE_CACHE_SHIFT);
+ return lprocfs_read_frac_helper(page, count, pages_number, mult);
}
static int ll_wr_max_readahead_mb(struct file *file, const char *buffer,
{
struct super_block *sb = data;
struct ll_sb_info *sbi = ll_s2sbi(sb);
- int val, rc;
+ int mult, rc, pages_number;
- rc = lprocfs_write_helper(buffer, count, &val);
+ mult = 1 << (20 - PAGE_CACHE_SHIFT);
+ rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
if (rc)
return rc;
- if (val < 0 || val > (num_physpages >> (20 - PAGE_CACHE_SHIFT - 1))) {
+ if (pages_number < 0 || pages_number > num_physpages / 2) {
CERROR("can't set file readahead more than %lu MB\n",
- num_physpages >> (20 - PAGE_CACHE_SHIFT - 1));
+ num_physpages >> (20 - PAGE_CACHE_SHIFT + 1)); /*1/2 of RAM*/
return -ERANGE;
}
spin_lock(&sbi->ll_lock);
- sbi->ll_ra_info.ra_max_pages = val << (20 - PAGE_CACHE_SHIFT);
+ sbi->ll_ra_info.ra_max_pages = pages_number;
spin_unlock(&sbi->ll_lock);
return count;
{
struct super_block *sb = data;
struct ll_sb_info *sbi = ll_s2sbi(sb);
- unsigned val;
+ long pages_number;
+ int mult;
spin_lock(&sbi->ll_lock);
- val = sbi->ll_ra_info.ra_max_read_ahead_whole_pages >>
- (20 - PAGE_CACHE_SHIFT);
+ pages_number = sbi->ll_ra_info.ra_max_read_ahead_whole_pages;
spin_unlock(&sbi->ll_lock);
- return snprintf(page, count, "%u\n", val);
+ mult = 1 << (20 - PAGE_CACHE_SHIFT);
+ return lprocfs_read_frac_helper(page, count, pages_number, mult);
}
static int ll_wr_max_read_ahead_whole_mb(struct file *file, const char *buffer,
{
struct super_block *sb = data;
struct ll_sb_info *sbi = ll_s2sbi(sb);
- int val, rc;
+ int mult, rc, pages_number;
- rc = lprocfs_write_helper(buffer, count, &val);
+ mult = 1 << (20 - PAGE_CACHE_SHIFT);
+ rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
if (rc)
return rc;
/* Cap this at the current max readahead window size, the readahead
* algorithm does this anyway so it's pointless to set it larger. */
- if (val < 0 ||
- val > (sbi->ll_ra_info.ra_max_pages >> (20 - PAGE_CACHE_SHIFT))) {
+ if (pages_number < 0 || pages_number > sbi->ll_ra_info.ra_max_pages) {
CERROR("can't set max_read_ahead_whole_mb more than "
"max_read_ahead_mb: %lu\n",
sbi->ll_ra_info.ra_max_pages >> (20 - PAGE_CACHE_SHIFT));
}
spin_lock(&sbi->ll_lock);
- sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
- val << (20 - PAGE_CACHE_SHIFT);
+ sbi->ll_ra_info.ra_max_read_ahead_whole_pages = pages_number;
spin_unlock(&sbi->ll_lock);
return count;
{
struct super_block *sb = data;
struct ll_sb_info *sbi = ll_s2sbi(sb);
- unsigned val;
+ long pages_number;
+ int mult;
spin_lock(&sbi->ll_lock);
- val = sbi->ll_async_page_max >> (20 - PAGE_CACHE_SHIFT);
+ pages_number = sbi->ll_async_page_max;
spin_unlock(&sbi->ll_lock);
- return snprintf(page, count, "%u\n", val);
+ mult = 1 << (20 - PAGE_CACHE_SHIFT);
+ return lprocfs_read_frac_helper(page, count, pages_number, mult);;
}
static int ll_wr_max_cached_mb(struct file *file, const char *buffer,
{
struct super_block *sb = data;
struct ll_sb_info *sbi = ll_s2sbi(sb);
- int val, rc;
+ int mult, rc, pages_number;
- rc = lprocfs_write_helper(buffer, count, &val);
+ mult = 1 << (20 - PAGE_CACHE_SHIFT);
+ rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
if (rc)
return rc;
- if (val < 0 || val > (num_physpages >> (20 - PAGE_CACHE_SHIFT))) {
+ if (pages_number < 0 || pages_number > num_physpages) {
CERROR("can't set max cache more than %lu MB\n",
num_physpages >> (20 - PAGE_CACHE_SHIFT));
return -ERANGE;
}
spin_lock(&sbi->ll_lock);
- sbi->ll_async_page_max = val << (20 - PAGE_CACHE_SHIFT);
+ sbi->ll_async_page_max = pages_number ;
spin_unlock(&sbi->ll_lock);
if (sbi->ll_async_page_count >= sbi->ll_async_page_max)
{
struct ll_inode_info *lli = ll_i2info(inode);
struct lov_stripe_md *lsm = lli->lli_smd;
+ struct obd_info oinfo = { { { 0 } } };
struct brw_page pg;
int rc;
ENTRY;
else
lprocfs_counter_add(ll_i2sbi(inode)->ll_stats,
LPROC_LL_BRW_READ, pg.count);
- rc = obd_brw(cmd, ll_i2obdexp(inode), oa, lsm, 1, &pg, NULL);
+ oinfo.oi_oa = oa;
+ oinfo.oi_md = lsm;
+ rc = obd_brw(cmd, ll_i2obdexp(inode), &oinfo, 1, &pg, NULL);
if (rc == 0)
obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS);
else if (rc != -EIO)
void ll_truncate(struct inode *inode)
{
struct ll_inode_info *lli = ll_i2info(inode);
- struct lov_stripe_md *lsm = lli->lli_smd;
+ struct obd_info oinfo = { { { 0 } } };
struct ost_lvb lvb;
struct obdo oa;
int rc;
return;
}
- if (!lsm) {
+ if (!lli->lli_smd) {
CDEBUG(D_INODE, "truncate on inode %lu with no objects\n",
inode->i_ino);
GOTO(out_unlock, 0);
/* XXX I'm pretty sure this is a hack to paper over a more fundamental
* race condition. */
- lov_stripe_lock(lsm);
+ lov_stripe_lock(lli->lli_smd);
inode_init_lvb(inode, &lvb);
- obd_merge_lvb(ll_i2obdexp(inode), lsm, &lvb, 0);
+ obd_merge_lvb(ll_i2obdexp(inode), lli->lli_smd, &lvb, 0);
if (lvb.lvb_size == inode->i_size) {
CDEBUG(D_VFSTRACE, "skipping punch for obj "LPX64", %Lu=%#Lx\n",
- lsm->lsm_object_id, inode->i_size, inode->i_size);
- lov_stripe_unlock(lsm);
+ lli->lli_smd->lsm_object_id, inode->i_size, inode->i_size);
+ lov_stripe_unlock(lli->lli_smd);
GOTO(out_unlock, 0);
}
- obd_adjust_kms(ll_i2obdexp(inode), lsm, inode->i_size, 1);
- lov_stripe_unlock(lsm);
+ obd_adjust_kms(ll_i2obdexp(inode), lli->lli_smd, inode->i_size, 1);
+ lov_stripe_unlock(lli->lli_smd);
if (unlikely((ll_i2sbi(inode)->ll_flags & LL_SBI_CHECKSUM) &&
(inode->i_size & ~PAGE_MASK))) {
}
CDEBUG(D_INFO, "calling punch for "LPX64" (new size %Lu=%#Lx)\n",
- lsm->lsm_object_id, inode->i_size, inode->i_size);
+ lli->lli_smd->lsm_object_id, inode->i_size, inode->i_size);
- oa.o_id = lsm->lsm_object_id;
+ oinfo.oi_md = lli->lli_smd;
+ oinfo.oi_policy.l_extent.start = inode->i_size;
+ oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
+ oinfo.oi_oa = &oa;
+ oa.o_id = lli->lli_smd->lsm_object_id;
oa.o_valid = OBD_MD_FLID;
obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE |
ll_inode_size_unlock(inode, 0);
- rc = obd_punch(ll_i2obdexp(inode), &oa, lsm, inode->i_size,
- OBD_OBJECT_EOF, NULL);
+ rc = obd_punch_rqset(ll_i2obdexp(inode), &oinfo, NULL);
if (rc)
CERROR("obd_truncate fails (%d) ino %lu\n", rc, inode->i_ino);
else
struct ll_inode_info *lli = ll_i2info(inode);
struct lov_stripe_md *lsm = lli->lli_smd;
obd_off offset = ((obd_off)page->index) << PAGE_SHIFT;
+ struct obd_info oinfo = { { { 0 } } };
struct brw_page pga;
struct obdo oa;
struct ost_lvb lvb;
oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE;
obdo_from_inode(&oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER);
- rc = obd_brw(OBD_BRW_CHECK, ll_i2obdexp(inode), &oa, lsm,
- 1, &pga, NULL);
+ oinfo.oi_oa = &oa;
+ oinfo.oi_md = lsm;
+ rc = obd_brw(OBD_BRW_CHECK, ll_i2obdexp(inode), &oinfo, 1, &pga, NULL);
if (rc)
RETURN(rc);
struct ll_inode_info *lli = ll_i2info(inode);
struct lov_stripe_md *lsm = lli->lli_smd;
struct brw_page *pga;
- struct ptlrpc_request_set *set;
struct obdo oa;
int length, i, flags, rc = 0;
loff_t offset;
(iobuf->length & (PAGE_SIZE - 1)))
RETURN(-EINVAL);
- set = ptlrpc_prep_set();
- if (set == NULL)
- RETURN(-ENOMEM);
-
OBD_ALLOC(pga, sizeof(*pga) * iobuf->nr_pages);
- if (!pga) {
- ptlrpc_set_destroy(set);
+ if (!pga)
RETURN(-ENOMEM);
- }
flags = 0 /* | OBD_BRW_DIRECTIO */;
offset = ((obd_off)blocknr << inode->i_blkbits);
else
lprocfs_counter_add(ll_i2sbi(inode)->ll_stats,
LPROC_LL_DIRECT_READ, iobuf->length);
- rc = obd_brw_async(rw, ll_i2obdexp(inode), &oa, lsm, iobuf->nr_pages,
- pga, set, NULL);
- if (rc) {
- CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR,
- "error from obd_brw_async: rc = %d\n", rc);
- } else {
- rc = ptlrpc_set_wait(set);
- if (rc)
- CERROR("error from callback: rc = %d\n", rc);
- }
- ptlrpc_set_destroy(set);
+ rc = obd_brw_rqset(rw, ll_i2obdexp(inode), &oa, lsm, iobuf->nr_pages,
+ pga, NULL);
if (rc == 0) {
rc = iobuf->length;
if (rw == OBD_BRW_WRITE) {
return 1;
}
+#define MAX_DIRECTIO_SIZE 2*1024*1024*1024UL
+
+static inline int ll_get_user_pages(int rw, unsigned long user_addr,
+ size_t size, struct page ***pages)
+{
+ int result = -ENOMEM;
+ unsigned long page_count;
+
+ /* set an arbitrary limit to prevent arithmetic overflow */
+ if (size > MAX_DIRECTIO_SIZE) {
+ *pages = NULL;
+ return -EFBIG;
+ }
+
+ page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ page_count -= user_addr >> PAGE_SHIFT;
+
+ OBD_ALLOC_GFP(*pages, page_count * sizeof(**pages), GFP_KERNEL);
+ if (*pages) {
+ down_read(&current->mm->mmap_sem);
+ result = get_user_pages(current, current->mm, user_addr,
+ page_count, (rw == READ), 0, *pages,
+ NULL);
+ up_read(&current->mm->mmap_sem);
+ }
+
+ return result;
+}
+
+/* ll_free_user_pages - tear down page struct array
+ * @pages: array of page struct pointers underlying target buffer */
+static void ll_free_user_pages(struct page **pages, int npages, int do_dirty)
+{
+ int i;
+
+ for (i = 0; i < npages; i++) {
+ if (do_dirty)
+ set_page_dirty_lock(pages[i]);
+ page_cache_release(pages[i]);
+ }
+
+ OBD_FREE(pages, npages * sizeof(*pages));
+}
+
+static ssize_t ll_direct_IO_26_seg(int rw, struct file *file,
+ struct address_space *mapping,
+ struct inode *inode,
+ struct lov_stripe_md *lsm,
+ unsigned long user_addr, size_t size,
+ loff_t file_offset, struct page **pages,
+ int page_count)
+{
+ struct brw_page *pga;
+ struct obdo oa;
+ int i, rc = 0;
+ size_t length;
+ ENTRY;
+
+ OBD_ALLOC(pga, sizeof(*pga) * page_count);
+ if (!pga)
+ RETURN(-ENOMEM);
+
+ for (i = 0, length = size; length > 0;
+ length -=pga[i].count, file_offset +=pga[i].count,i++) {/*i last!*/
+ pga[i].pg = pages[i];
+ pga[i].off = file_offset;
+ /* To the end of the page, or the length, whatever is less */
+ pga[i].count = min_t(int, PAGE_SIZE -(file_offset & ~PAGE_MASK),
+ length);
+ pga[i].flag = 0;
+ if (rw == READ)
+ POISON_PAGE(pages[i], 0x0d);
+ }
+
+ ll_inode_fill_obdo(inode, rw, &oa);
+
+ if (rw == WRITE)
+ lprocfs_counter_add(ll_i2sbi(inode)->ll_stats,
+ LPROC_LL_DIRECT_WRITE, size);
+ else
+ lprocfs_counter_add(ll_i2sbi(inode)->ll_stats,
+ LPROC_LL_DIRECT_READ, size);
+ rc = obd_brw_rqset(rw == WRITE ? OBD_BRW_WRITE : OBD_BRW_READ,
+ ll_i2obdexp(inode), &oa, lsm, page_count, pga, NULL);
+ if (rc == 0) {
+ rc = size;
+ if (rw == WRITE) {
+ lov_stripe_lock(lsm);
+ obd_adjust_kms(ll_i2obdexp(inode), lsm, file_offset, 0);
+ lov_stripe_unlock(lsm);
+ }
+ }
+
+ OBD_FREE(pga, sizeof(*pga) * page_count);
+ RETURN(rc);
+}
+
+static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t file_offset,
+ unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t count = iov_length(iov, nr_segs), tot_bytes = 0;
+ struct ll_inode_info *lli = ll_i2info(file->f_mapping->host);
+ unsigned long seg = 0;
+ ENTRY;
+
+ if (!lli->lli_smd || !lli->lli_smd->lsm_object_id)
+ RETURN(-EBADF);
+
+ /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */
+ if ((file_offset & (PAGE_SIZE - 1)) || (count & (PAGE_SIZE - 1)))
+ RETURN(-EINVAL);
+
+ while ((seg < nr_segs) && (tot_bytes >= 0)) {
+ const struct iovec *vec = &iov[seg++];
+ unsigned long user_addr = (unsigned long)vec->iov_base;
+ size_t size = vec->iov_len;
+ struct page **pages;
+ int page_count;
+ ssize_t result;
+
+ page_count = ll_get_user_pages(rw, user_addr, size, &pages);
+ if (page_count < 0) {
+ ll_free_user_pages(pages, 0, 0);
+ if (tot_bytes > 0)
+ break;
+ return page_count;
+ }
+
+ result = ll_direct_IO_26_seg(rw, file, file->f_mapping,
+ file->f_mapping->host,
+ lli->lli_smd, user_addr, size,
+ file_offset, pages, page_count);
+ ll_free_user_pages(pages, page_count, rw == READ);
+
+ if (result <= 0) {
+ if (tot_bytes > 0)
+ break;
+ return result;
+ }
+
+ tot_bytes += result;
+ file_offset += result;
+ if (result < size)
+ break;
+ }
+ return tot_bytes;
+}
+
struct address_space_operations ll_aops = {
.readpage = ll_readpage,
// .readpages = ll_readpages,
-// .direct_IO = ll_direct_IO_26,
+ .direct_IO = ll_direct_IO_26,
.writepage = ll_writepage_26,
.writepages = generic_writepages,
.set_page_dirty = __set_page_dirty_nobuffers,
};
struct lov_request {
+ struct obd_info rq_oi;
+ struct lov_request_set *rq_rqset;
+
struct list_head rq_link;
- struct ldlm_extent rq_extent;
+
int rq_idx; /* index in lov->tgts array */
int rq_stripe; /* stripe number */
int rq_complete;
int rq_rc;
int rq_buflen; /* length of sub_md */
- struct obdo *rq_oa;
- struct lov_stripe_md *rq_md;
+
obd_count rq_oabufs;
obd_count rq_pgaidx;
};
struct lov_request_set {
+ struct obd_enqueue_info *set_ei;
+ struct obd_info *set_oi;
atomic_t set_refcount;
struct obd_export *set_exp;
+ /* XXX: There is @set_exp already, however obd_statfs gets obd_device
+ only. */
+ struct obd_device *set_obd;
int set_count;
int set_completes;
int set_success;
struct llog_cookie *set_cookies;
int set_cookie_sent;
- struct lov_stripe_md *set_md;
- struct obdo *set_oa;
struct obd_trans_info *set_oti;
obd_count set_oabufs;
struct brw_page *set_pga;
/* lov_request.c */
void lov_set_add_req(struct lov_request *req, struct lov_request_set *set);
+void lov_update_set(struct lov_request_set *set,
+ struct lov_request *req, int rc);
int lov_update_common_set(struct lov_request_set *set,
struct lov_request *req, int rc);
-int lov_prep_create_set(struct obd_export *exp, struct lov_stripe_md **ea,
- struct obdo *src_oa, struct obd_trans_info *oti,
+int lov_prep_create_set(struct obd_export *exp, struct obd_info *oifo,
+ struct lov_stripe_md **ea, struct obdo *src_oa,
+ struct obd_trans_info *oti,
struct lov_request_set **reqset);
int lov_update_create_set(struct lov_request_set *set,
struct lov_request *req, int rc);
int lov_fini_create_set(struct lov_request_set *set, struct lov_stripe_md **ea);
-int lov_prep_brw_set(struct obd_export *exp, struct obdo *src_oa,
- struct lov_stripe_md *lsm, obd_count oa_bufs,
- struct brw_page *pga, struct obd_trans_info *oti,
+int lov_prep_brw_set(struct obd_export *exp, struct obd_info *oinfo,
+ obd_count oa_bufs, struct brw_page *pga,
+ struct obd_trans_info *oti,
struct lov_request_set **reqset);
int lov_fini_brw_set(struct lov_request_set *set);
-int lov_prep_getattr_set(struct obd_export *exp, struct obdo *src_oa,
- struct lov_stripe_md *lsm,
+int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo,
struct lov_request_set **reqset);
int lov_fini_getattr_set(struct lov_request_set *set);
-int lov_prep_destroy_set(struct obd_export *exp, struct obdo *src_oa,
- struct lov_stripe_md *lsm,
+int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct obdo *src_oa, struct lov_stripe_md *lsm,
struct obd_trans_info *oti,
struct lov_request_set **reqset);
int lov_update_destroy_set(struct lov_request_set *set,
struct lov_request *req, int rc);
int lov_fini_destroy_set(struct lov_request_set *set);
-int lov_prep_setattr_set(struct obd_export *exp, struct obdo *src_oa,
- struct lov_stripe_md *lsm, struct obd_trans_info *oti,
+int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
struct lov_request_set **reqset);
int lov_update_setattr_set(struct lov_request_set *set,
struct lov_request *req, int rc);
int lov_fini_setattr_set(struct lov_request_set *set);
-int lov_prep_punch_set(struct obd_export *exp, struct obdo *src_oa,
- struct lov_stripe_md *lsm, obd_off start,
- obd_off end, struct obd_trans_info *oti,
+int lov_prep_punch_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
struct lov_request_set **reqset);
-int lov_update_punch_set(struct lov_request_set *set, struct lov_request *req,
- int rc);
int lov_fini_punch_set(struct lov_request_set *set);
-int lov_prep_sync_set(struct obd_export *exp, struct obdo *src_oa,
+int lov_prep_sync_set(struct obd_export *exp, struct obd_info *obd_info,
+ struct obdo *src_oa,
struct lov_stripe_md *lsm, obd_off start,
obd_off end, struct lov_request_set **reqset);
int lov_fini_sync_set(struct lov_request_set *set);
-int lov_prep_enqueue_set(struct obd_export *exp, struct lov_stripe_md *lsm,
- ldlm_policy_data_t *policy, __u32 mode,
- struct lustre_handle *lockh,
+int lov_prep_enqueue_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_enqueue_info *einfo,
struct lov_request_set **reqset);
-int lov_update_enqueue_set(struct lov_request_set *set,
- struct lov_request *req, int rc, int flags);
int lov_fini_enqueue_set(struct lov_request_set *set, __u32 mode);
-int lov_prep_match_set(struct obd_export *exp, struct lov_stripe_md *lsm,
+int lov_prep_match_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct lov_stripe_md *lsm,
ldlm_policy_data_t *policy, __u32 mode,
struct lustre_handle *lockh,
struct lov_request_set **reqset);
int lov_update_match_set(struct lov_request_set *set, struct lov_request *req,
int rc);
int lov_fini_match_set(struct lov_request_set *set, __u32 mode, int flags);
-int lov_prep_cancel_set(struct obd_export *exp, struct lov_stripe_md *lsm,
+int lov_prep_cancel_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct lov_stripe_md *lsm,
__u32 mode, struct lustre_handle *lockh,
struct lov_request_set **reqset);
int lov_fini_cancel_set(struct lov_request_set *set);
+int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo,
+ struct lov_request_set **reqset);
+void lov_update_statfs(struct obd_device *obd, struct obd_statfs *osfs,
+ struct obd_statfs *lov_sfs, int success);
+int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs,
+ int success);
+int lov_fini_statfs_set(struct lov_request_set *set);
/* lov_obd.c */
int lov_get_stripecnt(struct lov_obd *lov, int stripe_count);
/* Merge the lock value block(&lvb) attributes from each of the stripes in a
* file into a single lvb. It is expected that the caller initializes the
- * current atime, mtime, ctime to avoid regressing a more uptodate time on
+ * current atime, mtime, ctime to avoid regressing a more uptodate time on
* the local client.
*
* If @kms_only is set then we do not consider the recently seen size (rss)
lov_size = lov_stripe_size(lsm, tmpsize, i);
if (lov_size > size)
size = lov_size;
- /* merge blocks, mtime, atime */
+ /* merge blocks, mtime, atime */
blocks += loi->loi_lvb.lvb_blocks;
if (loi->loi_lvb.lvb_mtime > current_mtime)
current_mtime = loi->loi_lvb.lvb_mtime;
lvb->lvb_size = size;
lvb->lvb_blocks = blocks;
- lvb->lvb_mtime = current_mtime;
- lvb->lvb_atime = current_atime;
- lvb->lvb_ctime = current_ctime;
+ lvb->lvb_mtime = current_mtime;
+ lvb->lvb_atime = current_atime;
+ lvb->lvb_ctime = current_ctime;
RETURN(0);
}
struct lov_stripe_md **ea, struct obd_trans_info *oti)
{
struct lov_obd *lov;
+ struct obd_info oinfo;
struct lov_request_set *set = NULL;
struct obd_statfs osfs;
unsigned long maxage;
}
maxage = cfs_time_shift(-lov->desc.ld_qos_maxage);
- obd_statfs(exp->exp_obd, &osfs, maxage);
+ obd_statfs_rqset(exp->exp_obd, &osfs, maxage);
- rc = lov_prep_create_set(exp, ea, src_oa, oti, &set);
+ rc = lov_prep_create_set(exp, &oinfo, ea, src_oa, oti, &set);
if (rc)
RETURN(rc);
list_for_each_entry(req, &set->set_list, rq_link) {
/* XXX: LOV STACKING: use real "obj_mdp" sub-data */
rc = obd_create(lov->tgts[req->rq_idx].ltd_exp,
- req->rq_oa, &req->rq_md, oti);
+ req->rq_oi.oi_oa, &req->rq_oi.oi_md, oti);
lov_update_create_set(set, req, rc);
}
rc = lov_fini_create_set(set, ea);
struct obd_export *md_exp)
{
struct lov_request_set *set;
+ struct obd_info oinfo;
struct lov_request *req;
struct list_head *pos;
struct lov_obd *lov;
- int rc = 0;
+ int rc = 0, err;
ENTRY;
ASSERT_LSM_MAGIC(lsm);
if (!exp || !exp->exp_obd)
RETURN(-ENODEV);
+ if (oa->o_valid & OBD_MD_FLCOOKIE) {
+ LASSERT(oti);
+ LASSERT(oti->oti_logcookies);
+ }
+
lov = &exp->exp_obd->u.lov;
- rc = lov_prep_destroy_set(exp, oa, lsm, oti, &set);
+ rc = lov_prep_destroy_set(exp, &oinfo, oa, lsm, oti, &set);
if (rc)
RETURN(rc);
int err;
req = list_entry(pos, struct lov_request, rq_link);
- /* XXX update the cookie position */
- oti->oti_logcookies = set->set_cookies + req->rq_stripe;
- rc = obd_destroy(lov->tgts[req->rq_idx].ltd_exp, req->rq_oa,
- NULL, oti, NULL);
- err = lov_update_common_set(set, req, rc);
+ if (oa->o_valid & OBD_MD_FLCOOKIE)
+ oti->oti_logcookies = set->set_cookies + req->rq_stripe;
+
+ err = obd_destroy(lov->tgts[req->rq_idx].ltd_exp,
+ req->rq_oi.oi_oa, NULL, oti, NULL);
+ err = lov_update_common_set(set, req, err);
if (err) {
CERROR("error: destroying objid "LPX64" subobj "
LPX64" on OST idx %d: rc = %d\n",
- set->set_oa->o_id, req->rq_oa->o_id,
- req->rq_idx, rc);
+ oa->o_id, req->rq_oi.oi_oa->o_id,
+ req->rq_idx, err);
if (!rc)
rc = err;
}
}
- rc = lov_fini_destroy_set(set);
+
if (rc == 0) {
LASSERT(lsm_op_find(lsm->lsm_magic) != NULL);
rc = lsm_op_find(lsm->lsm_magic)->lsm_destroy(lsm, oa, md_exp);
}
- RETURN(rc);
+ err = lov_fini_destroy_set(set);
+ RETURN(rc ? rc : err);
}
-static int lov_getattr(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *lsm)
+static int lov_getattr(struct obd_export *exp, struct obd_info *oinfo)
{
struct lov_request_set *set;
struct lov_request *req;
int err = 0, rc = 0;
ENTRY;
- ASSERT_LSM_MAGIC(lsm);
+ LASSERT(oinfo);
+ ASSERT_LSM_MAGIC(oinfo->oi_md);
if (!exp || !exp->exp_obd)
RETURN(-ENODEV);
lov = &exp->exp_obd->u.lov;
- rc = lov_prep_getattr_set(exp, oa, lsm, &set);
+ rc = lov_prep_getattr_set(exp, oinfo, &set);
if (rc)
RETURN(rc);
req = list_entry(pos, struct lov_request, rq_link);
CDEBUG(D_INFO, "objid "LPX64"[%d] has subobj "LPX64" at idx "
- "%u\n", oa->o_id, req->rq_stripe, req->rq_oa->o_id,
- req->rq_idx);
+ "%u\n", oinfo->oi_oa->o_id, req->rq_stripe,
+ req->rq_oi.oi_oa->o_id, req->rq_idx);
- rc = obd_getattr(lov->tgts[req->rq_idx].ltd_exp,
- req->rq_oa, NULL);
+ rc = obd_getattr(lov->tgts[req->rq_idx].ltd_exp, &req->rq_oi);
err = lov_update_common_set(set, req, rc);
if (err) {
CERROR("error: getattr objid "LPX64" subobj "
LPX64" on OST idx %d: rc = %d\n",
- set->set_oa->o_id, req->rq_oa->o_id,
+ oinfo->oi_oa->o_id, req->rq_oi.oi_oa->o_id,
req->rq_idx, err);
break;
}
RETURN(rc);
}
-static int lov_getattr_interpret(struct ptlrpc_request_set *rqset, void *data,
- int rc)
+static int lov_getattr_interpret(struct ptlrpc_request_set *rqset,
+ void *data, int rc)
{
struct lov_request_set *lovset = (struct lov_request_set *)data;
+ int err;
ENTRY;
/* don't do attribute merge if this aysnc op failed */
- if (rc) {
+ if (rc)
lovset->set_completes = 0;
- lov_fini_getattr_set(lovset);
- } else {
- rc = lov_fini_getattr_set(lovset);
- }
- RETURN (rc);
+ err = lov_fini_getattr_set(lovset);
+ RETURN(rc ? rc : err);
}
-static int lov_getattr_async(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *lsm,
+static int lov_getattr_async(struct obd_export *exp, struct obd_info *oinfo,
struct ptlrpc_request_set *rqset)
{
struct lov_request_set *lovset;
struct lov_obd *lov;
struct list_head *pos;
struct lov_request *req;
- int rc = 0;
+ int rc = 0, err;
ENTRY;
- ASSERT_LSM_MAGIC(lsm);
+ LASSERT(oinfo);
+ ASSERT_LSM_MAGIC(oinfo->oi_md);
if (!exp || !exp->exp_obd)
RETURN(-ENODEV);
lov = &exp->exp_obd->u.lov;
- rc = lov_prep_getattr_set(exp, oa, lsm, &lovset);
+ rc = lov_prep_getattr_set(exp, oinfo, &lovset);
if (rc)
RETURN(rc);
CDEBUG(D_INFO, "objid "LPX64": %ux%u byte stripes\n",
- lsm->lsm_object_id, lsm->lsm_stripe_count, lsm->lsm_stripe_size);
+ oinfo->oi_md->lsm_object_id, oinfo->oi_md->lsm_stripe_count,
+ oinfo->oi_md->lsm_stripe_size);
list_for_each (pos, &lovset->set_list) {
req = list_entry(pos, struct lov_request, rq_link);
CDEBUG(D_INFO, "objid "LPX64"[%d] has subobj "LPX64" at idx "
- "%u\n", oa->o_id, req->rq_stripe, req->rq_oa->o_id,
- req->rq_idx);
+ "%u\n", oinfo->oi_oa->o_id, req->rq_stripe,
+ req->rq_oi.oi_oa->o_id, req->rq_idx);
rc = obd_getattr_async(lov->tgts[req->rq_idx].ltd_exp,
- req->rq_oa, NULL, rqset);
+ &req->rq_oi, rqset);
if (rc) {
CERROR("error: getattr objid "LPX64" subobj "
LPX64" on OST idx %d: rc = %d\n",
- lovset->set_oa->o_id, req->rq_oa->o_id,
+ oinfo->oi_oa->o_id, req->rq_oi.oi_oa->o_id,
req->rq_idx, rc);
GOTO(out, rc);
}
- lov_update_common_set(lovset, req, rc);
}
- LASSERT(rc == 0);
- LASSERT (rqset->set_interpret == NULL);
- rqset->set_interpret = lov_getattr_interpret;
- rqset->set_arg = (void *)lovset;
- RETURN(rc);
+ if (!list_empty(&rqset->set_requests)) {
+ LASSERT(rc == 0);
+ LASSERT (rqset->set_interpret == NULL);
+ rqset->set_interpret = lov_getattr_interpret;
+ rqset->set_arg = (void *)lovset;
+ RETURN(rc);
+ }
out:
- LASSERT(rc);
- lov_fini_getattr_set(lovset);
- RETURN(rc);
+ if (rc)
+ lovset->set_completes = 0;
+ err = lov_fini_getattr_set(lovset);
+ RETURN(rc ? rc : err);
}
-static int lov_setattr(struct obd_export *exp, struct obdo *src_oa,
- struct lov_stripe_md *lsm, struct obd_trans_info *oti)
+static int lov_setattr(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti)
{
struct lov_request_set *set;
struct lov_obd *lov;
int err = 0, rc = 0;
ENTRY;
- ASSERT_LSM_MAGIC(lsm);
+ LASSERT(oinfo);
+ ASSERT_LSM_MAGIC(oinfo->oi_md);
if (!exp || !exp->exp_obd)
RETURN(-ENODEV);
/* for now, we only expect the following updates here */
- LASSERT(!(src_oa->o_valid & ~(OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE |
- OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME |
- OBD_MD_FLFLAGS | OBD_MD_FLSIZE | OBD_MD_FLGROUP |
- OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLINLINE |
- OBD_MD_FLFID | OBD_MD_FLGENER)));
+ LASSERT(!(oinfo->oi_oa->o_valid & ~(OBD_MD_FLID | OBD_MD_FLTYPE |
+ OBD_MD_FLMODE | OBD_MD_FLATIME |
+ OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+ OBD_MD_FLFLAGS | OBD_MD_FLSIZE |
+ OBD_MD_FLGROUP | OBD_MD_FLUID |
+ OBD_MD_FLGID | OBD_MD_FLINLINE |
+ OBD_MD_FLFID | OBD_MD_FLGENER)));
lov = &exp->exp_obd->u.lov;
- rc = lov_prep_setattr_set(exp, src_oa, lsm, oti, &set);
+ rc = lov_prep_setattr_set(exp, oinfo, oti, &set);
if (rc)
RETURN(rc);
list_for_each (pos, &set->set_list) {
req = list_entry(pos, struct lov_request, rq_link);
- rc = obd_setattr(lov->tgts[req->rq_idx].ltd_exp, req->rq_oa,
- NULL, NULL);
+ rc = obd_setattr(lov->tgts[req->rq_idx].ltd_exp,
+ &req->rq_oi, NULL);
err = lov_update_setattr_set(set, req, rc);
if (err) {
CERROR("error: setattr objid "LPX64" subobj "
LPX64" on OST idx %d: rc = %d\n",
- set->set_oa->o_id, req->rq_oa->o_id,
- req->rq_idx, err);
+ set->set_oi->oi_oa->o_id,
+ req->rq_oi.oi_oa->o_id, req->rq_idx, err);
if (!rc)
rc = err;
}
RETURN(rc);
}
-static int lov_setattr_async(struct obd_export *exp, struct obdo *src_oa,
- struct lov_stripe_md *lsm, struct obd_trans_info *oti)
+static int lov_setattr_interpret(struct ptlrpc_request_set *rqset,
+ void *data, int rc)
+{
+ struct lov_request_set *lovset = (struct lov_request_set *)data;
+ int err;
+ ENTRY;
+
+ if (rc)
+ lovset->set_completes = 0;
+ err = lov_fini_setattr_set(lovset);
+ RETURN(rc ? rc : err);
+}
+
+/* If @oti is given, the request goes from MDS and responses from OSTs are not
+ needed. Otherwise, a client is waiting for responses. */
+static int lov_setattr_async(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
+ struct ptlrpc_request_set *rqset)
{
+ struct lov_request_set *set;
+ struct lov_request *req;
+ struct list_head *pos;
struct lov_obd *lov;
- struct lov_oinfo *loi = NULL;
- int rc = 0, err;
- obd_id objid = src_oa->o_id;
- int i;
+ int rc = 0;
ENTRY;
- ASSERT_LSM_MAGIC(lsm);
- LASSERT(oti);
- if (src_oa->o_valid & OBD_MD_FLCOOKIE)
+ LASSERT(oinfo);
+ ASSERT_LSM_MAGIC(oinfo->oi_md);
+ if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) {
+ LASSERT(oti);
LASSERT(oti->oti_logcookies);
+ }
if (!exp || !exp->exp_obd)
RETURN(-ENODEV);
- LASSERT(!(src_oa->o_valid & ~(OBD_MD_FLID | OBD_MD_FLUID |
- OBD_MD_FLGID| OBD_MD_FLCOOKIE |
- OBD_MD_FLFID | OBD_MD_FLGENER)));
lov = &exp->exp_obd->u.lov;
+ rc = lov_prep_setattr_set(exp, oinfo, oti, &set);
+ if (rc)
+ RETURN(rc);
- loi = lsm->lsm_oinfo;
- for (i = 0; i < lsm->lsm_stripe_count; i++, loi++) {
- if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) {
- CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
- goto next;
- }
+ CDEBUG(D_INFO, "objid "LPX64": %ux%u byte stripes\n",
+ oinfo->oi_md->lsm_object_id, oinfo->oi_md->lsm_stripe_count,
+ oinfo->oi_md->lsm_stripe_size);
+
+ list_for_each (pos, &set->set_list) {
+ req = list_entry(pos, struct lov_request, rq_link);
- src_oa->o_id = loi->loi_id;
- src_oa->o_stripe_idx = i;
+ if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE)
+ oti->oti_logcookies = set->set_cookies + req->rq_stripe;
- /* do chown/chgrp on OST asynchronously */
- err = obd_setattr_async(lov->tgts[loi->loi_ost_idx].ltd_exp,
- src_oa, NULL, oti);
- if (err) {
+ CDEBUG(D_INFO, "objid "LPX64"[%d] has subobj "LPX64" at idx "
+ "%u\n", oinfo->oi_oa->o_id, req->rq_stripe,
+ req->rq_oi.oi_oa->o_id, req->rq_idx);
+
+ rc = obd_setattr_async(lov->tgts[req->rq_idx].ltd_exp,
+ &req->rq_oi, oti, rqset);
+ if (rc) {
CERROR("error: setattr objid "LPX64" subobj "
LPX64" on OST idx %d: rc = %d\n",
- objid, src_oa->o_id, i, err);
- if (!rc)
- rc = err;
+ set->set_oi->oi_oa->o_id,
+ req->rq_oi.oi_oa->o_id,
+ req->rq_idx, rc);
+ break;
}
- next:
- if (src_oa->o_valid & OBD_MD_FLCOOKIE)
- oti->oti_logcookies++;
}
- RETURN(rc);
+ /* If we are not waiting for responses on async requests, return. */
+ if (rc || !rqset || list_empty(&rqset->set_requests)) {
+ int err;
+ if (rc)
+ set->set_completes = 0;
+ err = lov_fini_setattr_set(set);
+ RETURN(rc ? rc : err);
+ }
+
+ LASSERT(rqset->set_interpret == NULL);
+ rqset->set_interpret = lov_setattr_interpret;
+ rqset->set_arg = (void *)set;
+
+ RETURN(0);
+}
+
+static int lov_punch_interpret(struct ptlrpc_request_set *rqset,
+ void *data, int rc)
+{
+ struct lov_request_set *lovset = (struct lov_request_set *)data;
+ int err;
+ ENTRY;
+
+ if (rc)
+ lovset->set_completes = 0;
+ err = lov_fini_punch_set(lovset);
+ RETURN(rc ? rc : err);
}
/* FIXME: maybe we'll just make one node the authoritative attribute node, then
* we can send this 'punch' to just the authoritative node and the nodes
* that the punch will affect. */
-static int lov_punch(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *lsm,
- obd_off start, obd_off end, struct obd_trans_info *oti)
+static int lov_punch(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
+ struct ptlrpc_request_set *rqset)
{
struct lov_request_set *set;
struct lov_obd *lov;
struct list_head *pos;
struct lov_request *req;
- int err = 0, rc = 0;
+ int rc = 0;
ENTRY;
- ASSERT_LSM_MAGIC(lsm);
+ LASSERT(oinfo);
+ ASSERT_LSM_MAGIC(oinfo->oi_md);
if (!exp || !exp->exp_obd)
RETURN(-ENODEV);
lov = &exp->exp_obd->u.lov;
- rc = lov_prep_punch_set(exp, oa, lsm, start, end, oti, &set);
+ rc = lov_prep_punch_set(exp, oinfo, oti, &set);
if (rc)
RETURN(rc);
list_for_each (pos, &set->set_list) {
req = list_entry(pos, struct lov_request, rq_link);
- rc = obd_punch(lov->tgts[req->rq_idx].ltd_exp, req->rq_oa,
- NULL, req->rq_extent.start,
- req->rq_extent.end, NULL);
- err = lov_update_punch_set(set, req, rc);
- if (err) {
+ rc = obd_punch(lov->tgts[req->rq_idx].ltd_exp,
+ &req->rq_oi, NULL, rqset);
+ if (rc) {
CERROR("error: punch objid "LPX64" subobj "LPX64
- " on OST idx %d: rc = %d\n", set->set_oa->o_id,
- req->rq_oa->o_id, req->rq_idx, rc);
- if (!rc)
- rc = err;
+ " on OST idx %d: rc = %d\n",
+ set->set_oi->oi_oa->o_id,
+ req->rq_oi.oi_oa->o_id, req->rq_idx, rc);
+ break;
}
}
- err = lov_fini_punch_set(set);
- if (!rc)
- rc = err;
- RETURN(rc);
+
+ if (rc || list_empty(&rqset->set_requests)) {
+ int err;
+ err = lov_fini_punch_set(set);
+ RETURN(rc ? rc : err);
+ }
+
+ LASSERT(rqset->set_interpret == NULL);
+ rqset->set_interpret = lov_punch_interpret;
+ rqset->set_arg = (void *)set;
+
+ RETURN(0);
}
static int lov_sync(struct obd_export *exp, struct obdo *oa,
struct lov_stripe_md *lsm, obd_off start, obd_off end)
{
struct lov_request_set *set;
+ struct obd_info oinfo;
struct lov_obd *lov;
struct list_head *pos;
struct lov_request *req;
RETURN(-ENODEV);
lov = &exp->exp_obd->u.lov;
- rc = lov_prep_sync_set(exp, oa, lsm, start, end, &set);
+ rc = lov_prep_sync_set(exp, &oinfo, oa, lsm, start, end, &set);
if (rc)
RETURN(rc);
list_for_each (pos, &set->set_list) {
req = list_entry(pos, struct lov_request, rq_link);
- rc = obd_sync(lov->tgts[req->rq_idx].ltd_exp, req->rq_oa,
- NULL, req->rq_extent.start, req->rq_extent.end);
+ rc = obd_sync(lov->tgts[req->rq_idx].ltd_exp, req->rq_oi.oi_oa,
+ NULL, req->rq_oi.oi_policy.l_extent.start,
+ req->rq_oi.oi_policy.l_extent.end);
err = lov_update_common_set(set, req, rc);
if (err) {
CERROR("error: fsync objid "LPX64" subobj "LPX64
- " on OST idx %d: rc = %d\n", set->set_oa->o_id,
- req->rq_oa->o_id, req->rq_idx, rc);
+ " on OST idx %d: rc = %d\n",
+ set->set_oi->oi_oa->o_id,
+ req->rq_oi.oi_oa->o_id, req->rq_idx, rc);
if (!rc)
rc = err;
}
RETURN(rc);
}
-static int lov_brw_check(struct lov_obd *lov, struct obdo *oa,
- struct lov_stripe_md *lsm,
+static int lov_brw_check(struct lov_obd *lov, struct obd_info *lov_oinfo,
obd_count oa_bufs, struct brw_page *pga)
{
+ struct obd_info oinfo = { { { 0 } } };
int i, rc = 0;
+ oinfo.oi_oa = lov_oinfo->oi_oa;
+
/* The caller just wants to know if there's a chance that this
* I/O can succeed */
for (i = 0; i < oa_bufs; i++) {
- int stripe = lov_stripe_number(lsm, pga[i].off);
- int ost = lsm->lsm_oinfo[stripe].loi_ost_idx;
+ int stripe = lov_stripe_number(lov_oinfo->oi_md, pga[i].off);
+ int ost = lov_oinfo->oi_md->lsm_oinfo[stripe].loi_ost_idx;
obd_off start, end;
- if (!lov_stripe_intersects(lsm, i, pga[i].off,
+ if (!lov_stripe_intersects(lov_oinfo->oi_md, i, pga[i].off,
pga[i].off + pga[i].count,
&start, &end))
continue;
CDEBUG(D_HA, "lov idx %d inactive\n", ost);
return -EIO;
}
- rc = obd_brw(OBD_BRW_CHECK, lov->tgts[ost].ltd_exp, oa,
- NULL, 1, &pga[i], NULL);
+
+ rc = obd_brw(OBD_BRW_CHECK, lov->tgts[ost].ltd_exp, &oinfo,
+ 1, &pga[i], NULL);
if (rc)
break;
}
return rc;
}
-static int lov_brw(int cmd, struct obd_export *exp, struct obdo *src_oa,
- struct lov_stripe_md *lsm, obd_count oa_bufs,
- struct brw_page *pga, struct obd_trans_info *oti)
+static int lov_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo,
+ obd_count oa_bufs, struct brw_page *pga,
+ struct obd_trans_info *oti)
{
struct lov_request_set *set;
struct lov_request *req;
int err, rc = 0;
ENTRY;
- ASSERT_LSM_MAGIC(lsm);
+ ASSERT_LSM_MAGIC(oinfo->oi_md);
if (cmd == OBD_BRW_CHECK) {
- rc = lov_brw_check(lov, src_oa, lsm, oa_bufs, pga);
+ rc = lov_brw_check(lov, oinfo, oa_bufs, pga);
RETURN(rc);
}
- rc = lov_prep_brw_set(exp, src_oa, lsm, oa_bufs, pga, oti, &set);
+ rc = lov_prep_brw_set(exp, oinfo, oa_bufs, pga, oti, &set);
if (rc)
RETURN(rc);
sub_exp = lov->tgts[req->rq_idx].ltd_exp;
sub_pga = set->set_pga + req->rq_pgaidx;
- rc = obd_brw(cmd, sub_exp, req->rq_oa, req->rq_md,
- req->rq_oabufs, sub_pga, oti);
+ rc = obd_brw(cmd, sub_exp, &req->rq_oi, req->rq_oabufs,
+ sub_pga, oti);
if (rc)
break;
lov_update_common_set(set, req, rc);
RETURN(rc);
}
-static int lov_brw_async(int cmd, struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *lsm, obd_count oa_bufs,
- struct brw_page *pga, struct ptlrpc_request_set *set,
- struct obd_trans_info *oti)
+static int lov_brw_async(int cmd, struct obd_export *exp,
+ struct obd_info *oinfo, obd_count oa_bufs,
+ struct brw_page *pga, struct obd_trans_info *oti,
+ struct ptlrpc_request_set *set)
{
struct lov_request_set *lovset;
struct lov_request *req;
int rc = 0;
ENTRY;
- ASSERT_LSM_MAGIC(lsm);
+ LASSERT(oinfo);
+ ASSERT_LSM_MAGIC(oinfo->oi_md);
if (cmd == OBD_BRW_CHECK) {
- rc = lov_brw_check(lov, oa, lsm, oa_bufs, pga);
+ rc = lov_brw_check(lov, oinfo, oa_bufs, pga);
RETURN(rc);
}
- rc = lov_prep_brw_set(exp, oa, lsm, oa_bufs, pga, oti, &lovset);
+ rc = lov_prep_brw_set(exp, oinfo, oa_bufs, pga, oti, &lovset);
if (rc)
RETURN(rc);
sub_exp = lov->tgts[req->rq_idx].ltd_exp;
sub_pga = lovset->set_pga + req->rq_pgaidx;
- rc = obd_brw_async(cmd, sub_exp, req->rq_oa, req->rq_md,
- req->rq_oabufs, sub_pga, set, oti);
+ rc = obd_brw_async(cmd, sub_exp, &req->rq_oi, req->rq_oabufs,
+ sub_pga, oti, set);
if (rc)
GOTO(out, rc);
lov_update_common_set(lovset, req, rc);
RETURN(rc);
}
-static int lov_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm,
- __u32 type, ldlm_policy_data_t *policy, __u32 mode,
- int *flags, void *bl_cb, void *cp_cb, void *gl_cb,
- void *data,__u32 lvb_len, void *lvb_swabber,
- struct lustre_handle *lockh)
+static int lov_enqueue_interpret(struct ptlrpc_request_set *rqset,
+ void *data, int rc)
+{
+ struct lov_request_set *lovset = (struct lov_request_set *)data;
+ int err;
+ ENTRY;
+
+ if (rc)
+ lovset->set_completes = 0;
+ err = lov_fini_enqueue_set(lovset, lovset->set_ei->ei_mode);
+ RETURN(rc ? rc : err);
+}
+
+static int lov_enqueue(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_enqueue_info *einfo)
{
struct lov_request_set *set;
struct lov_request *req;
struct list_head *pos;
- struct lustre_handle *lov_lockhp;
struct lov_obd *lov;
ldlm_error_t rc;
- int save_flags = *flags;
ENTRY;
- ASSERT_LSM_MAGIC(lsm);
+ LASSERT(oinfo);
+ ASSERT_LSM_MAGIC(oinfo->oi_md);
/* we should never be asked to replay a lock this way. */
- LASSERT((*flags & LDLM_FL_REPLAY) == 0);
+ LASSERT((einfo->ei_flags & LDLM_FL_REPLAY) == 0);
if (!exp || !exp->exp_obd)
RETURN(-ENODEV);
lov = &exp->exp_obd->u.lov;
- rc = lov_prep_enqueue_set(exp, lsm, policy, mode, lockh, &set);
+ rc = lov_prep_enqueue_set(exp, oinfo, einfo, &set);
if (rc)
RETURN(rc);
list_for_each (pos, &set->set_list) {
- ldlm_policy_data_t sub_policy;
req = list_entry(pos, struct lov_request, rq_link);
- lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe;
- LASSERT(lov_lockhp);
-
- *flags = save_flags;
- sub_policy.l_extent = req->rq_extent;
- rc = obd_enqueue(lov->tgts[req->rq_idx].ltd_exp, req->rq_md,
- type, &sub_policy, mode, flags, bl_cb,
- cp_cb, gl_cb, data, lvb_len, lvb_swabber,
- lov_lockhp);
- rc = lov_update_enqueue_set(set, req, rc, save_flags);
+ rc = obd_enqueue(lov->tgts[req->rq_idx].ltd_exp,
+ &req->rq_oi, einfo);
if (rc != ELDLM_OK)
- break;
+ GOTO(out, rc);
}
- lov_fini_enqueue_set(set, mode);
+ if (einfo->ei_rqset && !list_empty(&einfo->ei_rqset->set_requests)) {
+ LASSERT(rc == 0);
+ LASSERT(einfo->ei_rqset->set_interpret == NULL);
+ einfo->ei_rqset->set_interpret = lov_enqueue_interpret;
+ einfo->ei_rqset->set_arg = (void *)set;
+ RETURN(rc);
+ }
+out:
+ if (rc)
+ set->set_completes = 0;
+ lov_fini_enqueue_set(set, einfo->ei_mode);
RETURN(rc);
}
int *flags, void *data, struct lustre_handle *lockh)
{
struct lov_request_set *set;
+ struct obd_info oinfo;
struct lov_request *req;
struct list_head *pos;
struct lov_obd *lov = &exp->exp_obd->u.lov;
RETURN(-ENODEV);
lov = &exp->exp_obd->u.lov;
- rc = lov_prep_match_set(exp, lsm, policy, mode, lockh, &set);
+ rc = lov_prep_match_set(exp, &oinfo, lsm, policy, mode, lockh, &set);
if (rc)
RETURN(rc);
LASSERT(lov_lockhp);
lov_flags = *flags;
- sub_policy.l_extent = req->rq_extent;
+ sub_policy.l_extent = req->rq_oi.oi_policy.l_extent;
- rc = obd_match(lov->tgts[req->rq_idx].ltd_exp, req->rq_md,
- type, &sub_policy, mode, &lov_flags, data,
- lov_lockhp);
+ rc = obd_match(lov->tgts[req->rq_idx].ltd_exp,
+ req->rq_oi.oi_md, type, &sub_policy,
+ mode, &lov_flags, data, lov_lockhp);
rc = lov_update_match_set(set, req, rc);
if (rc != 1)
break;
__u32 mode, struct lustre_handle *lockh)
{
struct lov_request_set *set;
+ struct obd_info oinfo;
struct lov_request *req;
struct list_head *pos;
struct lov_obd *lov = &exp->exp_obd->u.lov;
LASSERT(lockh);
lov = &exp->exp_obd->u.lov;
- rc = lov_prep_cancel_set(exp, lsm, mode, lockh, &set);
+ rc = lov_prep_cancel_set(exp, &oinfo, lsm, mode, lockh, &set);
if (rc)
RETURN(rc);
req = list_entry(pos, struct lov_request, rq_link);
lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe;
- rc = obd_cancel(lov->tgts[req->rq_idx].ltd_exp, req->rq_md,
- mode, lov_lockhp);
+ rc = obd_cancel(lov->tgts[req->rq_idx].ltd_exp,
+ req->rq_oi.oi_md, mode, lov_lockhp);
rc = lov_update_common_set(set, req, rc);
if (rc) {
CERROR("error: cancel objid "LPX64" subobj "
LPX64" on OST idx %d: rc = %d\n",
lsm->lsm_object_id,
- req->rq_md->lsm_object_id, req->rq_idx, rc);
+ req->rq_oi.oi_md->lsm_object_id,
+ req->rq_idx, rc);
err = rc;
}
RETURN(count);
}
-#define LOV_U64_MAX ((__u64)~0ULL)
-#define LOV_SUM_MAX(tot, add) \
- do { \
- if ((tot) + (add) < (tot)) \
- (tot) = LOV_U64_MAX; \
- else \
- (tot) += (add); \
- } while(0)
+static int lov_statfs_interpret(struct ptlrpc_request_set *rqset,
+ void *data, int rc)
+{
+ struct lov_request_set *lovset = (struct lov_request_set *)data;
+ int err;
+ ENTRY;
+
+ if (rc)
+ lovset->set_completes = 0;
+
+ err = lov_fini_statfs_set(lovset);
+ RETURN(rc ? rc : err);
+}
+
+static int lov_statfs_async(struct obd_device *obd, struct obd_info *oinfo,
+ unsigned long max_age,
+ struct ptlrpc_request_set *rqset)
+{
+ struct lov_request_set *set;
+ struct lov_request *req;
+ struct list_head *pos;
+ struct lov_obd *lov;
+ int rc = 0;
+
+ LASSERT(oinfo != NULL);
+ LASSERT(oinfo->oi_osfs != NULL);
+
+ lov = &obd->u.lov;
+ rc = lov_prep_statfs_set(obd, oinfo, &set);
+ if (rc)
+ RETURN(rc);
+
+ list_for_each (pos, &set->set_list) {
+ struct obd_device *osc_obd;
+
+ req = list_entry(pos, struct lov_request, rq_link);
+
+ osc_obd = class_exp2obd(lov->tgts[req->rq_idx].ltd_exp);
+ rc = obd_statfs_async(osc_obd, &req->rq_oi, max_age, rqset);
+ if (rc)
+ break;
+ }
+
+ if (rc || list_empty(&rqset->set_requests)) {
+ int err;
+ if (rc)
+ set->set_completes = 0;
+ err = lov_fini_statfs_set(set);
+ RETURN(rc ? rc : err);
+ }
+
+ LASSERT(rqset->set_interpret == NULL);
+ rqset->set_interpret = lov_statfs_interpret;
+ rqset->set_arg = (void *)set;
+ RETURN(0);
+}
static int lov_statfs(struct obd_device *obd, struct obd_statfs *osfs,
cfs_time_t max_age)
struct lov_obd *lov = &obd->u.lov;
struct obd_statfs lov_sfs;
int set = 0;
- int rc = 0;
+ int rc = 0, err;
int i;
ENTRY;
-
/* We only get block data from the OBD */
for (i = 0; i < lov->desc.ld_tgt_count; i++) {
- int err;
if (!lov->tgts[i].ltd_active) {
CDEBUG(D_HA, "lov idx %d inactive\n", i);
continue;
}
qos_update(lov, i, &lov_sfs);
- if (!set) {
- memcpy(osfs, &lov_sfs, sizeof(lov_sfs));
- set = 1;
- } else {
-#ifdef MIN_DF
- /* Sandia requested that df (and so, statfs) only
- returned minimal available space on
- a single OST, so people would be able to
- write this much data guaranteed. */
- if (osfs->os_bavail > lov_sfs.os_bavail) {
- /* Presumably if new bavail is smaller,
- new bfree is bigger as well */
- osfs->os_bfree = lov_sfs.os_bfree;
- osfs->os_bavail = lov_sfs.os_bavail;
- }
-#else
- osfs->os_bfree += lov_sfs.os_bfree;
- osfs->os_bavail += lov_sfs.os_bavail;
-#endif
- osfs->os_blocks += lov_sfs.os_blocks;
- /* XXX not sure about this one - depends on policy.
- * - could be minimum if we always stripe on all OBDs
- * (but that would be wrong for any other policy,
- * if one of the OBDs has no more objects left)
- * - could be sum if we stripe whole objects
- * - could be average, just to give a nice number
- *
- * To give a "reasonable" (if not wholly accurate)
- * number, we divide the total number of free objects
- * by expected stripe count (watch out for overflow).
- */
- LOV_SUM_MAX(osfs->os_files, lov_sfs.os_files);
- LOV_SUM_MAX(osfs->os_ffree, lov_sfs.os_ffree);
- }
+ lov_update_statfs(class_exp2obd(lov->tgts[i].ltd_exp),
+ osfs, &lov_sfs, set);
+ set++;
}
- if (set) {
- __u32 expected_stripes = lov_get_stripecnt(lov, 0);
-
- if (osfs->os_files != LOV_U64_MAX)
- do_div(osfs->os_files, expected_stripes);
- if (osfs->os_ffree != LOV_U64_MAX)
- do_div(osfs->os_ffree, expected_stripes);
- } else if (!rc)
- rc = -EIO;
-
- RETURN(rc);
+ err = lov_fini_statfs(obd, osfs, set);
+ RETURN(rc ? rc : err);
}
static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
switch (cmd) {
case OBD_IOC_LOV_GET_CONFIG: {
- struct obd_ioctl_data *data = karg;
+ struct obd_ioctl_data *data;
struct lov_tgt_desc *tgtdesc;
struct lov_desc *desc;
char *buf = NULL;
__u32 *genp;
- buf = NULL;
len = 0;
if (obd_ioctl_getdata(&buf, &len, (void *)uarg))
RETURN(-EINVAL);
LASSERT(md_exp);
LASSERT(lsm_op_find(lsm->lsm_magic) != NULL);
rc = lsm_op_find(lsm->lsm_magic)->lsm_revalidate(lsm, md_exp->exp_obd);
-
+
RETURN(rc);
}
.o_connect = lov_connect,
.o_disconnect = lov_disconnect,
.o_statfs = lov_statfs,
+ .o_statfs_async = lov_statfs_async,
.o_packmd = lov_packmd,
.o_unpackmd = lov_unpackmd,
.o_checkmd = lov_checkmd,
rc = class_register_type(&lov_obd_ops, lvars.module_vars,
LUSTRE_LOV_NAME);
if (rc && quota_interface)
- PORTAL_SYMBOL_PUT(osc_quota_interface);
+ PORTAL_SYMBOL_PUT(lov_quota_interface);
RETURN(rc);
}
struct lov_ost_data_v1 *lod;
int i;
- CDEBUG(level, "objid "LPX64", magic 0x%08X, pattern %#X\n",
+ CDEBUG(level, "objid "LPX64", magic 0x%08x, pattern %#x\n",
le64_to_cpu(lmm->lmm_object_id), le32_to_cpu(lmm->lmm_magic),
le32_to_cpu(lmm->lmm_pattern));
CDEBUG(level,"stripe_size %u, stripe_count %u\n",
void qos_shrink_lsm(struct lov_request_set *set)
{
- struct lov_stripe_md *lsm = set->set_md, *lsm_new;
+ struct lov_stripe_md *lsm = set->set_oi->oi_md, *lsm_new;
/* XXX LOV STACKING call into osc for sizes */
unsigned oldsize, newsize;
memcpy(lsm_new, lsm, newsize);
lsm_new->lsm_stripe_count = set->set_count;
OBD_FREE(lsm, oldsize);
- set->set_md = lsm_new;
+ set->set_oi->oi_md = lsm_new;
} else {
CWARN("'leaking' %d bytes\n", oldsize - newsize);
}
int qos_remedy_create(struct lov_request_set *set, struct lov_request *req)
{
- struct lov_stripe_md *lsm = set->set_md;
+ struct lov_stripe_md *lsm = set->set_oi->oi_md;
struct lov_obd *lov = &set->set_exp->exp_obd->u.lov;
unsigned ost_idx, ost_count = lov->desc.ld_tgt_count;
int stripe, i, rc = -EIO;
if (stripe >= lsm->lsm_stripe_count) {
req->rq_idx = ost_idx;
- rc = obd_create(lov->tgts[ost_idx].ltd_exp, req->rq_oa,
- &req->rq_md, set->set_oti);
+ rc = obd_create(lov->tgts[ost_idx].ltd_exp,
+ req->rq_oi.oi_oa, &req->rq_oi.oi_md,
+ set->set_oti);
if (!rc)
break;
}
{
struct lov_obd *lov = &exp->exp_obd->u.lov;
struct lov_stripe_md *lsm;
- struct obdo *src_oa = set->set_oa;
+ struct obdo *src_oa = set->set_oi->oi_oa;
struct obd_trans_info *oti = set->set_oti;
int i, stripes, rc = 0, newea = 0;
int *idx_arr, idx_cnt = 0;
LASSERT(src_oa->o_valid & OBD_MD_FLID);
- if (set->set_md == NULL) {
+ if (set->set_oi->oi_md == NULL) {
int stripe_cnt = lov_get_stripecnt(lov, 0);
/* If the MDS file was truncated up to some size, stripe over
stripes = stripe_cnt;
}
- rc = lov_alloc_memmd(&set->set_md, stripes,
+ rc = lov_alloc_memmd(&set->set_oi->oi_md, stripes,
lov->desc.ld_pattern ?
lov->desc.ld_pattern : LOV_PATTERN_RAID0,
LOV_MAGIC);
rc = 0;
newea = 1;
}
- lsm = set->set_md;
-
+
+ lsm = set->set_oi->oi_md;
lsm->lsm_object_id = src_oa->o_id;
if (!lsm->lsm_stripe_size)
lsm->lsm_stripe_size = lov->desc.ld_default_stripe_size;
GOTO(out_err, rc = -ENOMEM);
lov_set_add_req(req, set);
- req->rq_buflen = sizeof(*req->rq_md);
- OBD_ALLOC(req->rq_md, req->rq_buflen);
- if (req->rq_md == NULL)
+ req->rq_buflen = sizeof(*req->rq_oi.oi_md);
+ OBD_ALLOC(req->rq_oi.oi_md, req->rq_buflen);
+ if (req->rq_oi.oi_md == NULL)
GOTO(out_err, rc = -ENOMEM);
-
- req->rq_oa = obdo_alloc();
- if (req->rq_oa == NULL)
+
+ req->rq_oi.oi_oa = obdo_alloc();
+ if (req->rq_oi.oi_oa == NULL)
GOTO(out_err, rc = -ENOMEM);
-
+
req->rq_idx = ost_idx;
req->rq_stripe = i;
/* create data objects with "parent" OA */
- memcpy(req->rq_oa, src_oa, sizeof(*req->rq_oa));
+ memcpy(req->rq_oi.oi_oa, src_oa, sizeof(*req->rq_oi.oi_oa));
/* XXX When we start creating objects on demand, we need to
* make sure that we always create the object on the
* stripe which holds the existing file size.
*/
if (src_oa->o_valid & OBD_MD_FLSIZE) {
- req->rq_oa->o_size =
+ req->rq_oi.oi_oa->o_size =
lov_size_to_stripe(lsm, src_oa->o_size, i);
CDEBUG(D_INODE, "stripe %d has size "LPU64"/"LPU64"\n",
- i, req->rq_oa->o_size, src_oa->o_size);
+ i, req->rq_oi.oi_oa->o_size, src_oa->o_size);
}
}
}
out_err:
if (newea && rc)
- obd_free_memmd(exp, &set->set_md);
+ obd_free_memmd(exp, &set->set_oi->oi_md);
free_idx_array(idx_arr, idx_cnt);
EXIT;
return rc;
rq_link);
list_del_init(&req->rq_link);
- if (req->rq_oa)
- obdo_free(req->rq_oa);
- if (req->rq_md)
- OBD_FREE(req->rq_md, req->rq_buflen);
+ if (req->rq_oi.oi_oa)
+ obdo_free(req->rq_oi.oi_oa);
+ if (req->rq_oi.oi_md)
+ OBD_FREE(req->rq_oi.oi_md, req->rq_buflen);
+ if (req->rq_oi.oi_osfs)
+ OBD_FREE(req->rq_oi.oi_osfs,
+ sizeof(*req->rq_oi.oi_osfs));
OBD_FREE(req, sizeof(*req));
}
EXIT;
}
-static void lov_update_set(struct lov_request_set *set,
- struct lov_request *req, int rc)
+void lov_update_set(struct lov_request_set *set,
+ struct lov_request *req, int rc)
{
req->rq_complete = 1;
req->rq_rc = rc;
set->set_count++;
}
-int lov_update_enqueue_set(struct lov_request_set *set,
- struct lov_request *req, int rc, int flags)
+int lov_update_enqueue_set(struct lov_request *req, __u32 mode, int rc)
{
+ struct lov_request_set *set = req->rq_rqset;
struct lustre_handle *lov_lockhp;
struct lov_oinfo *loi;
ENTRY;
+ LASSERT(set != NULL);
+ LASSERT(set->set_oi != NULL);
+
lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe;
- loi = &set->set_md->lsm_oinfo[req->rq_stripe];
+ loi = &set->set_oi->oi_md->lsm_oinfo[req->rq_stripe];
- /* XXX FIXME: This unpleasantness doesn't belong here at *all*.
- * It belongs in the OSC, except that the OSC doesn't have
- * access to the real LOI -- it gets a copy, that we created
- * above, and that copy can be arbitrarily out of date.
+ /* XXX LOV STACKING: OSC gets a copy, created in lov_prep_enqueue_set
+ * and that copy can be arbitrarily out of date.
*
* The LOV API is due for a serious rewriting anyways, and this
* can be addressed then. */
+
if (rc == ELDLM_OK) {
struct ldlm_lock *lock = ldlm_handle2lock(lov_lockhp);
__u64 tmp;
LASSERT(lock != NULL);
- lov_stripe_lock(set->set_md);
- loi->loi_lvb = req->rq_md->lsm_oinfo->loi_lvb;
+ lov_stripe_lock(set->set_oi->oi_md);
+ loi->loi_lvb = req->rq_oi.oi_md->lsm_oinfo->loi_lvb;
tmp = loi->loi_lvb.lvb_size;
/* Extend KMS up to the end of this lock and no further
* A lock on [x,y] means a KMS of up to y + 1 bytes! */
loi->loi_lvb.lvb_size, loi->loi_kms,
lock->l_policy_data.l_extent.end);
}
- lov_stripe_unlock(set->set_md);
+ lov_stripe_unlock(set->set_oi->oi_md);
ldlm_lock_allow_match(lock);
LDLM_LOCK_PUT(lock);
- } else if (rc == ELDLM_LOCK_ABORTED && flags & LDLM_FL_HAS_INTENT) {
+ } else if ((rc == ELDLM_LOCK_ABORTED) &&
+ (set->set_ei->ei_flags & LDLM_FL_HAS_INTENT)) {
memset(lov_lockhp, 0, sizeof(*lov_lockhp));
- lov_stripe_lock(set->set_md);
- loi->loi_lvb = req->rq_md->lsm_oinfo->loi_lvb;
- lov_stripe_unlock(set->set_md);
+ lov_stripe_lock(set->set_oi->oi_md);
+ loi->loi_lvb = req->rq_oi.oi_md->lsm_oinfo->loi_lvb;
+ lov_stripe_unlock(set->set_oi->oi_md);
CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving"
" kms="LPU64"\n", loi->loi_lvb.lvb_size, loi->loi_kms);
rc = ELDLM_OK;
if (lov->tgts[req->rq_idx].ltd_active) {
CERROR("error: enqueue objid "LPX64" subobj "
LPX64" on OST idx %d: rc = %d\n",
- set->set_md->lsm_object_id, loi->loi_id,
- loi->loi_ost_idx, rc);
+ set->set_oi->oi_md->lsm_object_id,
+ loi->loi_id, loi->loi_ost_idx, rc);
} else {
rc = ELDLM_OK;
}
RETURN(rc);
}
+/* The callback for osc_enqueue that updates lov info for every OSC request. */
+static int cb_update_enqueue(struct obd_info *oinfo, int rc)
+{
+ struct obd_enqueue_info *einfo;
+ struct lov_request *lovreq;
+
+ lovreq = container_of(oinfo, struct lov_request, rq_oi);
+ einfo = lovreq->rq_rqset->set_ei;
+ return lov_update_enqueue_set(lovreq, einfo->ei_mode, rc);
+}
+
static int enqueue_done(struct lov_request_set *set, __u32 mode)
{
struct lov_request *req;
- struct lustre_handle *lov_lockhp = NULL;
struct lov_obd *lov = &set->set_exp->exp_obd->u.lov;
int rc = 0;
ENTRY;
/* cancel enqueued/matched locks */
list_for_each_entry(req, &set->set_list, rq_link) {
+ struct lustre_handle *lov_lockhp;
+
if (!req->rq_complete || req->rq_rc)
continue;
if (!lustre_handle_is_used(lov_lockhp))
continue;
- rc = obd_cancel(lov->tgts[req->rq_idx].ltd_exp, req->rq_md,
- mode, lov_lockhp);
+ rc = obd_cancel(lov->tgts[req->rq_idx].ltd_exp,
+ req->rq_oi.oi_md, mode, lov_lockhp);
if (rc && lov->tgts[req->rq_idx].ltd_active)
CERROR("cancelling obdjid "LPX64" on OST "
"idx %d error: rc = %d\n",
- req->rq_md->lsm_object_id, req->rq_idx, rc);
+ req->rq_oi.oi_md->lsm_object_id,
+ req->rq_idx, rc);
}
lov_llh_put(set->set_lockh);
RETURN(rc);
if (set == NULL)
RETURN(0);
LASSERT(set->set_exp);
- if (set->set_completes)
+ /* Do enqueue_done only for sync requests and if any request
+ succeeded. */
+ if (!set->set_ei->ei_rqset && set->set_completes)
rc = enqueue_done(set, mode);
else
lov_llh_put(set->set_lockh);
RETURN(rc);
}
-int lov_prep_enqueue_set(struct obd_export *exp, struct lov_stripe_md *lsm,
- ldlm_policy_data_t *policy, __u32 mode,
- struct lustre_handle *lockh,
+int lov_prep_enqueue_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_enqueue_info *einfo,
struct lov_request_set **reqset)
{
struct lov_obd *lov = &exp->exp_obd->u.lov;
lov_init_set(set);
set->set_exp = exp;
- set->set_md = lsm;
- set->set_lockh = lov_llh_new(lsm);
+ set->set_oi = oinfo;
+ set->set_ei = einfo;
+ set->set_lockh = lov_llh_new(oinfo->oi_md);
if (set->set_lockh == NULL)
GOTO(out_set, rc = -ENOMEM);
- lockh->cookie = set->set_lockh->llh_handle.h_cookie;
+ oinfo->oi_lockh->cookie = set->set_lockh->llh_handle.h_cookie;
- loi = lsm->lsm_oinfo;
- for (i = 0; i < lsm->lsm_stripe_count; i++, loi++) {
+ loi = oinfo->oi_md->lsm_oinfo;
+ for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++, loi++) {
struct lov_request *req;
obd_off start, end;
- if (!lov_stripe_intersects(lsm, i, policy->l_extent.start,
- policy->l_extent.end, &start, &end))
+ if (!lov_stripe_intersects(oinfo->oi_md, i,
+ oinfo->oi_policy.l_extent.start,
+ oinfo->oi_policy.l_extent.end,
+ &start, &end))
continue;
if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) {
if (req == NULL)
GOTO(out_set, rc = -ENOMEM);
- req->rq_buflen = sizeof(*req->rq_md) +
+ req->rq_buflen = sizeof(*req->rq_oi.oi_md) +
sizeof(struct lov_oinfo);
- OBD_ALLOC(req->rq_md, req->rq_buflen);
- if (req->rq_md == NULL) {
+ OBD_ALLOC(req->rq_oi.oi_md, req->rq_buflen);
+ if (req->rq_oi.oi_md == NULL) {
OBD_FREE(req, sizeof(*req));
GOTO(out_set, rc = -ENOMEM);
}
- req->rq_extent.start = start;
- req->rq_extent.end = end;
- req->rq_extent.gid = policy->l_extent.gid;
+ req->rq_rqset = set;
+ /* Set lov request specific parameters. */
+ req->rq_oi.oi_lockh = set->set_lockh->llh_handles + i;
+ req->rq_oi.oi_cb_up = cb_update_enqueue;
+
+ LASSERT(req->rq_oi.oi_lockh);
+
+ req->rq_oi.oi_policy.l_extent.gid =
+ oinfo->oi_policy.l_extent.gid;
+ req->rq_oi.oi_policy.l_extent.start = start;
+ req->rq_oi.oi_policy.l_extent.end = end;
req->rq_idx = loi->loi_ost_idx;
req->rq_stripe = i;
/* XXX LOV STACKING: submd should be from the subobj */
- req->rq_md->lsm_object_id = loi->loi_id;
- req->rq_md->lsm_stripe_count = 0;
- req->rq_md->lsm_oinfo->loi_kms_valid = loi->loi_kms_valid;
- req->rq_md->lsm_oinfo->loi_kms = loi->loi_kms;
- req->rq_md->lsm_oinfo->loi_lvb = loi->loi_lvb;
+ req->rq_oi.oi_md->lsm_object_id = loi->loi_id;
+ req->rq_oi.oi_md->lsm_stripe_count = 0;
+ req->rq_oi.oi_md->lsm_oinfo->loi_kms_valid =
+ loi->loi_kms_valid;
+ req->rq_oi.oi_md->lsm_oinfo->loi_kms = loi->loi_kms;
+ req->rq_oi.oi_md->lsm_oinfo->loi_lvb = loi->loi_lvb;
lov_set_add_req(req, set);
}
*reqset = set;
RETURN(0);
out_set:
- lov_fini_enqueue_set(set, mode);
+ lov_fini_enqueue_set(set, einfo->ei_mode);
RETURN(rc);
}
RETURN(rc);
}
-int lov_prep_match_set(struct obd_export *exp, struct lov_stripe_md *lsm,
- ldlm_policy_data_t *policy, __u32 mode,
- struct lustre_handle *lockh,
+int lov_prep_match_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct lov_stripe_md *lsm, ldlm_policy_data_t *policy,
+ __u32 mode, struct lustre_handle *lockh,
struct lov_request_set **reqset)
{
struct lov_obd *lov = &exp->exp_obd->u.lov;
lov_init_set(set);
set->set_exp = exp;
- set->set_md = lsm;
+ set->set_oi = oinfo;
+ set->set_oi->oi_md = lsm;
set->set_lockh = lov_llh_new(lsm);
if (set->set_lockh == NULL)
GOTO(out_set, rc = -ENOMEM);
if (req == NULL)
GOTO(out_set, rc = -ENOMEM);
- req->rq_buflen = sizeof(*req->rq_md);
- OBD_ALLOC(req->rq_md, req->rq_buflen);
- if (req->rq_md == NULL) {
+ req->rq_buflen = sizeof(*req->rq_oi.oi_md);
+ OBD_ALLOC(req->rq_oi.oi_md, req->rq_buflen);
+ if (req->rq_oi.oi_md == NULL) {
OBD_FREE(req, sizeof(*req));
GOTO(out_set, rc = -ENOMEM);
}
- req->rq_extent.start = start;
- req->rq_extent.end = end;
- req->rq_extent.gid = policy->l_extent.gid;
+ req->rq_oi.oi_policy.l_extent.start = start;
+ req->rq_oi.oi_policy.l_extent.end = end;
+ req->rq_oi.oi_policy.l_extent.gid = policy->l_extent.gid;
req->rq_idx = loi->loi_ost_idx;
req->rq_stripe = i;
/* XXX LOV STACKING: submd should be from the subobj */
- req->rq_md->lsm_object_id = loi->loi_id;
- req->rq_md->lsm_stripe_count = 0;
+ req->rq_oi.oi_md->lsm_object_id = loi->loi_id;
+ req->rq_oi.oi_md->lsm_stripe_count = 0;
lov_set_add_req(req, set);
}
RETURN(rc);
}
-int lov_prep_cancel_set(struct obd_export *exp, struct lov_stripe_md *lsm,
- __u32 mode, struct lustre_handle *lockh,
+int lov_prep_cancel_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct lov_stripe_md *lsm, __u32 mode,
+ struct lustre_handle *lockh,
struct lov_request_set **reqset)
{
struct lov_request_set *set;
lov_init_set(set);
set->set_exp = exp;
- set->set_md = lsm;
+ set->set_oi = oinfo;
+ set->set_oi->oi_md = lsm;
set->set_lockh = lov_handle2llh(lockh);
if (set->set_lockh == NULL) {
CERROR("LOV: invalid lov lock handle %p\n", lockh);
if (req == NULL)
GOTO(out_set, rc = -ENOMEM);
- req->rq_buflen = sizeof(*req->rq_md);
- OBD_ALLOC(req->rq_md, req->rq_buflen);
- if (req->rq_md == NULL) {
+ req->rq_buflen = sizeof(*req->rq_oi.oi_md);
+ OBD_ALLOC(req->rq_oi.oi_md, req->rq_buflen);
+ if (req->rq_oi.oi_md == NULL) {
OBD_FREE(req, sizeof(*req));
GOTO(out_set, rc = -ENOMEM);
}
req->rq_stripe = i;
/* XXX LOV STACKING: submd should be from the subobj */
- req->rq_md->lsm_object_id = loi->loi_id;
- req->rq_md->lsm_stripe_count = 0;
+ req->rq_oi.oi_md->lsm_object_id = loi->loi_id;
+ req->rq_oi.oi_md->lsm_stripe_count = 0;
lov_set_add_req(req, set);
}
{
struct lov_obd *lov = &exp->exp_obd->u.lov;
struct obd_trans_info *oti = set->set_oti;
- struct obdo *src_oa = set->set_oa;
+ struct obdo *src_oa = set->set_oi->oi_oa;
struct lov_request *req;
struct obdo *ret_oa = NULL;
int attrset = 0, rc = 0;
list_for_each_entry (req, &set->set_list, rq_link) {
if (req->rq_rc == 0)
continue;
-
+
set->set_completes--;
req->rq_complete = 0;
-
+
rc = qos_remedy_create(set, req);
lov_update_create_set(set, req, rc);
/* no successful creates */
if (set->set_success == 0)
GOTO(cleanup, rc);
-
+
/* If there was an explicit stripe set, fail. Otherwise, we
* got some objects and that's not bad. */
if (set->set_count != set->set_success) {
list_for_each_entry(req, &set->set_list, rq_link) {
if (!req->rq_complete || req->rq_rc)
continue;
- lov_merge_attrs(ret_oa, req->rq_oa, req->rq_oa->o_valid,
- set->set_md, req->rq_stripe, &attrset);
+ lov_merge_attrs(ret_oa, req->rq_oi.oi_oa,
+ req->rq_oi.oi_oa->o_valid, set->set_oi->oi_md,
+ req->rq_stripe, &attrset);
}
if (src_oa->o_valid & OBD_MD_FLSIZE &&
ret_oa->o_size != src_oa->o_size) {
memcpy(src_oa, ret_oa, sizeof(*src_oa));
obdo_free(ret_oa);
- *lsmp = set->set_md;
+ *lsmp = set->set_oi->oi_md;
GOTO(done, rc = 0);
cleanup:
continue;
sub_exp = lov->tgts[req->rq_idx].ltd_exp;
- err = obd_destroy(sub_exp, req->rq_oa, NULL, oti, NULL);
+ err = obd_destroy(sub_exp, req->rq_oi.oi_oa, NULL, oti, NULL);
if (err)
CERROR("Failed to uncreate objid "LPX64" subobj "
LPX64" on OST idx %d: rc = %d\n",
- set->set_oa->o_id, req->rq_oa->o_id,
+ src_oa->o_id, req->rq_oi.oi_oa->o_id,
req->rq_idx, rc);
}
if (*lsmp == NULL)
- obd_free_memmd(exp, &set->set_md);
+ obd_free_memmd(exp, &set->set_oi->oi_md);
done:
if (oti && set->set_cookies) {
oti->oti_logcookies = set->set_cookies;
struct lov_request *req, int rc)
{
struct obd_trans_info *oti = set->set_oti;
- struct lov_stripe_md *lsm = set->set_md;
+ struct lov_stripe_md *lsm = set->set_oi->oi_md;
struct lov_oinfo *loi;
struct lov_obd *lov = &set->set_exp->exp_obd->u.lov;
ENTRY;
if (rc && lov->tgts[req->rq_idx].ltd_active) {
CERROR("error creating fid "LPX64" sub-object"
" on OST idx %d/%d: rc = %d\n",
- set->set_oa->o_id, req->rq_idx,
+ set->set_oi->oi_oa->o_id, req->rq_idx,
lsm->lsm_stripe_count, rc);
if (rc > 0) {
CERROR("obd_create returned invalid err %d\n", rc);
RETURN(rc);
if (oti && oti->oti_objid)
- oti->oti_objid[req->rq_idx] = req->rq_oa->o_id;
+ oti->oti_objid[req->rq_idx] = req->rq_oi.oi_oa->o_id;
- loi->loi_id = req->rq_oa->o_id;
+ loi->loi_id = req->rq_oi.oi_oa->o_id;
loi->loi_ost_idx = req->rq_idx;
CDEBUG(D_INODE, "objid "LPX64" has subobj "LPX64"/"LPX64" at idx %d\n",
lsm->lsm_object_id, loi->loi_id, loi->loi_id, req->rq_idx);
if (oti && set->set_cookies)
++oti->oti_logcookies;
- if (req->rq_oa->o_valid & OBD_MD_FLCOOKIE)
+ if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLCOOKIE)
set->set_cookie_sent++;
RETURN(0);
}
-int lov_prep_create_set(struct obd_export *exp, struct lov_stripe_md **lsmp,
- struct obdo *src_oa, struct obd_trans_info *oti,
+int lov_prep_create_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct lov_stripe_md **lsmp, struct obdo *src_oa,
+ struct obd_trans_info *oti,
struct lov_request_set **reqset)
{
struct lov_request_set *set;
lov_init_set(set);
set->set_exp = exp;
- set->set_md = *lsmp;
- set->set_oa = src_oa;
+ set->set_oi = oinfo;
+ set->set_oi->oi_md = *lsmp;
+ set->set_oi->oi_oa = src_oa;
set->set_oti = oti;
rc = qos_prep_create(exp, set);
int rc = 0, attrset = 0;
ENTRY;
- if (set->set_oa == NULL)
+ LASSERT(set->set_oi != NULL);
+
+ if (set->set_oi->oi_oa == NULL)
RETURN(0);
if (!set->set_success)
if (!req->rq_complete || req->rq_rc)
continue;
- if (req->rq_oa->o_valid == 0) /* inactive stripe */
+ if (req->rq_oi.oi_oa->o_valid == 0) /* inactive stripe */
continue;
- lov_merge_attrs(tmp_oa, req->rq_oa, req->rq_oa->o_valid,
- set->set_md, req->rq_stripe, &attrset);
+ lov_merge_attrs(tmp_oa, req->rq_oi.oi_oa,
+ req->rq_oi.oi_oa->o_valid,
+ set->set_oi->oi_md, req->rq_stripe, &attrset);
}
if (!attrset) {
CERROR("No stripes had valid attrs\n");
rc = -EIO;
}
- tmp_oa->o_id = set->set_oa->o_id;
- memcpy(set->set_oa, tmp_oa, sizeof(*set->set_oa));
+ tmp_oa->o_id = set->set_oi->oi_oa->o_id;
+ memcpy(set->set_oi->oi_oa, tmp_oa, sizeof(*set->set_oi->oi_oa));
out:
if (tmp_oa)
obdo_free(tmp_oa);
static int brw_done(struct lov_request_set *set)
{
- struct lov_stripe_md *lsm = set->set_md;
+ struct lov_stripe_md *lsm = set->set_oi->oi_md;
struct lov_oinfo *loi = NULL;
struct list_head *pos;
struct lov_request *req;
loi = &lsm->lsm_oinfo[req->rq_stripe];
- if (req->rq_oa->o_valid & OBD_MD_FLBLOCKS)
- loi->loi_lvb.lvb_blocks = req->rq_oa->o_blocks;
+ if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLBLOCKS)
+ loi->loi_lvb.lvb_blocks = req->rq_oi.oi_oa->o_blocks;
}
RETURN(0);
RETURN(rc);
}
-int lov_prep_brw_set(struct obd_export *exp, struct obdo *src_oa,
- struct lov_stripe_md *lsm, obd_count oa_bufs,
- struct brw_page *pga, struct obd_trans_info *oti,
+int lov_prep_brw_set(struct obd_export *exp, struct obd_info *oinfo,
+ obd_count oa_bufs, struct brw_page *pga,
+ struct obd_trans_info *oti,
struct lov_request_set **reqset)
{
struct {
lov_init_set(set);
set->set_exp = exp;
- set->set_md = lsm;
- set->set_oa = src_oa;
set->set_oti = oti;
+ set->set_oi = oinfo;
set->set_oabufs = oa_bufs;
OBD_ALLOC(set->set_pga, oa_bufs * sizeof(*set->set_pga));
if (!set->set_pga)
GOTO(out, rc = -ENOMEM);
- OBD_ALLOC(info, sizeof(*info) * lsm->lsm_stripe_count);
+ OBD_ALLOC(info, sizeof(*info) * oinfo->oi_md->lsm_stripe_count);
if (!info)
GOTO(out, rc = -ENOMEM);
/* calculate the page count for each stripe */
for (i = 0; i < oa_bufs; i++) {
- int stripe = lov_stripe_number(lsm, pga[i].off);
+ int stripe = lov_stripe_number(oinfo->oi_md, pga[i].off);
info[stripe].count++;
}
/* alloc and initialize lov request */
shift = 0;
- for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++, loi++){
+ for (i = 0, loi = oinfo->oi_md->lsm_oinfo;
+ i < oinfo->oi_md->lsm_stripe_count; i++, loi++){
struct lov_request *req;
if (info[i].count == 0)
if (req == NULL)
GOTO(out, rc = -ENOMEM);
- req->rq_oa = obdo_alloc();
- if (req->rq_oa == NULL) {
+ req->rq_oi.oi_oa = obdo_alloc();
+ if (req->rq_oi.oi_oa == NULL) {
OBD_FREE(req, sizeof(*req));
GOTO(out, rc = -ENOMEM);
}
- if (src_oa)
- memcpy(req->rq_oa, src_oa, sizeof(*req->rq_oa));
- req->rq_oa->o_id = loi->loi_id;
- req->rq_oa->o_stripe_idx = i;
+ if (oinfo->oi_oa) {
+ memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+ sizeof(*req->rq_oi.oi_oa));
+ }
+ req->rq_oi.oi_oa->o_id = loi->loi_id;
+ req->rq_oi.oi_oa->o_stripe_idx = i;
- req->rq_buflen = sizeof(*req->rq_md);
- OBD_ALLOC(req->rq_md, req->rq_buflen);
- if (req->rq_md == NULL) {
- obdo_free(req->rq_oa);
+ req->rq_buflen = sizeof(*req->rq_oi.oi_md);
+ OBD_ALLOC(req->rq_oi.oi_md, req->rq_buflen);
+ if (req->rq_oi.oi_md == NULL) {
+ obdo_free(req->rq_oi.oi_oa);
OBD_FREE(req, sizeof(*req));
GOTO(out, rc = -ENOMEM);
}
req->rq_stripe = i;
/* XXX LOV STACKING */
- req->rq_md->lsm_object_id = loi->loi_id;
- req->rq_md->lsm_object_gr = lsm->lsm_object_gr;
+ req->rq_oi.oi_md->lsm_object_id = loi->loi_id;
+ req->rq_oi.oi_md->lsm_object_gr = oinfo->oi_md->lsm_object_gr;
req->rq_oabufs = info[i].count;
req->rq_pgaidx = shift;
shift += req->rq_oabufs;
/* rotate & sort the brw_page array */
for (i = 0; i < oa_bufs; i++) {
- int stripe = lov_stripe_number(lsm, pga[i].off);
+ int stripe = lov_stripe_number(oinfo->oi_md, pga[i].off);
shift = info[stripe].index + info[stripe].off;
LASSERT(shift < oa_bufs);
set->set_pga[shift] = pga[i];
- lov_stripe_offset(lsm, pga[i].off, stripe,
+ lov_stripe_offset(oinfo->oi_md, pga[i].off, stripe,
&set->set_pga[shift].off);
info[stripe].off++;
}
out:
if (info)
- OBD_FREE(info, sizeof(*info) * lsm->lsm_stripe_count);
+ OBD_FREE(info, sizeof(*info) * oinfo->oi_md->lsm_stripe_count);
if (rc == 0)
*reqset = set;
RETURN(rc);
}
-int lov_prep_getattr_set(struct obd_export *exp, struct obdo *src_oa,
- struct lov_stripe_md *lsm,
+/* The callback for osc_getattr_async that finilizes a request info when a
+ * response is recieved. */
+static int cb_getattr_update(struct obd_info *oinfo, int rc)
+{
+ struct lov_request *lovreq;
+ lovreq = container_of(oinfo, struct lov_request, rq_oi);
+ return lov_update_common_set(lovreq->rq_rqset, lovreq, rc);
+}
+
+int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo,
struct lov_request_set **reqset)
{
struct lov_request_set *set;
lov_init_set(set);
set->set_exp = exp;
- set->set_md = lsm;
- set->set_oa = src_oa;
+ set->set_oi = oinfo;
- loi = lsm->lsm_oinfo;
- for (i = 0; i < lsm->lsm_stripe_count; i++, loi++) {
+ loi = oinfo->oi_md->lsm_oinfo;
+ for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++, loi++) {
struct lov_request *req;
if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) {
req->rq_stripe = i;
req->rq_idx = loi->loi_ost_idx;
- req->rq_oa = obdo_alloc();
- if (req->rq_oa == NULL) {
+ req->rq_oi.oi_oa = obdo_alloc();
+ if (req->rq_oi.oi_oa == NULL) {
OBD_FREE(req, sizeof(*req));
GOTO(out_set, rc = -ENOMEM);
}
- memcpy(req->rq_oa, src_oa, sizeof(*req->rq_oa));
- req->rq_oa->o_id = loi->loi_id;
+ memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+ sizeof(*req->rq_oi.oi_oa));
+ req->rq_oi.oi_oa->o_id = loi->loi_id;
+ req->rq_oi.oi_cb_up = cb_getattr_update;
lov_set_add_req(req, set);
}
RETURN(0);
}
-int lov_prep_destroy_set(struct obd_export *exp, struct obdo *src_oa,
- struct lov_stripe_md *lsm,
+int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct obdo *src_oa, struct lov_stripe_md *lsm,
struct obd_trans_info *oti,
struct lov_request_set **reqset)
{
struct lov_request_set *set;
struct lov_oinfo *loi = NULL;
struct lov_obd *lov = &exp->exp_obd->u.lov;
- int rc = 0, cookie_set = 0, i;
+ int rc = 0, i;
ENTRY;
OBD_ALLOC(set, sizeof(*set));
lov_init_set(set);
set->set_exp = exp;
- set->set_md = lsm;
- set->set_oa = src_oa;
+ set->set_oi = oinfo;
+ set->set_oi->oi_md = lsm;
+ set->set_oi->oi_oa = src_oa;
set->set_oti = oti;
if (oti != NULL && src_oa->o_valid & OBD_MD_FLCOOKIE)
set->set_cookies = oti->oti_logcookies;
req->rq_stripe = i;
req->rq_idx = loi->loi_ost_idx;
- req->rq_oa = obdo_alloc();
- if (req->rq_oa == NULL) {
+ req->rq_oi.oi_oa = obdo_alloc();
+ if (req->rq_oi.oi_oa == NULL) {
OBD_FREE(req, sizeof(*req));
GOTO(out_set, rc = -ENOMEM);
}
- memcpy(req->rq_oa, src_oa, sizeof(*req->rq_oa));
- req->rq_oa->o_id = loi->loi_id;
-
- /* Setup the first request's cookie position */
- if (oti && !cookie_set && set->set_cookies) {
- oti->oti_logcookies = set->set_cookies + i;
- cookie_set = 1;
- }
+ memcpy(req->rq_oi.oi_oa, src_oa, sizeof(*req->rq_oi.oi_oa));
+ req->rq_oi.oi_oa->o_id = loi->loi_id;
lov_set_add_req(req, set);
}
if (!set->set_count)
RETURN(rc);
}
-int lov_prep_setattr_set(struct obd_export *exp, struct obdo *src_oa,
- struct lov_stripe_md *lsm, struct obd_trans_info *oti,
+int lov_update_setattr_set(struct lov_request_set *set,
+ struct lov_request *req, int rc)
+{
+ struct lov_obd *lov = &req->rq_rqset->set_exp->exp_obd->u.lov;
+ struct lov_stripe_md *lsm = req->rq_rqset->set_oi->oi_md;
+ ENTRY;
+
+ lov_update_set(set, req, rc);
+
+ /* grace error on inactive ost */
+ if (rc && !lov->tgts[req->rq_idx].ltd_active)
+ rc = 0;
+
+ if (rc == 0) {
+ if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLCTIME)
+ lsm->lsm_oinfo[req->rq_stripe].loi_lvb.lvb_ctime =
+ req->rq_oi.oi_oa->o_ctime;
+ if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLMTIME)
+ lsm->lsm_oinfo[req->rq_stripe].loi_lvb.lvb_mtime =
+ req->rq_oi.oi_oa->o_mtime;
+ if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLATIME)
+ lsm->lsm_oinfo[req->rq_stripe].loi_lvb.lvb_atime =
+ req->rq_oi.oi_oa->o_atime;
+ }
+
+ RETURN(rc);
+}
+
+/* The callback for osc_setattr_async that finilizes a request info when a
+ * response is recieved. */
+static int cb_setattr_update(struct obd_info *oinfo, int rc)
+{
+ struct lov_request *lovreq;
+ lovreq = container_of(oinfo, struct lov_request, rq_oi);
+ return lov_update_setattr_set(lovreq->rq_rqset, lovreq, rc);
+}
+
+int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
struct lov_request_set **reqset)
{
struct lov_request_set *set;
lov_init_set(set);
set->set_exp = exp;
- set->set_md = lsm;
- set->set_oa = src_oa;
+ set->set_oti = oti;
+ set->set_oi = oinfo;
+ if (oti != NULL && oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE)
+ set->set_cookies = oti->oti_logcookies;
- loi = lsm->lsm_oinfo;
- for (i = 0; i < lsm->lsm_stripe_count; i++, loi++) {
+ loi = oinfo->oi_md->lsm_oinfo;
+ for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++, loi++) {
struct lov_request *req;
if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) {
req->rq_stripe = i;
req->rq_idx = loi->loi_ost_idx;
- req->rq_oa = obdo_alloc();
- if (req->rq_oa == NULL) {
+ req->rq_oi.oi_oa = obdo_alloc();
+ if (req->rq_oi.oi_oa == NULL) {
OBD_FREE(req, sizeof(*req));
GOTO(out_set, rc = -ENOMEM);
}
- memcpy(req->rq_oa, src_oa, sizeof(*req->rq_oa));
- req->rq_oa->o_id = loi->loi_id;
- req->rq_oa->o_stripe_idx = i;
-
- if (src_oa->o_valid & OBD_MD_FLSIZE) {
- if (lov_stripe_offset(lsm, src_oa->o_size, i,
- &req->rq_oa->o_size) < 0 &&
- req->rq_oa->o_size)
- req->rq_oa->o_size--;
+ memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+ sizeof(*req->rq_oi.oi_oa));
+ req->rq_oi.oi_oa->o_id = loi->loi_id;
+ req->rq_oi.oi_oa->o_stripe_idx = i;
+ req->rq_oi.oi_cb_up = cb_setattr_update;
+ req->rq_rqset = set;
+
+ if (oinfo->oi_oa->o_valid & OBD_MD_FLSIZE) {
+ int off = lov_stripe_offset(oinfo->oi_md,
+ oinfo->oi_oa->o_size, i,
+ &req->rq_oi.oi_oa->o_size);
+
+ if (off < 0 && req->rq_oi.oi_oa->o_size)
+ req->rq_oi.oi_oa->o_size--;
+
CDEBUG(D_INODE, "stripe %d has size "LPU64"/"LPU64"\n",
- i, req->rq_oa->o_size, src_oa->o_size);
+ i, req->rq_oi.oi_oa->o_size,
+ oinfo->oi_oa->o_size);
}
lov_set_add_req(req, set);
}
RETURN(rc);
}
-int lov_update_setattr_set(struct lov_request_set *set,
- struct lov_request *req, int rc)
-{
- struct lov_obd *lov = &set->set_exp->exp_obd->u.lov;
- struct lov_stripe_md *lsm = set->set_md;
- ENTRY;
-
- lov_update_set(set, req, rc);
-
- /* grace error on inactive ost */
- if (rc && !lov->tgts[req->rq_idx].ltd_active)
- rc = 0;
-
- /* FIXME: LOV STACKING update loi data should be done by OSC *
- * when this is gone we can go back to using lov_update_common_set() */
- if (rc == 0) {
- if (req->rq_oa->o_valid & OBD_MD_FLMTIME)
- lsm->lsm_oinfo[req->rq_stripe].loi_lvb.lvb_ctime =
- req->rq_oa->o_ctime;
- if (req->rq_oa->o_valid & OBD_MD_FLMTIME)
- lsm->lsm_oinfo[req->rq_stripe].loi_lvb.lvb_mtime =
- req->rq_oa->o_mtime;
- if (req->rq_oa->o_valid & OBD_MD_FLATIME)
- lsm->lsm_oinfo[req->rq_stripe].loi_lvb.lvb_atime =
- req->rq_oa->o_atime;
- }
-
- RETURN(rc);
-}
-
-int lov_update_punch_set(struct lov_request_set *set, struct lov_request *req,
- int rc)
-{
- struct lov_obd *lov = &set->set_exp->exp_obd->u.lov;
- ENTRY;
-
- lov_update_set(set, req, rc);
- if (rc && !lov->tgts[req->rq_idx].ltd_active)
- rc = 0;
- /* FIXME in raid1 regime, should return 0 */
- RETURN(rc);
-}
-
int lov_fini_punch_set(struct lov_request_set *set)
{
int rc = 0;
RETURN(rc);
}
-int lov_prep_punch_set(struct obd_export *exp, struct obdo *src_oa,
- struct lov_stripe_md *lsm, obd_off start,
- obd_off end, struct obd_trans_info *oti,
+/* The callback for osc_punch that finalizes a request info when a response
+ * is received. */
+static int cb_update_punch(struct obd_info *oinfo, int rc)
+{
+ struct lov_request *lovreq;
+ lovreq = container_of(oinfo, struct lov_request, rq_oi);
+ return lov_update_common_set(lovreq->rq_rqset, lovreq, rc);
+}
+
+int lov_prep_punch_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
struct lov_request_set **reqset)
{
struct lov_request_set *set;
RETURN(-ENOMEM);
lov_init_set(set);
+ set->set_oi = oinfo;
set->set_exp = exp;
- set->set_md = lsm;
- set->set_oa = src_oa;
- loi = lsm->lsm_oinfo;
- for (i = 0; i < lsm->lsm_stripe_count; i++, loi++) {
+ loi = oinfo->oi_md->lsm_oinfo;
+ for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++, loi++) {
struct lov_request *req;
obd_off rs, re;
continue;
}
- if (!lov_stripe_intersects(lsm, i, start, end, &rs, &re))
+ if (!lov_stripe_intersects(oinfo->oi_md, i,
+ oinfo->oi_policy.l_extent.start,
+ oinfo->oi_policy.l_extent.end,
+ &rs, &re))
continue;
OBD_ALLOC(req, sizeof(*req));
req->rq_stripe = i;
req->rq_idx = loi->loi_ost_idx;
- req->rq_oa = obdo_alloc();
- if (req->rq_oa == NULL) {
+ req->rq_oi.oi_oa = obdo_alloc();
+ if (req->rq_oi.oi_oa == NULL) {
OBD_FREE(req, sizeof(*req));
GOTO(out_set, rc = -ENOMEM);
}
- memcpy(req->rq_oa, src_oa, sizeof(*req->rq_oa));
- req->rq_oa->o_id = loi->loi_id;
- req->rq_oa->o_stripe_idx = i;
+ memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+ sizeof(*req->rq_oi.oi_oa));
+ req->rq_oi.oi_oa->o_id = loi->loi_id;
+ req->rq_oi.oi_oa->o_stripe_idx = i;
+ req->rq_oi.oi_cb_up = cb_update_punch;
+ req->rq_rqset = set;
- req->rq_extent.start = rs;
- req->rq_extent.end = re;
- req->rq_extent.gid = -1;
+ req->rq_oi.oi_policy.l_extent.start = rs;
+ req->rq_oi.oi_policy.l_extent.end = re;
+ req->rq_oi.oi_policy.l_extent.gid = -1;
lov_set_add_req(req, set);
}
RETURN(rc);
}
-int lov_prep_sync_set(struct obd_export *exp, struct obdo *src_oa,
- struct lov_stripe_md *lsm, obd_off start,
- obd_off end, struct lov_request_set **reqset)
+int lov_prep_sync_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct obdo *src_oa, struct lov_stripe_md *lsm,
+ obd_off start, obd_off end,
+ struct lov_request_set **reqset)
{
struct lov_request_set *set;
struct lov_oinfo *loi = NULL;
lov_init_set(set);
set->set_exp = exp;
- set->set_md = lsm;
- set->set_oa = src_oa;
+ set->set_oi = oinfo;
+ set->set_oi->oi_md = lsm;
+ set->set_oi->oi_oa = src_oa;
loi = lsm->lsm_oinfo;
for (i = 0; i < lsm->lsm_stripe_count; i++, loi++) {
req->rq_stripe = i;
req->rq_idx = loi->loi_ost_idx;
- req->rq_oa = obdo_alloc();
- if (req->rq_oa == NULL) {
+ req->rq_oi.oi_oa = obdo_alloc();
+ if (req->rq_oi.oi_oa == NULL) {
OBD_FREE(req, sizeof(*req));
GOTO(out_set, rc = -ENOMEM);
}
- memcpy(req->rq_oa, src_oa, sizeof(*req->rq_oa));
- req->rq_oa->o_id = loi->loi_id;
- req->rq_oa->o_stripe_idx = i;
+ memcpy(req->rq_oi.oi_oa, src_oa, sizeof(*req->rq_oi.oi_oa));
+ req->rq_oi.oi_oa->o_id = loi->loi_id;
+ req->rq_oi.oi_oa->o_stripe_idx = i;
- req->rq_extent.start = rs;
- req->rq_extent.end = re;
- req->rq_extent.gid = -1;
+ req->rq_oi.oi_policy.l_extent.start = rs;
+ req->rq_oi.oi_policy.l_extent.end = re;
+ req->rq_oi.oi_policy.l_extent.gid = -1;
lov_set_add_req(req, set);
}
lov_fini_sync_set(set);
RETURN(rc);
}
+
+#define LOV_U64_MAX ((__u64)~0ULL)
+#define LOV_SUM_MAX(tot, add) \
+ do { \
+ if ((tot) + (add) < (tot)) \
+ (tot) = LOV_U64_MAX; \
+ else \
+ (tot) += (add); \
+ } while(0)
+
+int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs,int success)
+{
+ ENTRY;
+
+ if (success) {
+ __u32 expected_stripes = lov_get_stripecnt(&obd->u.lov, 0);
+
+ if (osfs->os_files != LOV_U64_MAX)
+ do_div(osfs->os_files, expected_stripes);
+ if (osfs->os_ffree != LOV_U64_MAX)
+ do_div(osfs->os_ffree, expected_stripes);
+
+ spin_lock(&obd->obd_osfs_lock);
+ memcpy(&obd->obd_osfs, osfs, sizeof(*osfs));
+ obd->obd_osfs_age = jiffies;
+ spin_unlock(&obd->obd_osfs_lock);
+ RETURN(0);
+ }
+
+ RETURN(-EIO);
+}
+
+int lov_fini_statfs_set(struct lov_request_set *set)
+{
+ int rc = 0;
+ ENTRY;
+
+ if (set == NULL)
+ RETURN(0);
+
+ if (set->set_completes) {
+ rc = lov_fini_statfs(set->set_obd, set->set_oi->oi_osfs,
+ set->set_success);
+ }
+
+ if (atomic_dec_and_test(&set->set_refcount))
+ lov_finish_set(set);
+
+ RETURN(rc);
+}
+
+void lov_update_statfs(struct obd_device *obd, struct obd_statfs *osfs,
+ struct obd_statfs *lov_sfs, int success)
+{
+ spin_lock(&obd->obd_osfs_lock);
+ memcpy(&obd->obd_osfs, lov_sfs, sizeof(*lov_sfs));
+ obd->obd_osfs_age = jiffies;
+ spin_unlock(&obd->obd_osfs_lock);
+
+ if (success == 0) {
+ memcpy(osfs, lov_sfs, sizeof(*lov_sfs));
+ } else {
+#ifdef MIN_DF
+ /* Sandia requested that df (and so, statfs) only
+ returned minimal available space on
+ a single OST, so people would be able to
+ write this much data guaranteed. */
+ if (osfs->os_bavail > lov_sfs->os_bavail) {
+ /* Presumably if new bavail is smaller,
+ new bfree is bigger as well */
+ osfs->os_bfree = lov_sfs->os_bfree;
+ osfs->os_bavail = lov_sfs->os_bavail;
+ }
+#else
+ osfs->os_bfree += lov_sfs->os_bfree;
+ osfs->os_bavail += lov_sfs->os_bavail;
+#endif
+ osfs->os_blocks += lov_sfs->os_blocks;
+ /* XXX not sure about this one - depends on policy.
+ * - could be minimum if we always stripe on all OBDs
+ * (but that would be wrong for any other policy,
+ * if one of the OBDs has no more objects left)
+ * - could be sum if we stripe whole objects
+ * - could be average, just to give a nice number
+ *
+ * To give a "reasonable" (if not wholly accurate)
+ * number, we divide the total number of free objects
+ * by expected stripe count (watch out for overflow).
+ */
+ LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files);
+ LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree);
+ }
+}
+
+/* The callback for osc_statfs_async that finalizes a request info when a
+ * response is received. */
+static int cb_statfs_update(struct obd_info *oinfo, int rc)
+{
+ struct lov_request *lovreq;
+ struct obd_statfs *osfs, *lov_sfs;
+ struct obd_device *obd;
+ struct lov_obd *lov;
+ int success;
+ ENTRY;
+
+ lovreq = container_of(oinfo, struct lov_request, rq_oi);
+ lov = &lovreq->rq_rqset->set_obd->u.lov;
+ obd = class_exp2obd(lov->tgts[lovreq->rq_idx].ltd_exp);
+
+ osfs = lovreq->rq_rqset->set_oi->oi_osfs;
+ lov_sfs = oinfo->oi_osfs;
+
+ success = lovreq->rq_rqset->set_success;
+
+ /* XXX: the same is done in lov_update_common_set, however
+ lovset->set_exp is not initialized. */
+ lov_update_set(lovreq->rq_rqset, lovreq, rc);
+ if (rc) {
+ if (!lov->tgts[lovreq->rq_idx].ltd_active)
+ rc = 0;
+ RETURN(rc);
+ }
+
+ lov_update_statfs(obd, osfs, lov_sfs, success);
+ RETURN(0);
+}
+
+int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo,
+ struct lov_request_set **reqset)
+{
+ struct lov_request_set *set;
+ struct lov_obd *lov = &obd->u.lov;
+ int rc = 0, i;
+ ENTRY;
+
+ OBD_ALLOC(set, sizeof(*set));
+ if (set == NULL)
+ RETURN(-ENOMEM);
+ lov_init_set(set);
+
+ set->set_obd = obd;
+ set->set_oi = oinfo;
+
+ /* We only get block data from the OBD */
+ for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+ struct lov_request *req;
+
+ if (lov->tgts[i].ltd_active == 0) {
+ CDEBUG(D_HA, "lov idx %d inactive\n", i);
+ continue;
+ }
+
+ OBD_ALLOC(req, sizeof(*req));
+ if (req == NULL)
+ GOTO(out_set, rc = -ENOMEM);
+
+ OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs));
+ if (req->rq_oi.oi_osfs == NULL) {
+ OBD_FREE(req, sizeof(*req));
+ GOTO(out_set, rc = -ENOMEM);
+ }
+
+ req->rq_idx = i;
+ req->rq_oi.oi_cb_up = cb_statfs_update;
+ req->rq_rqset = set;
+
+ lov_set_add_req(req, set);
+ }
+ if (!set->set_count)
+ GOTO(out_set, rc = -EIO);
+ *reqset = set;
+ RETURN(rc);
+out_set:
+ lov_fini_statfs_set(set);
+ RETURN(rc);
+}
#include <linux/iobuf.h>
#endif
#include <linux/lustre_compat25.h>
+#include <linux/lprocfs_status.h>
#ifdef EXT3_MULTIBLOCK_ALLOCATOR
#include <linux/ext3_extents.h>
struct iattr *iattr, int do_trunc)
{
struct inode *inode = dentry->d_inode;
- int rc;
+ int rc = 0;
lock_kernel();
iattr->ia_valid &= ~ATTR_SIZE;
EXT3_I(inode)->i_disksize = inode->i_size = iattr->ia_size;
- /* make sure _something_ gets set - so new inode
- * goes to disk (probably won't work over XFS */
- if (!(iattr->ia_valid & (ATTR_MODE | ATTR_MTIME | ATTR_CTIME))){
- iattr->ia_valid |= ATTR_MTIME;
- iattr->ia_mtime = inode->i_mtime;
+ if (iattr->ia_valid & ATTR_UID)
+ inode->i_uid = iattr->ia_uid;
+ if (iattr->ia_valid & ATTR_GID)
+ inode->i_gid = iattr->ia_gid;
+ if (iattr->ia_valid & ATTR_ATIME)
+ inode->i_atime = iattr->ia_atime;
+ if (iattr->ia_valid & ATTR_MTIME)
+ inode->i_mtime = iattr->ia_mtime;
+ if (iattr->ia_valid & ATTR_CTIME)
+ inode->i_ctime = iattr->ia_ctime;
+ if (iattr->ia_valid & ATTR_MODE) {
+ inode->i_mode = iattr->ia_mode;
+
+ if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+ inode->i_mode &= ~S_ISGID;
}
+
+ inode->i_sb->s_op->dirty_inode(inode);
+
+ goto out;
}
/* Don't allow setattr to change file type */
rc = inode_setattr(inode, iattr);
}
+ out:
unlock_kernel();
- return rc;
+ RETURN(rc);
}
static int fsfilt_ext3_iocontrol(struct inode * inode, struct file *file,
ptlrpc_req_set_repsize(req, repbufcnt, repsize);
mdc_get_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
- rc = ldlm_cli_enqueue(exp, req, obddev->obd_namespace, res_id,
- lock_type, &policy,lock_mode, &flags, cb_blocking,
- cb_completion, NULL, cb_data, NULL, 0, NULL,
- lockh);
+ rc = ldlm_cli_enqueue(exp, &req, res_id, lock_type, &policy,
+ lock_mode, &flags, cb_blocking, cb_completion,
+ NULL, cb_data, NULL, 0, NULL, lockh, 0);
mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
/* Similarly, if we're going to replay this request, we don't want to
res_id.name[0] = de->d_inode->i_ino;
res_id.name[1] = de->d_inode->i_generation;
- rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, res_id,
- LDLM_IBITS, &policy, lock_mode, &flags,
- ldlm_blocking_ast, ldlm_completion_ast,
- NULL, NULL, NULL, 0, NULL, lockh);
+ rc = ldlm_cli_enqueue_local(obd->obd_namespace, res_id,
+ LDLM_IBITS, &policy, lock_mode, &flags,
+ ldlm_blocking_ast, ldlm_completion_ast,
+ NULL, NULL, 0, NULL, lockh);
if (rc != ELDLM_OK) {
l_dput(de);
retval = ERR_PTR(-EIO); /* XXX translate ldlm code */
RETURN(rc);
}
-static
-void fsoptions_to_mds_flags(struct mds_obd *mds, char *options)
+static void fsoptions_to_mds_flags(struct mds_obd *mds, char *options)
{
char *p = options;
memcmp(options, "user_xattr", len) == 0) {
mds->mds_fl_user_xattr = 1;
} else if (len == sizeof("nouser_xattr") - 1 &&
- memcmp(options, "nouser_xattr", len) == 0) {
+ memcmp(options, "nouser_xattr", len) == 0) {
mds->mds_fl_user_xattr = 0;
} else if (len == sizeof("acl") - 1 &&
- memcmp(options, "acl", len) == 0) {
+ memcmp(options, "acl", len) == 0) {
#ifdef CONFIG_FS_POSIX_ACL
mds->mds_fl_acl = 1;
#else
CWARN("ignoring unsupported acl mount option\n");
memmove(options, p, strlen(p) + 1);
+ p = options;
#endif
} else if (len == sizeof("noacl") - 1 &&
- memcmp(options, "noacl", len) == 0) {
+ memcmp(options, "noacl", len) == 0) {
#ifdef CONFIG_FS_POSIX_ACL
mds->mds_fl_acl = 0;
#else
memmove(options, p, strlen(p) + 1);
+ p = options;
#endif
}
{
struct inode *inode = dchild->d_inode;
struct obd_trans_info oti = { 0 };
- struct lov_stripe_md *lsm = NULL;
struct lov_mds_md *lmm = NULL;
int rc, lmm_size;
struct mds_body *body;
- struct obdo *oa;
+ struct obd_info oinfo = { { { 0 } } };
void *lmm_buf;
ENTRY;
if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_MDS_ALLOC_OBDO))
GOTO(out_ids, rc = -ENOMEM);
- oa = obdo_alloc();
- if (oa == NULL)
+ oinfo.oi_oa = obdo_alloc();
+ if (oinfo.oi_oa == NULL)
GOTO(out_ids, rc = -ENOMEM);
- oa->o_uid = 0; /* must have 0 uid / gid on OST */
- oa->o_gid = 0;
- oa->o_mode = S_IFREG | 0600;
- oa->o_id = inode->i_ino;
- oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLFLAGS |
+ oinfo.oi_oa->o_uid = 0; /* must have 0 uid / gid on OST */
+ oinfo.oi_oa->o_gid = 0;
+ oinfo.oi_oa->o_mode = S_IFREG | 0600;
+ oinfo.oi_oa->o_id = inode->i_ino;
+ oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLFLAGS |
OBD_MD_FLMODE | OBD_MD_FLUID | OBD_MD_FLGID;
- oa->o_size = 0;
+ oinfo.oi_oa->o_size = 0;
- obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
+ obdo_from_inode(oinfo.oi_oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
OBD_MD_FLMTIME | OBD_MD_FLCTIME);
if (!(rec->ur_flags & MDS_OPEN_HAS_OBJS)) {
if (rec->ur_flags & MDS_OPEN_HAS_EA) {
rc = obd_iocontrol(OBD_IOC_LOV_SETSTRIPE,
mds->mds_osc_exp,
- 0, &lsm, rec->ur_eadata);
+ 0, &oinfo.oi_md, rec->ur_eadata);
if (rc)
GOTO(out_oa, rc);
} else {
if (rc > 0)
rc = obd_iocontrol(OBD_IOC_LOV_SETSTRIPE,
mds->mds_osc_exp,
- 0, &lsm, lmm);
+ 0, &oinfo.oi_md, lmm);
OBD_FREE(lmm, mds->mds_max_mdsize);
if (rc)
GOTO(out_oa, rc);
}
- rc = obd_create(mds->mds_osc_exp, oa, &lsm, &oti);
+ rc = obd_create(mds->mds_osc_exp, oinfo.oi_oa,
+ &oinfo.oi_md, &oti);
if (rc) {
int level = D_ERROR;
if (rc == -ENOSPC)
}
} else {
rc = obd_iocontrol(OBD_IOC_LOV_SETEA, mds->mds_osc_exp,
- 0, &lsm, rec->ur_eadata);
+ 0, &oinfo.oi_md, rec->ur_eadata);
if (rc) {
GOTO(out_oa, rc);
}
- lsm->lsm_object_id = oa->o_id;
+ oinfo.oi_md->lsm_object_id = oinfo.oi_oa->o_id;
}
if (inode->i_size) {
- oa->o_size = inode->i_size;
- obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
- OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLSIZE);
+ oinfo.oi_oa->o_size = inode->i_size;
+ obdo_from_inode(oinfo.oi_oa, inode, OBD_MD_FLTYPE |
+ OBD_MD_FLATIME | OBD_MD_FLMTIME |
+ OBD_MD_FLCTIME | OBD_MD_FLSIZE);
/* pack lustre id to OST */
- oa->o_fid = body->fid1.id;
- oa->o_generation = body->fid1.generation;
- oa->o_valid |= OBD_MD_FLFID | OBD_MD_FLGENER;
+ oinfo.oi_oa->o_fid = body->fid1.id;
+ oinfo.oi_oa->o_generation = body->fid1.generation;
+ oinfo.oi_oa->o_valid |= OBD_MD_FLFID | OBD_MD_FLGENER;
- rc = obd_setattr(mds->mds_osc_exp, oa, lsm, &oti);
+ rc = obd_setattr_rqset(mds->mds_osc_exp, &oinfo, &oti);
if (rc) {
CERROR("error setting attrs for inode %lu: rc %d\n",
inode->i_ino, rc);
if (rc > 0) {
- CERROR("obd_setattr returned bad rc %d\n", rc);
+ CERROR("obd_setattr_async returned bad rc %d\n",
+ rc);
rc = -EIO;
}
GOTO(out_oa, rc);
}
body->valid |= OBD_MD_FLBLKSZ | OBD_MD_FLEASIZE;
- obdo_refresh_inode(inode, oa, OBD_MD_FLBLKSZ);
+ obdo_refresh_inode(inode, oinfo.oi_oa, OBD_MD_FLBLKSZ);
- LASSERT(lsm && lsm->lsm_object_id);
+ LASSERT(oinfo.oi_md && oinfo.oi_md->lsm_object_id);
lmm = NULL;
- rc = obd_packmd(mds->mds_osc_exp, &lmm, lsm);
+ rc = obd_packmd(mds->mds_osc_exp, &lmm, oinfo.oi_md);
if (rc < 0) {
CERROR("cannot pack lsm, err = %d\n", rc);
GOTO(out_oa, rc);
obd_free_diskmd(mds->mds_osc_exp, &lmm);
out_oa:
oti_free_cookies(&oti);
- obdo_free(oa);
+ obdo_free(oinfo.oi_oa);
out_ids:
if (rc) {
OBD_FREE(*ids, mds->mds_lov_desc.ld_tgt_count * sizeof(**ids));
*ids = NULL;
}
- if (lsm)
- obd_free_memmd(mds->mds_osc_exp, &lsm);
+ if (oinfo.oi_md)
+ obd_free_memmd(mds->mds_osc_exp, &oinfo.oi_md);
RETURN(rc);
}
* Now that exp_outstanding_reply is a list, it's just using mfd != NULL
* to detect a re-open */
if (mfd == NULL) {
- if (rec->ur_flags & MDS_OPEN_JOIN_FILE) {
- rc = mds_join_file(rec, req, dchild, NULL);
- if (rc)
- GOTO(out_dput, rc);
- }
+ if (rec->ur_flags & MDS_OPEN_JOIN_FILE) {
+ rc = mds_join_file(rec, req, dchild, NULL);
+ if (rc)
+ GOTO(out_dput, rc);
+ }
mntget(mds->mds_vfsmnt);
CERROR("Re-opened file \n");
mfd = mds_dentry_open(dchild, mds->mds_vfsmnt,
if (child_lockh == NULL)
child_lockh = &lockh;
- rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, child_res_id,
- LDLM_PLAIN, NULL, LCK_EX, &lock_flags,
- ldlm_blocking_ast, ldlm_completion_ast,
- NULL, NULL, NULL, 0, NULL, child_lockh);
+ rc = ldlm_cli_enqueue_local(obd->obd_namespace, child_res_id,
+ LDLM_PLAIN, NULL, LCK_EX, &lock_flags,
+ ldlm_blocking_ast, ldlm_completion_ast,
+ NULL, NULL, 0, NULL, child_lockh);
if (rc != ELDLM_OK)
- CERROR("ldlm_cli_enqueue: %d\n", rc);
+ CERROR("ldlm_cli_enqueue_local: %d\n", rc);
else if (child_lockh == &lockh)
ldlm_lock_decref(child_lockh, LCK_EX);
child_res_id.name[0] = dchild->d_inode->i_ino;
child_res_id.name[1] = dchild->d_inode->i_generation;
- rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace,
- child_res_id, LDLM_IBITS, &policy,
- child_mode, &lock_flags,
- ldlm_blocking_ast, ldlm_completion_ast,
- NULL, NULL, NULL, 0, NULL, child_lockh);
+ rc = ldlm_cli_enqueue_local(obd->obd_namespace, child_res_id,
+ LDLM_IBITS, &policy, child_mode,
+ &lock_flags, ldlm_blocking_ast,
+ ldlm_completion_ast, NULL, NULL,
+ 0, NULL, child_lockh);
if (rc != ELDLM_OK)
GOTO(cleanup, rc);
struct llog_cookie *logcookies, struct ll_fid *fid)
{
struct mds_obd *mds = &obd->u.mds;
- struct lov_stripe_md *lsm = NULL;
struct obd_trans_info oti = { 0 };
- struct obdo *oa = NULL;
+ struct obd_info oinfo = { { { 0 } } };
int rc;
ENTRY;
RETURN(0);
/* first get memory EA */
- oa = obdo_alloc();
- if (!oa)
+ oinfo.oi_oa = obdo_alloc();
+ if (!oinfo.oi_oa)
RETURN(-ENOMEM);
LASSERT(lmm);
- rc = obd_unpackmd(mds->mds_osc_exp, &lsm, lmm, lmm_size);
+ rc = obd_unpackmd(mds->mds_osc_exp, &oinfo.oi_md, lmm, lmm_size);
if (rc < 0) {
CERROR("Error unpack md %p for inode %lu\n", lmm, inode->i_ino);
GOTO(out, rc);
}
- rc = obd_checkmd(mds->mds_osc_exp, obd->obd_self_export, lsm);
+ rc = obd_checkmd(mds->mds_osc_exp, obd->obd_self_export, oinfo.oi_md);
if (rc) {
- CERROR("Error revalidate lsm %p \n", lsm);
+ CERROR("Error revalidate lsm %p \n", oinfo.oi_md);
GOTO(out, rc);
}
/* then fill oa */
- oa->o_id = lsm->lsm_object_id;
- oa->o_uid = inode->i_uid;
- oa->o_gid = inode->i_gid;
- oa->o_valid = OBD_MD_FLID | OBD_MD_FLUID | OBD_MD_FLGID;
+ oinfo.oi_oa->o_id = oinfo.oi_md->lsm_object_id;
+ oinfo.oi_oa->o_uid = inode->i_uid;
+ oinfo.oi_oa->o_gid = inode->i_gid;
+ oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLUID | OBD_MD_FLGID;
if (logcookies) {
- oa->o_valid |= OBD_MD_FLCOOKIE;
+ oinfo.oi_oa->o_valid |= OBD_MD_FLCOOKIE;
oti.oti_logcookies = logcookies;
}
LASSERT(fid != NULL);
- oa->o_fid = fid->id;
- oa->o_generation = fid->generation;
- oa->o_valid |= OBD_MD_FLFID | OBD_MD_FLGENER;
+ oinfo.oi_oa->o_fid = fid->id;
+ oinfo.oi_oa->o_generation = fid->generation;
+ oinfo.oi_oa->o_valid |= OBD_MD_FLFID | OBD_MD_FLGENER;
- /* do setattr from mds to ost asynchronously */
- rc = obd_setattr_async(mds->mds_osc_exp, oa, lsm, &oti);
+ /* do async setattr from mds to ost not waiting for responses. */
+ rc = obd_setattr_async(mds->mds_osc_exp, &oinfo, &oti, NULL);
if (rc)
CDEBUG(D_INODE, "mds to ost setattr objid 0x"LPX64
- " on ost error %d\n", lsm->lsm_object_id, rc);
+ " on ost error %d\n", oinfo.oi_md->lsm_object_id, rc);
out:
- if (lsm)
- obd_free_memmd(mds->mds_osc_exp, &lsm);
- obdo_free(oa);
+ if (oinfo.oi_md)
+ obd_free_memmd(mds->mds_osc_exp, &oinfo.oi_md);
+ obdo_free(oinfo.oi_oa);
RETURN(rc);
}
res_id[0]->name[0], res_id[1]->name[0]);
flags = LDLM_FL_LOCAL_ONLY;
- rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, *res_id[0],
- LDLM_IBITS, policies[0], lock_modes[0], &flags,
- ldlm_blocking_ast, ldlm_completion_ast,
- NULL, NULL, NULL, 0, NULL, handles[0]);
+ rc = ldlm_cli_enqueue_local(obd->obd_namespace, *res_id[0],
+ LDLM_IBITS, policies[0], lock_modes[0],
+ &flags, ldlm_blocking_ast,
+ ldlm_completion_ast, NULL, NULL, 0,
+ NULL, handles[0]);
if (rc != ELDLM_OK)
RETURN(-EIO);
ldlm_lock_dump_handle(D_OTHER, handles[0]);
ldlm_lock_addref(handles[1], lock_modes[1]);
} else if (res_id[1]->name[0] != 0) {
flags = LDLM_FL_LOCAL_ONLY;
- rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace,
- *res_id[1], LDLM_IBITS, policies[1],
- lock_modes[1], &flags,
- ldlm_blocking_ast, ldlm_completion_ast,
- NULL, NULL, NULL, 0, NULL, handles[1]);
+ rc = ldlm_cli_enqueue_local(obd->obd_namespace, *res_id[1],
+ LDLM_IBITS, policies[1],
+ lock_modes[1], &flags,
+ ldlm_blocking_ast,
+ ldlm_completion_ast, NULL, NULL,
+ 0, NULL, handles[1]);
if (rc != ELDLM_OK) {
ldlm_lock_decref(handles[0], lock_modes[0]);
RETURN(-EIO);
if (i < 3)
try_to_aggregate_locks(res_id[i], policies[i],
res_id[i+1], policies[i+1]);
- rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace,
- *res_id[i], LDLM_IBITS,
- policies[i],
- lock_modes[i], &flags,
- ldlm_blocking_ast,
- ldlm_completion_ast, NULL, NULL,
- NULL, 0, NULL, dlm_handles[i]);
+ rc = ldlm_cli_enqueue_local(obd->obd_namespace,
+ *res_id[i], LDLM_IBITS,
+ policies[i], lock_modes[i],
+ &flags, ldlm_blocking_ast,
+ ldlm_completion_ast, NULL,
+ NULL, 0, NULL,
+ dlm_handles[i]);
if (rc != ELDLM_OK)
GOTO(out_err, rc = -EIO);
ldlm_lock_dump_handle(D_OTHER, dlm_handles[i]);
GOTO(cleanup, rc = 1);
}
- rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace,
- *child_res_id, LDLM_IBITS, child_policy,
- child_mode, &flags, ldlm_blocking_ast,
- ldlm_completion_ast, NULL, NULL, NULL, 0,
- NULL, child_lockh);
+ rc = ldlm_cli_enqueue_local(obd->obd_namespace, *child_res_id,
+ LDLM_IBITS, child_policy,
+ child_mode, &flags,
+ ldlm_blocking_ast,
+ ldlm_completion_ast, NULL,
+ NULL, 0, NULL, child_lockh);
if (rc != ELDLM_OK)
GOTO(cleanup, rc = -EIO);
} else {
struct lustre_handle *lockh)
{
struct config_llog_data *cld = (struct config_llog_data *)data;
- struct obd_device *obd = class_exp2obd(exp);
int rc;
ENTRY;
/* We need a callback for every lockholder, so don't try to
ldlm_lock_match (see rev 1.1.2.11.2.47) */
- rc = ldlm_cli_enqueue(exp, NULL, obd->obd_namespace, cld->cld_resid,
+ rc = ldlm_cli_enqueue(exp, NULL, cld->cld_resid,
type, NULL, mode, flags,
mgc_blocking_ast, ldlm_completion_ast, NULL,
- data, NULL, 0, NULL, lockh);
+ data, NULL, 0, NULL, lockh, 0);
RETURN(rc);
}
rc = mgc_logname2resid(fsname, &res_id);
if (!rc)
- rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, res_id,
- LDLM_PLAIN, NULL, LCK_EX, &flags,
- ldlm_blocking_ast, ldlm_completion_ast,
- NULL, fsname, NULL, 0, NULL, lockh);
+ rc = ldlm_cli_enqueue_local(obd->obd_namespace, res_id,
+ LDLM_PLAIN, NULL, LCK_EX,
+ &flags, ldlm_blocking_ast,
+ ldlm_completion_ast, NULL,
+ fsname, 0, NULL, lockh);
if (rc)
CERROR("can't take cfg lock for %s (%d)\n", fsname, rc);
extern struct list_head obd_types;
spinlock_t obd_types_lock;
-cfs_mem_cache_t *obdo_cachep = NULL;
+cfs_mem_cache_t *obdo_cachep;
EXPORT_SYMBOL(obdo_cachep);
-cfs_mem_cache_t *import_cachep = NULL;
+cfs_mem_cache_t *import_cachep;
int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, packmd);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpackmd);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, checkmd);
int lprocfs_write_helper(const char *buffer, unsigned long count,
int *val)
{
- char kernbuf[20], *end;
+ return lprocfs_write_frac_helper(buffer, count, val, 1);
+}
+
+int lprocfs_write_frac_helper(const char *buffer, unsigned long count,
+ int *val, int mult)
+{
+ char kernbuf[20], *end, *pbuf;
if (count > (sizeof(kernbuf) - 1))
return -EINVAL;
return -EFAULT;
kernbuf[count] = '\0';
+ pbuf = kernbuf;
+ if (*pbuf == '-') {
+ mult = -mult;
+ pbuf++;
+ }
- *val = simple_strtol(kernbuf, &end, 0);
- if (kernbuf == end)
+ *val = (int)simple_strtoul(pbuf, &end, 10) * mult;
+ if (pbuf == end)
return -EINVAL;
+ if (end != NULL && *end == '.') {
+ int temp_val, pow = 1;
+ int i;
+
+ pbuf = end + 1;
+ if (strlen(pbuf) > 5)
+ pbuf[5] = '\0'; /*only allow 5bits fractional*/
+
+ temp_val = (int)simple_strtoul(pbuf, &end, 10) * mult;
+
+ if (pbuf < end) {
+ for (i = 0; i < (end - pbuf); i++)
+ pow *= 10;
+
+ *val += temp_val / pow;
+ }
+ }
return 0;
}
+int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val, int mult)
+{
+ long decimal_val,frac_val;
+ int prtn;
+
+ if (count < 10)
+ return -EINVAL;
+
+ decimal_val =val / mult;
+ prtn = snprintf(buffer, count, "%ld", decimal_val);
+ frac_val = val % mult;
+
+ if (prtn < (count - 4) && frac_val > 0) {
+ long temp_frac;
+ int i, temp_mult = 1, frac_bits = 0;
+
+ temp_frac = frac_val * 10;
+ buffer[prtn++] = '.';
+ while (frac_bits < 2 && (temp_frac / mult) < 1 ) { /*only reserved 2bits fraction*/
+ buffer[prtn++] ='0';
+ temp_frac *= 10;
+ frac_bits++;
+ }
+ /*
+ Need to think these cases :
+ 1. #echo x.00 > /proc/xxx output result : x
+ 2. #echo x.0x > /proc/xxx output result : x.0x
+ 3. #echo x.x0 > /proc/xxx output result : x.x
+ 4. #echo x.xx > /proc/xxx output result : x.xx
+ Only reserved 2bits fraction.
+ */
+ for (i = 0; i < (5 - prtn); i++)
+ temp_mult *= 10;
+
+ frac_bits = min((int)count - prtn, 3 - frac_bits);
+ prtn += snprintf(buffer + prtn, frac_bits, "%ld", frac_val * temp_mult / mult);
+
+ prtn--;
+ while(buffer[prtn] < '1' || buffer[prtn] > '9') {
+ prtn--;
+ if (buffer[prtn] == '.') {
+ prtn--;
+ break;
+ }
+ }
+ prtn++;
+ }
+ buffer[prtn++] ='\n';
+ return prtn;
+}
+
int lprocfs_write_u64_helper(const char *buffer, unsigned long count,__u64 *val)
{
- char kernbuf[22], *end;
+ return lprocfs_write_frac_u64_helper(buffer, count, val, 1);
+}
- if (count > (sizeof(kernbuf) - 1))
+int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count,
+ __u64 *val, int mult)
+{
+ char kernbuf[22], *end, *pbuf;
+
+ if (count > (sizeof(kernbuf) - 1) )
return -EINVAL;
if (copy_from_user(kernbuf, buffer, count))
return -EFAULT;
kernbuf[count] = '\0';
+ pbuf = kernbuf;
+ if (*pbuf == '-') {
+ mult = -mult;
+ pbuf++;
+ }
- if (kernbuf[0] == '-')
- *val = -simple_strtoull(kernbuf + 1, &end, 0);
- else
- *val = simple_strtoull(kernbuf, &end, 0);
- if (kernbuf == end)
+ *val = simple_strtoull(pbuf, &end, 10) * mult;
+ if (pbuf == end)
return -EINVAL;
+ if (end != NULL && *end == '.') {
+ int temp_val;
+ int i, pow = 1;
+
+ pbuf = end + 1;
+ if (strlen(pbuf) > 10)
+ pbuf[10] = '\0';
+
+ temp_val = (int)simple_strtoull(pbuf, &end, 10) * mult;
+
+ if (pbuf < end) {
+ for (i = 0; i < (end - pbuf); i++)
+ pow *= 10;
+
+ *val += (__u64)(temp_val / pow);
+ }
+ }
return 0;
}
EXPORT_SYMBOL(lprocfs_rd_filesfree);
EXPORT_SYMBOL(lprocfs_write_helper);
+EXPORT_SYMBOL(lprocfs_write_frac_helper);
+EXPORT_SYMBOL(lprocfs_read_frac_helper);
EXPORT_SYMBOL(lprocfs_write_u64_helper);
+EXPORT_SYMBOL(lprocfs_write_frac_u64_helper);
#endif /* LPROCFS*/
CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n",
obd_timeout, lcfg->lcfg_num);
obd_timeout = max(lcfg->lcfg_num, 1U);
- if (ldlm_timeout >= obd_timeout)
- ldlm_timeout = max(obd_timeout / 3, 1U);
GOTO(out, err = 0);
}
case LCFG_SET_UPCALL: {
RETURN(0);
}
-static int echo_getattr(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *md)
+static int echo_getattr(struct obd_export *exp, struct obd_info *oinfo)
{
struct obd_device *obd = class_exp2obd(exp);
- obd_id id = oa->o_id;
+ obd_id id = oinfo->oi_oa->o_id;
ENTRY;
if (!obd) {
RETURN(-EINVAL);
}
- if (!(oa->o_valid & OBD_MD_FLID)) {
- CERROR("obdo missing FLID valid flag: "LPX64"\n", oa->o_valid);
+ if (!(oinfo->oi_oa->o_valid & OBD_MD_FLID)) {
+ CERROR("obdo missing FLID valid flag: "LPX64"\n",
+ oinfo->oi_oa->o_valid);
RETURN(-EINVAL);
}
- obdo_cpy_md(oa, &obd->u.echo.eo_oa, oa->o_valid);
- oa->o_id = id;
+ obdo_cpy_md(oinfo->oi_oa, &obd->u.echo.eo_oa, oinfo->oi_oa->o_valid);
+ oinfo->oi_oa->o_id = id;
RETURN(0);
}
-static int echo_setattr(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *md, struct obd_trans_info *oti)
+static int echo_setattr(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti)
{
struct obd_device *obd = class_exp2obd(exp);
RETURN(-EINVAL);
}
- if (!(oa->o_valid & OBD_MD_FLID)) {
- CERROR("obdo missing FLID valid flag: "LPX64"\n", oa->o_valid);
+ if (!(oinfo->oi_oa->o_valid & OBD_MD_FLID)) {
+ CERROR("obdo missing FLID valid flag: "LPX64"\n",
+ oinfo->oi_oa->o_valid);
RETURN(-EINVAL);
}
- memcpy(&obd->u.echo.eo_oa, oa, sizeof(*oa));
+ memcpy(&obd->u.echo.eo_oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa));
- if (oa->o_id & 4) {
+ if (oinfo->oi_oa->o_id & 4) {
/* Save lock to force ACKed reply */
ldlm_lock_addref (&obd->u.echo.eo_nl_lock, LCK_NL);
oti->oti_ack_locks[0].mode = LCK_NL;
RETURN(-ENOMEM);
}
- rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, res_id,
- LDLM_PLAIN, NULL, LCK_NL, &lock_flags,
- NULL, ldlm_completion_ast, NULL, NULL,
- NULL, 0, NULL, &obd->u.echo.eo_nl_lock);
+ rc = ldlm_cli_enqueue_local(obd->obd_namespace, res_id, LDLM_PLAIN,
+ NULL, LCK_NL, &lock_flags, NULL,
+ ldlm_completion_ast, NULL, NULL,
+ 0, NULL, &obd->u.echo.eo_nl_lock);
LASSERT (rc == ELDLM_OK);
lprocfs_init_vars(echo, &lvars);
obd_size count, struct obd_trans_info *oti)
{
struct echo_client_obd *ec = &obd->u.echo_client;
+ struct obd_info oinfo = { { { 0 } } };
obd_count npages;
struct brw_page *pga;
struct brw_page *pgp;
pgp->flag = 0;
if (verify)
- echo_client_page_debug_setup(lsm, pgp->pg, rw,
+ echo_client_page_debug_setup(lsm, pgp->pg, rw,
oa->o_id, off, pgp->count);
}
- rc = obd_brw(rw, ec->ec_exp, oa, lsm, npages, pga, oti);
+ oinfo.oi_oa = oa;
+ oinfo.oi_md = lsm;
+ rc = obd_brw(rw, ec->ec_exp, &oinfo, npages, pga, oti);
out:
if (rc != 0 || rw != OBD_BRW_READ)
struct obd_trans_info *oti)
{
struct echo_client_obd *ec = &obd->u.echo_client;
+ struct obd_info oinfo = { { { 0 } } };
obd_count npages;
struct brw_page *pga;
struct brw_page *pgp;
int i;
int rc;
- LASSERT (rw == OBD_BRW_WRITE ||
- rw == OBD_BRW_READ);
+ LASSERT (rw == OBD_BRW_WRITE || rw == OBD_BRW_READ);
/* NB: for now, only whole pages, page aligned */
pgp->flag = 0;
}
- rc = obd_brw(rw, ec->ec_exp, oa, lsm, npages, pga, oti);
+ oinfo.oi_oa = oa;
+ oinfo.oi_md = lsm;
+ rc = obd_brw(rw, ec->ec_exp, &oinfo, npages, pga, oti);
// if (rw == OBD_BRW_READ)
// mark_dirty_kiobuf (kiobuf, count);
struct obd_device *obd = exp->exp_obd;
struct echo_client_obd *ec = &obd->u.echo_client;
struct lustre_handle *ulh = obdo_handle (oa);
+ struct obd_enqueue_info einfo = { 0 };
+ struct obd_info oinfo = { { { 0 } } };
struct ec_object *eco;
struct ec_lock *ecl;
- int flags;
int rc;
if (!(mode == LCK_PR || mode == LCK_PW))
ecl->ecl_policy.l_extent.end =
(nob == 0) ? ((obd_off) -1) : (offset + nob - 1);
- flags = 0;
- rc = obd_enqueue(ec->ec_exp, eco->eco_lsm, LDLM_EXTENT,
- &ecl->ecl_policy, mode, &flags, echo_ldlm_callback,
- ldlm_completion_ast, NULL, eco, sizeof(struct ost_lvb),
- lustre_swab_ost_lvb, &ecl->ecl_lock_handle);
+ einfo.ei_type = LDLM_EXTENT;
+ einfo.ei_mode = mode;
+ einfo.ei_cb_bl = echo_ldlm_callback;
+ einfo.ei_cb_cp = ldlm_completion_ast;
+ einfo.ei_cb_gl = NULL;
+ einfo.ei_cbdata = eco;
+
+ oinfo.oi_policy = ecl->ecl_policy;
+ oinfo.oi_lockh = &ecl->ecl_lock_handle;
+ oinfo.oi_md = eco->eco_lsm;
+ rc = obd_enqueue(ec->ec_exp, &oinfo, &einfo);
if (rc != 0)
goto failed_1;
case OBD_IOC_GETATTR:
rc = echo_get_object (&eco, obd, &data->ioc_obdo1);
if (rc == 0) {
- rc = obd_getattr(ec->ec_exp, &data->ioc_obdo1,
- eco->eco_lsm);
+ struct obd_info oinfo = { { { 0 } } };
+ oinfo.oi_md = eco->eco_lsm;
+ oinfo.oi_oa = &data->ioc_obdo1;
+ rc = obd_getattr(ec->ec_exp, &oinfo);
echo_put_object(eco);
}
GOTO(out, rc);
rc = echo_get_object (&eco, obd, &data->ioc_obdo1);
if (rc == 0) {
- rc = obd_setattr(ec->ec_exp, &data->ioc_obdo1,
- eco->eco_lsm, NULL);
+ struct obd_info oinfo = { { { 0 } } };
+ oinfo.oi_oa = &data->ioc_obdo1;
+ oinfo.oi_md = eco->eco_lsm;
+
+ rc = obd_setattr(ec->ec_exp, &oinfo, NULL);
echo_put_object(eco);
}
GOTO(out, rc);
rc = filter_client_add(obd, filter, fed, cl_idx);
LASSERTF(rc == 0, "rc = %d\n", rc); /* can't fail existing */
-
fcd = NULL;
exp->exp_replay_needed = 1;
exp->exp_connecting = 0;
if (last_rcvd > le64_to_cpu(fsd->lsd_last_transno))
fsd->lsd_last_transno = cpu_to_le64(last_rcvd);
-
}
if (fcd)
return dparent;
rc = filter_lock_dentry(obd, dparent);
- fsfilt_check_slow(now, obd_timeout, "parent lock");
+ fsfilt_check_slow(obd, now, obd_timeout, "parent lock");
return rc ? ERR_PTR(rc) : dparent;
}
ENTRY;
/* Tell the clients that the object is gone now and that they should
* throw away any cached pages. */
- rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, res_id,
- LDLM_EXTENT, &policy, LCK_PW,
- &flags, ldlm_blocking_ast, ldlm_completion_ast,
- NULL, NULL, NULL, 0, NULL, &lockh);
+ rc = ldlm_cli_enqueue_local(obd->obd_namespace, res_id, LDLM_EXTENT,
+ &policy, LCK_PW, &flags, ldlm_blocking_ast,
+ ldlm_completion_ast, NULL, NULL, 0, NULL,
+ &lockh);
/* We only care about the side-effects, just drop the lock. */
if (rc == ELDLM_OK)
static int filter_connect_internal(struct obd_export *exp,
struct obd_connect_data *data)
{
- if (!data)
+ if (!data)
RETURN(0);
-
+
CDEBUG(D_RPCTRACE, "%s: cli %s/%p ocd_connect_flags: "LPX64
" ocd_version: %x ocd_grant: %d\n",
exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp,
return dchild;
}
-static int filter_getattr(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *md)
+static int filter_getattr(struct obd_export *exp, struct obd_info *oinfo)
{
struct dentry *dentry = NULL;
struct obd_device *obd;
RETURN(-EINVAL);
}
- dentry = filter_oa2dentry(obd, oa);
+ dentry = filter_oa2dentry(obd, oinfo->oi_oa);
if (IS_ERR(dentry))
RETURN(PTR_ERR(dentry));
/* Limit the valid bits in the return data to what we actually use */
- oa->o_valid = OBD_MD_FLID;
- obdo_from_inode(oa, dentry->d_inode, FILTER_VALID_FLAGS);
+ oinfo->oi_oa->o_valid = OBD_MD_FLID;
+ obdo_from_inode(oinfo->oi_oa, dentry->d_inode, FILTER_VALID_FLAGS);
f_dput(dentry);
RETURN(rc);
unsigned int cur_ids[MAXQUOTAS] = {oa->o_uid, oa->o_gid};
int rc2 = lquota_adjust(quota_interface, exp->exp_obd, cur_ids,
orig_ids, rc, FSFILT_OP_SETATTR);
- CDEBUG(rc2 ? D_ERROR : D_QUOTA,
+ CDEBUG(rc2 ? D_ERROR : D_QUOTA,
"filter adjust qunit. (rc:%d)\n", rc2);
}
return rc;
}
/* this is called from filter_truncate() until we have filter_punch() */
-int filter_setattr(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *md, struct obd_trans_info *oti)
+int filter_setattr(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti)
{
- struct ldlm_res_id res_id = { .name = { oa->o_id } };
+ struct ldlm_res_id res_id = { .name = { oinfo->oi_oa->o_id } };
struct ldlm_valblock_ops *ns_lvbo;
struct lvfs_run_ctxt saved;
struct filter_obd *filter;
int rc;
ENTRY;
- dentry = __filter_oa2dentry(exp->exp_obd, oa, __FUNCTION__, 1);
+ dentry = __filter_oa2dentry(exp->exp_obd, oinfo->oi_oa,
+ __FUNCTION__, 1);
if (IS_ERR(dentry))
RETURN(PTR_ERR(dentry));
lock_kernel();
/* setting objects attributes (including owner/group) */
- rc = filter_setattr_internal(exp, dentry, oa, oti);
+ rc = filter_setattr_internal(exp, dentry, oinfo->oi_oa, oti);
if (rc)
GOTO(out_unlock, rc);
ldlm_resource_putref(res);
}
- oa->o_valid = OBD_MD_FLID;
+ oinfo->oi_oa->o_valid = OBD_MD_FLID;
/* Quota release need uid/gid info */
- obdo_from_inode(oa, dentry->d_inode,
+ obdo_from_inode(oinfo->oi_oa, dentry->d_inode,
FILTER_VALID_FLAGS | OBD_MD_FLUID | OBD_MD_FLGID);
EXIT;
/* set EROFS to state field if FS is mounted as RDONLY. The goal is to
* stop creating files on MDS if OST is not good shape to create
* objects.*/
- osfs->os_state = (filter->fo_obt.obt_sb->s_flags & MS_RDONLY) ?
+ osfs->os_state = (filter->fo_obt.obt_sb->s_flags & MS_RDONLY) ?
EROFS : 0;
RETURN(rc);
}
fcc = obdo_logcookie(oa);
llog_cancel(llog_get_context(obd, fcc->lgc_subsys + 1),
NULL, 1, fcc, 0);
+ fcc = NULL; /* we didn't allocate fcc, don't free it */
}
- fcc = NULL;
GOTO(cleanup, rc = -ENOENT);
}
qcids[USRQUOTA] = oa->o_uid;
qcids[GRPQUOTA] = oa->o_gid;
rc2 = lquota_adjust(quota_interface, obd, qcids, NULL, rc,
- FSFILT_OP_UNLINK);
- CDEBUG(rc2 ? D_ERROR : D_QUOTA,
+ FSFILT_OP_UNLINK);
+ CDEBUG(rc2 ? D_ERROR : D_QUOTA,
"filter adjust qunit! (rc:%d)\n", rc2);
return rc;
}
/* NB start and end are used for punch, but not truncate */
-static int filter_truncate(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *lsm, obd_off start,
- obd_off end, struct obd_trans_info *oti)
+static int filter_truncate(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
+ struct ptlrpc_request_set *rqset)
{
int rc;
ENTRY;
- if (end != OBD_OBJECT_EOF) {
+ if (oinfo->oi_policy.l_extent.end != OBD_OBJECT_EOF) {
CERROR("PUNCH not supported, only truncate: end = "LPX64"\n",
- end);
+ oinfo->oi_policy.l_extent.end);
RETURN(-EFAULT);
}
CDEBUG(D_INODE, "calling truncate for object "LPU64", valid = "LPX64
- ", o_size = "LPD64"\n", oa->o_id, oa->o_valid, start);
-
- oa->o_size = start;
- rc = filter_setattr(exp, oa, NULL, oti);
+ ", o_size = "LPD64"\n", oinfo->oi_oa->o_id,
+ oinfo->oi_oa->o_valid, oinfo->oi_policy.l_extent.start);
+
+ oinfo->oi_oa->o_size = oinfo->oi_policy.l_extent.start;
+ rc = filter_setattr(exp, oinfo, oti);
RETURN(rc);
}
/* setup llog imports */
ctxt = llog_get_context(obd, LLOG_MDS_OST_REPL_CTXT);
rc = llog_receptor_accept(ctxt, exp->exp_imp_reverse);
-
+
lquota_setinfo(quota_interface, exp, obd);
RETURN(rc);
out:
if (quota_interface)
PORTAL_SYMBOL_PUT(filter_quota_interface);
-
+
OBD_FREE(obdfilter_created_scratchpad,
OBDFILTER_CREATED_SCRATCHPAD_ENTRIES *
sizeof(*obdfilter_created_scratchpad));
- }
+ }
return rc;
}
struct obd_export *);
int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry,
struct obdo *oa, struct obd_trans_info *oti);
-int filter_setattr(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *md, struct obd_trans_info *oti);
+int filter_setattr(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti);
struct dentry *filter_create_object(struct obd_device *obd, struct obdo *oa);
int filter_commitrw(int cmd, struct obd_export *, struct obdo *, int objcount,
struct obd_ioobj *, int niocount, struct niobuf_local *,
struct obd_trans_info *, int rc);
-int filter_brw(int cmd, struct obd_export *, struct obdo *,
- struct lov_stripe_md *, obd_count oa_bufs, struct brw_page *,
- struct obd_trans_info *);
+int filter_brw(int cmd, struct obd_export *, struct obd_info *oinfo,
+ obd_count oa_bufs, struct brw_page *pga, struct obd_trans_info *);
void flip_into_page_cache(struct inode *inode, struct page *new_page);
/* filter_io_*.c */
inode = dentry->d_inode;
obdo_to_inode(inode, oa, OBD_MD_FLATIME);
- fsfilt_check_slow(now, obd_timeout, "preprw_read setup");
+ fsfilt_check_slow(obd, now, obd_timeout, "preprw_read setup");
for (i = 0, lnb = res, rnb = nb; i < obj->ioo_bufcnt;
i++, rnb++, lnb++) {
filter_iobuf_add_page(obd, iobuf, inode, lnb->page);
}
- fsfilt_check_slow(now, obd_timeout, "start_page_read");
+ fsfilt_check_slow(obd, now, obd_timeout, "start_page_read");
rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf,
exp, NULL, NULL, NULL);
fso.fso_dentry = dentry;
fso.fso_bufcnt = obj->ioo_bufcnt;
- fsfilt_check_slow(now, obd_timeout, "preprw_write setup");
+ fsfilt_check_slow(exp->exp_obd, now, obd_timeout, "preprw_write setup");
spin_lock(&exp->exp_obd->obd_osfs_lock);
if (oa) {
rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf, exp,
NULL, NULL, NULL);
- fsfilt_check_slow(now, obd_timeout, "start_page_write");
+ fsfilt_check_slow(exp->exp_obd, now, obd_timeout, "start_page_write");
lprocfs_counter_add(exp->exp_obd->obd_stats, LPROC_FILTER_WRITE_BYTES,
tot_bytes);
return -EPROTO;
}
-int filter_brw(int cmd, struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *lsm, obd_count oa_bufs,
- struct brw_page *pga, struct obd_trans_info *oti)
+int filter_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo,
+ obd_count oa_bufs, struct brw_page *pga,
+ struct obd_trans_info *oti)
{
struct obd_ioobj ioo;
struct niobuf_local *lnb;
rnb[i].len = pga[i].count;
}
- obdo_to_ioobj(oa, &ioo);
+ obdo_to_ioobj(oinfo->oi_oa, &ioo);
ioo.ioo_bufcnt = oa_bufs;
- ret = filter_preprw(cmd, exp, oa, 1, &ioo, oa_bufs, rnb, lnb, oti);
+ ret = filter_preprw(cmd, exp, oinfo->oi_oa, 1, &ioo,
+ oa_bufs, rnb, lnb, oti);
if (ret != 0)
GOTO(out, ret);
- ret = filter_commitrw(cmd, exp, oa, 1, &ioo, oa_bufs, lnb, oti, ret);
+ ret = filter_commitrw(cmd, exp, oinfo->oi_oa, 1, &ioo,
+ oa_bufs, lnb, oti, ret);
out:
if (lnb)
GOTO(cleanup, rc);
}
- fsfilt_check_slow(now, obd_timeout, "brw_start");
+ fsfilt_check_slow(obd, now, obd_timeout, "brw_start");
i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME;
if (rc == 0)
obdo_from_inode(oa, inode, FILTER_VALID_FLAGS);
- fsfilt_check_slow(now, obd_timeout, "direct_io");
+ fsfilt_check_slow(obd, now, obd_timeout, "direct_io");
err = fsfilt_commit_wait(obd, inode, wait_handle);
if (err) {
LASSERTF(oti->oti_transno <= obd->obd_last_committed,
"oti_transno "LPU64" last_committed "LPU64"\n",
oti->oti_transno, obd->obd_last_committed);
- fsfilt_check_slow(now, obd_timeout, "commitrw commit");
+ fsfilt_check_slow(obd, now, obd_timeout, "commitrw commit");
cleanup:
filter_grant_commit(exp, niocount, res);
cleanup_phase = 2;
LOCK_INODE_MUTEX(inode);
- fsfilt_check_slow(now, obd_timeout, "i_mutex");
+ fsfilt_check_slow(obd, now, obd_timeout, "i_mutex");
oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, res,
oti);
if (IS_ERR(oti->oti_handle)) {
}
/* have to call fsfilt_commit() from this point on */
- fsfilt_check_slow(now, obd_timeout, "brw_start");
+ fsfilt_check_slow(obd, now, obd_timeout, "brw_start");
i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME;
iattr.ia_mode &= ~S_ISGID;
rc = filter_update_fidea(exp, inode, oti->oti_handle, oa);
+
+ /* To avoid problems with quotas, UID and GID must be set
+ * in the inode before filter_direct_io() - see bug 10357. */
+ if (iattr.ia_valid & ATTR_UID)
+ inode->i_uid = iattr.ia_uid;
+ if (iattr.ia_valid & ATTR_GID)
+ inode->i_gid = iattr.ia_gid;
}
/* filter_direct_io drops i_mutex */
lquota_getflag(quota_interface, obd, oa);
- fsfilt_check_slow(now, obd_timeout, "direct_io");
+ fsfilt_check_slow(obd, now, obd_timeout, "direct_io");
err = fsfilt_commit_wait(obd, inode, wait_handle);
if (err) {
"oti_transno "LPU64" last_committed "LPU64"\n",
oti->oti_transno, obd->obd_last_committed);
- fsfilt_check_slow(now, obd_timeout, "commitrw commit");
+ fsfilt_check_slow(obd, now, obd_timeout, "commitrw commit");
cleanup:
filter_grant_commit(exp, niocount, res);
struct obd_device *obd = ctxt->loc_obd;
struct obd_export *exp = obd->obd_self_export;
struct llog_setattr_rec *lsr;
- struct obdo *oa;
+ struct obd_info oinfo = { { { 0 } } };
obd_id oid;
int rc = 0;
ENTRY;
lsr = (struct llog_setattr_rec *)rec;
- oa = obdo_alloc();
-
- oa->o_valid |= (OBD_MD_FLID | OBD_MD_FLUID | OBD_MD_FLGID |
- OBD_MD_FLCOOKIE);
- oa->o_id = lsr->lsr_oid;
- oa->o_gr = lsr->lsr_ogen;
- oa->o_uid = lsr->lsr_uid;
- oa->o_gid = lsr->lsr_gid;
- memcpy(obdo_logcookie(oa), cookie, sizeof(*cookie));
- oid = oa->o_id;
-
- rc = filter_setattr(exp, oa, NULL, NULL);
- obdo_free(oa);
+ oinfo.oi_oa = obdo_alloc();
+
+ oinfo.oi_oa->o_valid |= (OBD_MD_FLID | OBD_MD_FLUID | OBD_MD_FLGID |
+ OBD_MD_FLCOOKIE);
+ oinfo.oi_oa->o_id = lsr->lsr_oid;
+ oinfo.oi_oa->o_gr = lsr->lsr_ogen;
+ oinfo.oi_oa->o_uid = lsr->lsr_uid;
+ oinfo.oi_oa->o_gid = lsr->lsr_gid;
+ memcpy(obdo_logcookie(oinfo.oi_oa), cookie, sizeof(*cookie));
+ oid = oinfo.oi_oa->o_id;
+
+ rc = filter_setattr(exp, &oinfo, NULL);
+ obdo_free(oinfo.oi_oa);
if (rc == -ENOENT) {
CDEBUG(D_HA, "object already removed, send cookie\n");
{
struct obd_device *dev = data;
struct client_obd *cli = &dev->u.cli;
- unsigned val;
+ long val;
+ int mult;
client_obd_list_lock(&cli->cl_loi_list_lock);
- val = cli->cl_dirty_max >> 20;
+ val = cli->cl_dirty_max;
 client_obd_list_unlock(&cli->cl_loi_list_lock);
- return snprintf(page, count, "%u\n", val);
+ mult = 1 << 20;
+ return lprocfs_read_frac_helper(page, count, val, mult);
}
static int osc_wr_max_dirty_mb(struct file *file, const char *buffer,
{
struct obd_device *dev = data;
struct client_obd *cli = &dev->u.cli;
- int val, rc;
+ int pages_number, mult, rc;
- rc = lprocfs_write_helper(buffer, count, &val);
+ mult = 1 << (20 - PAGE_SHIFT);
+ rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
if (rc)
return rc;
- if (val < 0 || val > OSC_MAX_DIRTY_MB_MAX ||
- val > num_physpages >> (20 - PAGE_SHIFT - 2)) /* 1/4 of RAM */
+ if (pages_number < 0 || pages_number > OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_SHIFT) ||
+ pages_number > num_physpages / 4) /* 1/4 of RAM */
return -ERANGE;
client_obd_list_lock(&cli->cl_loi_list_lock);
- cli->cl_dirty_max = (obd_count)val * 1024 * 1024;
+ cli->cl_dirty_max = (obd_count)(pages_number << PAGE_SHIFT);
osc_wake_cache_waiters(cli);
client_obd_list_unlock(&cli->cl_loi_list_lock);
}
static int osc_getattr_interpret(struct ptlrpc_request *req,
- struct osc_getattr_async_args *aa, int rc)
+ struct osc_async_args *aa, int rc)
{
struct ost_body *body;
ENTRY;
if (rc != 0)
- RETURN(rc);
+ GOTO(out, rc);
body = lustre_swab_repbuf(req, REPLY_REC_OFF, sizeof(*body),
lustre_swab_ost_body);
if (body) {
CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
- memcpy(aa->aa_oa, &body->oa, sizeof(*aa->aa_oa));
+ memcpy(aa->aa_oi->oi_oa, &body->oa, sizeof(*aa->aa_oi->oi_oa));
/* This should really be sent by the OST */
- aa->aa_oa->o_blksize = PTLRPC_MAX_BRW_SIZE;
- aa->aa_oa->o_valid |= OBD_MD_FLBLKSZ;
+ aa->aa_oi->oi_oa->o_blksize = PTLRPC_MAX_BRW_SIZE;
+ aa->aa_oi->oi_oa->o_valid |= OBD_MD_FLBLKSZ;
} else {
CERROR("can't unpack ost_body\n");
rc = -EPROTO;
- aa->aa_oa->o_valid = 0;
+ aa->aa_oi->oi_oa->o_valid = 0;
}
-
+out:
+ rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
RETURN(rc);
}
-static int osc_getattr_async(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *md,
+static int osc_getattr_async(struct obd_export *exp, struct obd_info *oinfo,
struct ptlrpc_request_set *set)
{
struct ptlrpc_request *req;
struct ost_body *body;
int size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
- struct osc_getattr_async_args *aa;
+ struct osc_async_args *aa;
ENTRY;
req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
RETURN(-ENOMEM);
body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body));
- memcpy(&body->oa, oa, sizeof(*oa));
+ memcpy(&body->oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa));
ptlrpc_req_set_repsize(req, 2, size);
req->rq_interpret_reply = osc_getattr_interpret;
LASSERT (sizeof (*aa) <= sizeof (req->rq_async_args));
- aa = (struct osc_getattr_async_args *)&req->rq_async_args;
- aa->aa_oa = oa;
+ aa = (struct osc_async_args *)&req->rq_async_args;
+ aa->aa_oi = oinfo;
ptlrpc_set_add_req(set, req);
RETURN (0);
}
-static int osc_getattr(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *md)
+static int osc_getattr(struct obd_export *exp, struct obd_info *oinfo)
{
struct ptlrpc_request *req;
struct ost_body *body;
RETURN(-ENOMEM);
body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body));
- memcpy(&body->oa, oa, sizeof(*oa));
+ memcpy(&body->oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa));
ptlrpc_req_set_repsize(req, 2, size);
}
CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
- memcpy(oa, &body->oa, sizeof(*oa));
+ memcpy(oinfo->oi_oa, &body->oa, sizeof(*oinfo->oi_oa));
/* This should really be sent by the OST */
- oa->o_blksize = PTLRPC_MAX_BRW_SIZE;
- oa->o_valid |= OBD_MD_FLBLKSZ;
+ oinfo->oi_oa->o_blksize = PTLRPC_MAX_BRW_SIZE;
+ oinfo->oi_oa->o_valid |= OBD_MD_FLBLKSZ;
EXIT;
out:
return rc;
}
-static int osc_setattr(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *md, struct obd_trans_info *oti)
+static int osc_setattr(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti)
{
struct ptlrpc_request *req;
struct ost_body *body;
RETURN(-ENOMEM);
body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body));
- memcpy(&body->oa, oa, sizeof(*oa));
+ memcpy(&body->oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa));
ptlrpc_req_set_repsize(req, 2, size);
if (body == NULL)
GOTO(out, rc = -EPROTO);
- memcpy(oa, &body->oa, sizeof(*oa));
+ memcpy(oinfo->oi_oa, &body->oa, sizeof(*oinfo->oi_oa));
EXIT;
out:
ptlrpc_req_finished(req);
- RETURN(0);
+ RETURN(rc);
}
-static int osc_setattr_async(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *md,
- struct obd_trans_info *oti)
+static int osc_setattr_interpret(struct ptlrpc_request *req,
+ struct osc_async_args *aa, int rc)
{
- struct ptlrpc_request *req;
struct ost_body *body;
- int rc = 0, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
ENTRY;
- LASSERT(oti);
+ if (rc != 0)
+ GOTO(out, rc);
+
+ body = lustre_swab_repbuf(req, REPLY_REC_OFF, sizeof(*body),
+ lustre_swab_ost_body);
+ if (body == NULL) {
+ CERROR("can't unpack ost_body\n");
+ GOTO(out, rc = -EPROTO);
+ }
+
+ memcpy(aa->aa_oi->oi_oa, &body->oa, sizeof(*aa->aa_oi->oi_oa));
+out:
+ rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
+ RETURN(rc);
+}
+
+static int osc_setattr_async(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
+ struct ptlrpc_request_set *rqset)
+{
+ struct ptlrpc_request *req;
+ struct ost_body *body;
+ int size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+ struct osc_async_args *aa;
+ ENTRY;
req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
OST_SETATTR, 2, size, NULL);
body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body));
- if (oa->o_valid & OBD_MD_FLCOOKIE)
- memcpy(obdo_logcookie(oa), oti->oti_logcookies,
+ if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) {
+ LASSERT(oti);
+ memcpy(obdo_logcookie(oinfo->oi_oa), oti->oti_logcookies,
sizeof(*oti->oti_logcookies));
+ }
- memcpy(&body->oa, oa, sizeof(*oa));
+ memcpy(&body->oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa));
ptlrpc_req_set_repsize(req, 2, size);
/* do mds to ost setattr asynchronouly */
- ptlrpcd_add_req(req);
+ if (!rqset) {
+ /* Do not wait for response. */
+ ptlrpcd_add_req(req);
+ } else {
+ req->rq_interpret_reply = osc_setattr_interpret;
- RETURN(rc);
+ LASSERT (sizeof (*aa) <= sizeof (req->rq_async_args));
+ aa = (struct osc_async_args *)&req->rq_async_args;
+ aa->aa_oi = oinfo;
+
+ ptlrpc_set_add_req(rqset, req);
+ }
+
+ RETURN(0);
}
int osc_real_create(struct obd_export *exp, struct obdo *oa,
return rc;
}
-static int osc_punch(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *md, obd_size start,
- obd_size end, struct obd_trans_info *oti)
+static int osc_punch_interpret(struct ptlrpc_request *req,
+ struct osc_async_args *aa, int rc)
+{
+ struct ost_body *body;
+ ENTRY;
+
+ if (rc != 0)
+ GOTO(out, rc);
+
+ body = lustre_swab_repbuf(req, REPLY_REC_OFF, sizeof (*body),
+ lustre_swab_ost_body);
+ if (body == NULL) {
+ CERROR ("can't unpack ost_body\n");
+ GOTO(out, rc = -EPROTO);
+ }
+
+ memcpy(aa->aa_oi->oi_oa, &body->oa, sizeof(*aa->aa_oi->oi_oa));
+out:
+ rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
+ RETURN(rc);
+}
+
+static int osc_punch(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
+ struct ptlrpc_request_set *rqset)
{
struct ptlrpc_request *req;
+ struct osc_async_args *aa;
struct ost_body *body;
- int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+ int size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
ENTRY;
- if (!oa) {
+ if (!oinfo->oi_oa) {
CERROR("oa NULL\n");
RETURN(-EINVAL);
}
RETURN(-ENOMEM);
body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body));
- memcpy(&body->oa, oa, sizeof(*oa));
+ memcpy(&body->oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa));
/* overload the size and blocks fields in the oa with start/end */
- body->oa.o_size = start;
- body->oa.o_blocks = end;
+ body->oa.o_size = oinfo->oi_policy.l_extent.start;
+ body->oa.o_blocks = oinfo->oi_policy.l_extent.end;
body->oa.o_valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
ptlrpc_req_set_repsize(req, 2, size);
- rc = ptlrpc_queue_wait(req);
- if (rc)
- GOTO(out, rc);
-
- body = lustre_swab_repbuf(req, REPLY_REC_OFF, sizeof(*body),
- lustre_swab_ost_body);
- if (body == NULL) {
- CERROR ("can't unpack ost_body\n");
- GOTO (out, rc = -EPROTO);
- }
-
- memcpy(oa, &body->oa, sizeof(*oa));
+ req->rq_interpret_reply = osc_punch_interpret;
+ LASSERT (sizeof (*aa) <= sizeof (req->rq_async_args));
+ aa = (struct osc_async_args *)&req->rq_async_args;
+ aa->aa_oi = oinfo;
+ ptlrpc_set_add_req(rqset, req);
- EXIT;
- out:
- ptlrpc_req_finished(req);
- return rc;
+ RETURN(0);
}
static int osc_sync(struct obd_export *exp, struct obdo *oa,
return rc;
}
+/* Destroy requests can be async always on the client, and we don't even really
+ * care about the return code since the client cannot do anything at all about
+ * a destroy failure.
+ * When the MDS is unlinking a filename, it saves the file objects into a
+ * recovery llog, and these object records are cancelled when the OST reports
+ * they were destroyed and sync'd to disk (i.e. transaction committed).
+ * If the client dies, or the OST is down when the object should be destroyed,
+ * the records are not cancelled, and when the OST reconnects to the MDS next,
+ * it will retrieve the llog unlink logs and then sends the log cancellation
+ * cookies to the MDS after committing destroy transactions. */
static int osc_destroy(struct obd_export *exp, struct obdo *oa,
struct lov_stripe_md *ea, struct obd_trans_info *oti,
struct obd_export *md_export)
{
struct ptlrpc_request *req;
struct ost_body *body;
- int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+ int size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
ENTRY;
if (!oa) {
if (oti != NULL && oa->o_valid & OBD_MD_FLCOOKIE) {
memcpy(obdo_logcookie(oa), oti->oti_logcookies,
sizeof(*oti->oti_logcookies));
- oti->oti_logcookies++;
}
memcpy(&body->oa, oa, sizeof(*oa));
ptlrpc_req_set_repsize(req, 2, size);
- rc = ptlrpc_queue_wait(req);
- if (rc == -ENOENT)
- rc = 0;
- if (rc)
- GOTO(out, rc);
-
- body = lustre_swab_repbuf(req, REPLY_REC_OFF, sizeof(*body),
- lustre_swab_ost_body);
- if (body == NULL) {
- CERROR ("Can't unpack body\n");
- GOTO (out, rc = -EPROTO);
- }
-
- memcpy(oa, &body->oa, sizeof(*oa));
-
- EXIT;
- out:
- ptlrpc_req_finished(req);
- return rc;
+ ptlrpcd_add_req(req);
+ RETURN(0);
}
static void osc_announce_cached(struct client_obd *cli, struct obdo *oa,
int rc;
ENTRY;
+ /* Consume write credits even if doing a sync write -
+ * otherwise we may run out of space on OST due to grant. */
+ spin_lock(&exp->exp_obd->u.cli.cl_loi_list_lock);
+ for (nio_count = 0; nio_count < page_count; nio_count++) {
+ if (exp->exp_obd->u.cli.cl_avail_grant >= PAGE_SIZE) {
+ exp->exp_obd->u.cli.cl_avail_grant -= PAGE_SIZE;
+ pga[nio_count]->flag |= OBD_BRW_FROM_GRANT;
+ }
+ }
+ spin_unlock(&exp->exp_obd->u.cli.cl_loi_list_lock);
+
rc = osc_brw_prep_request(cmd, class_exp2cliimp(exp), oa, lsm,
page_count, pga, &requested_nob, &nio_count,
&req);
OBD_FREE(ppga, sizeof(*ppga) * count);
}
-static int osc_brw(int cmd, struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *md, obd_count page_count,
- struct brw_page *pga, struct obd_trans_info *oti)
+static int osc_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo,
+ obd_count page_count, struct brw_page *pga,
+ struct obd_trans_info *oti)
{
struct obdo *saved_oa = NULL;
struct brw_page **ppga, **orig;
RETURN(-ENOMEM);
page_count_orig = page_count;
+ sort_brw_pages(ppga, page_count);
while (page_count) {
obd_count pages_per_brw;
else
pages_per_brw = page_count;
- sort_brw_pages(ppga, pages_per_brw);
pages_per_brw = max_unfragmented_pages(ppga, pages_per_brw);
if (saved_oa != NULL) {
/* restore previously saved oa */
- *oa = *saved_oa;
+ *oinfo->oi_oa = *saved_oa;
} else if (page_count > pages_per_brw) {
/* save a copy of oa (brw will clobber it) */
saved_oa = obdo_alloc();
if (saved_oa == NULL)
GOTO(out, rc = -ENOMEM);
- *saved_oa = *oa;
+ *saved_oa = *oinfo->oi_oa;
}
- rc = osc_brw_internal(cmd, exp, oa, md, pages_per_brw, ppga);
+ rc = osc_brw_internal(cmd, exp, oinfo->oi_oa, oinfo->oi_md,
+ pages_per_brw, ppga);
if (rc != 0)
break;
RETURN(rc);
}
-static int osc_brw_async(int cmd, struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *md, obd_count page_count,
- struct brw_page *pga, struct ptlrpc_request_set *set,
- struct obd_trans_info *oti)
+static int osc_brw_async(int cmd, struct obd_export *exp,
+ struct obd_info *oinfo, obd_count page_count,
+ struct brw_page *pga, struct obd_trans_info *oti,
+ struct ptlrpc_request_set *set)
{
struct brw_page **ppga, **orig;
int page_count_orig;
RETURN(-ENOMEM);
page_count_orig = page_count;
+ sort_brw_pages(ppga, page_count);
while (page_count) {
obd_count pages_per_brw;
else
pages_per_brw = page_count;
- sort_brw_pages(ppga, pages_per_brw);
pages_per_brw = max_unfragmented_pages(ppga, pages_per_brw);
- rc = async_internal(cmd, exp, oa, md, pages_per_brw, ppga, set);
+ rc = async_internal(cmd, exp, oinfo->oi_oa, oinfo->oi_md,
+ pages_per_brw, ppga, set);
if (rc != 0)
break;
/* Note: caller will lock/unlock, and set uptodate on the pages */
#if defined(__KERNEL__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-static int sanosc_brw_read(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *lsm, obd_count page_count,
- struct brw_page *pga)
+static int sanosc_brw_read(struct obd_export *exp, struct obd_info *oinfo,
+ obd_count page_count, struct brw_page *pga)
{
struct ptlrpc_request *req = NULL;
struct ost_body *body;
nioptr = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 2,
sizeof(*nioptr) * page_count);
- memcpy(&body->oa, oa, sizeof(body->oa));
+ memcpy(&body->oa, oinfo->oi_oa, sizeof(body->oa));
- obdo_to_ioobj(oa, iooptr);
+ obdo_to_ioobj(oinfo->oi_oa, iooptr);
iooptr->ioo_bufcnt = page_count;
for (mapped = 0; mapped < page_count; mapped++, nioptr++) {
GOTO(out_req, rc = -EPROTO);
}
- memcpy(oa, &body->oa, sizeof(*oa));
+ memcpy(oinfo->oi_oa, &body->oa, sizeof(*oinfo->oi_oa));
swab = lustre_msg_swabbed(req->rq_repmsg);
LASSERT_REPSWAB(req, REPLY_REC_OFF + 1);
RETURN(rc);
}
-static int sanosc_brw_write(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *lsm, obd_count page_count,
- struct brw_page *pga)
+static int sanosc_brw_write(struct obd_export *exp, struct obd_info *oinfo,
+ obd_count page_count, struct brw_page *pga)
{
struct ptlrpc_request *req = NULL;
struct ost_body *body;
nioptr = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 2,
sizeof(*nioptr) * page_count);
- memcpy(&body->oa, oa, sizeof(body->oa));
+ memcpy(&body->oa, oinfo->oi_oa, sizeof(body->oa));
- obdo_to_ioobj(oa, iooptr);
+ obdo_to_ioobj(oinfo->oi_oa, iooptr);
iooptr->ioo_bufcnt = page_count;
/* pack request */
RETURN(rc);
}
-static int sanosc_brw(int cmd, struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *lsm, obd_count page_count,
- struct brw_page *pga, struct obd_trans_info *oti)
+static int sanosc_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo,
+ obd_count page_count, struct brw_page *pga,
+ struct obd_trans_info *oti)
{
ENTRY;
pages_per_brw = page_count;
if (cmd & OBD_BRW_WRITE)
- rc = sanosc_brw_write(exp, oa, lsm, pages_per_brw,pga);
+ rc = sanosc_brw_write(exp, oinfo, pages_per_brw, pga);
else
- rc = sanosc_brw_read(exp, oa, lsm, pages_per_brw, pga);
+ rc = sanosc_brw_read(exp, oinfo, pages_per_brw, pga);
if (rc != 0)
RETURN(rc);
return 0;
}
-static int osc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm,
- __u32 type, ldlm_policy_data_t *policy, __u32 mode,
- int *flags, void *bl_cb, void *cp_cb, void *gl_cb,
- void *data, __u32 lvb_len, void *lvb_swabber,
- struct lustre_handle *lockh)
+static int osc_enqueue_fini(struct ptlrpc_request *req, struct obd_info *oinfo,
+ int intent, int rc)
{
- struct ldlm_res_id res_id = { .name = {lsm->lsm_object_id} };
+ ENTRY;
+
+ if (intent) {
+ /* The request was created before ldlm_cli_enqueue call. */
+ if (rc == ELDLM_LOCK_ABORTED) {
+ struct ldlm_reply *rep;
+
+ /* swabbed by ldlm_cli_enqueue() */
+ LASSERT_REPSWABBED(req, DLM_LOCKREPLY_OFF);
+ rep = lustre_msg_buf(req->rq_repmsg, DLM_LOCKREPLY_OFF,
+ sizeof(*rep));
+ LASSERT(rep != NULL);
+ if (rep->lock_policy_res1)
+ rc = rep->lock_policy_res1;
+ }
+ }
+
+ if ((intent && rc == ELDLM_LOCK_ABORTED) || !rc) {
+ CDEBUG(D_INODE,"got kms "LPU64" blocks "LPU64" mtime "LPU64"\n",
+ oinfo->oi_md->lsm_oinfo->loi_lvb.lvb_size,
+ oinfo->oi_md->lsm_oinfo->loi_lvb.lvb_blocks,
+ oinfo->oi_md->lsm_oinfo->loi_lvb.lvb_mtime);
+ }
+
+ /* Call the update callback. */
+ rc = oinfo->oi_cb_up(oinfo, rc);
+ RETURN(rc);
+}
+
+static int osc_enqueue_interpret(struct ptlrpc_request *req,
+ struct osc_enqueue_args *aa, int rc)
+{
+ int intent = aa->oa_ei->ei_flags & LDLM_FL_HAS_INTENT;
+ struct lov_stripe_md *lsm = aa->oa_oi->oi_md;
+ struct ldlm_lock *lock;
+
+ /* ldlm_cli_enqueue is holding a reference on the lock, so it must
+ * be valid. */
+ lock = ldlm_handle2lock(aa->oa_oi->oi_lockh);
+
+ /* Complete obtaining the lock procedure. */
+ rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_ei->ei_type, 1,
+ aa->oa_ei->ei_mode,
+ &aa->oa_ei->ei_flags,
+ &lsm->lsm_oinfo->loi_lvb,
+ sizeof(lsm->lsm_oinfo->loi_lvb),
+ lustre_swab_ost_lvb,
+ aa->oa_oi->oi_lockh, rc);
+
+ /* Complete osc stuff. */
+ rc = osc_enqueue_fini(req, aa->oa_oi, intent, rc);
+
+ /* Release the lock for async request. */
+ if (lustre_handle_is_used(aa->oa_oi->oi_lockh) && rc == ELDLM_OK)
+ ldlm_lock_decref(aa->oa_oi->oi_lockh, aa->oa_ei->ei_mode);
+
+ LASSERTF(lock != NULL, "lockh %p, req %p, aa %p - client evicted?\n",
+ aa->oa_oi->oi_lockh, req, aa);
+ LDLM_LOCK_PUT(lock);
+ return rc;
+}
+
+/* When enqueuing asynchronously, locks are not ordered, we can obtain a lock
+ * from the 2nd OSC before a lock from the 1st one. This does not deadlock with
+ * other synchronous requests, however keeping some locks and trying to obtain
+ * others may take a considerable amount of time in a case of ost failure; and
+ * when other sync requests do not get released lock from a client, the client
+ * is excluded from the cluster -- such scenarious make the life difficult, so
+ * release locks just after they are obtained. */
+static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_enqueue_info *einfo)
+{
+ struct ldlm_res_id res_id = { .name = {oinfo->oi_md->lsm_object_id} };
struct obd_device *obd = exp->exp_obd;
- struct ost_lvb lvb;
struct ldlm_reply *rep;
struct ptlrpc_request *req = NULL;
+ int intent = einfo->ei_flags & LDLM_FL_HAS_INTENT;
int rc;
ENTRY;
/* Filesystem lock extents are extended to page boundaries so that
* dealing with the page cache is a little smoother. */
- policy->l_extent.start -= policy->l_extent.start & ~CFS_PAGE_MASK;
- policy->l_extent.end |= ~CFS_PAGE_MASK;
+ oinfo->oi_policy.l_extent.start -=
+ oinfo->oi_policy.l_extent.start & ~CFS_PAGE_MASK;
+ oinfo->oi_policy.l_extent.end |= ~CFS_PAGE_MASK;
- if (lsm->lsm_oinfo->loi_kms_valid == 0)
+ if (oinfo->oi_md->lsm_oinfo->loi_kms_valid == 0)
goto no_match;
/* Next, search for already existing extent locks that will cover us */
- rc = ldlm_lock_match(obd->obd_namespace, *flags, &res_id, type, policy,
- mode, lockh);
+ rc = ldlm_lock_match(obd->obd_namespace, einfo->ei_flags, &res_id,
+ einfo->ei_type, &oinfo->oi_policy, einfo->ei_mode,
+ oinfo->oi_lockh);
if (rc == 1) {
- osc_set_data_with_check(lockh, data, *flags);
- if (*flags & LDLM_FL_HAS_INTENT) {
+ osc_set_data_with_check(oinfo->oi_lockh, einfo->ei_cbdata,
+ einfo->ei_flags);
+ if (intent) {
/* I would like to be able to ASSERT here that rss <=
* kms, but I can't, for reasons which are explained in
* lov_enqueue() */
}
+
+ /* For async requests, decref the lock. */
+ if (einfo->ei_rqset)
+ ldlm_lock_decref(oinfo->oi_lockh, einfo->ei_mode);
+
/* We already have a lock, and it's referenced */
+ oinfo->oi_cb_up(oinfo, ELDLM_OK);
RETURN(ELDLM_OK);
}
* send us a blocking callback, but there are problems with canceling
* locks out from other users right now, too. */
- if (mode == LCK_PR) {
- rc = ldlm_lock_match(obd->obd_namespace, *flags, &res_id, type,
- policy, LCK_PW, lockh);
+ if (einfo->ei_mode == LCK_PR) {
+ rc = ldlm_lock_match(obd->obd_namespace, einfo->ei_flags,
+ &res_id, einfo->ei_type, &oinfo->oi_policy,
+ LCK_PW, oinfo->oi_lockh);
if (rc == 1) {
/* FIXME: This is not incredibly elegant, but it might
* be more elegant than adding another parameter to
* lock_match. I want a second opinion. */
- ldlm_lock_addref(lockh, LCK_PR);
- ldlm_lock_decref(lockh, LCK_PW);
- osc_set_data_with_check(lockh, data, *flags);
+ /* addref the lock only if not async requests. */
+ if (!einfo->ei_rqset)
+ ldlm_lock_addref(oinfo->oi_lockh, LCK_PR);
+ osc_set_data_with_check(oinfo->oi_lockh,
+ einfo->ei_cbdata,
+ einfo->ei_flags);
+ ldlm_lock_decref(oinfo->oi_lockh, LCK_PW);
+ oinfo->oi_cb_up(oinfo, ELDLM_OK);
RETURN(ELDLM_OK);
}
}
no_match:
- if (*flags & LDLM_FL_HAS_INTENT) {
+ if (intent) {
int size[3] = {
[MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
[DLM_LOCKREQ_OFF] = sizeof(struct ldlm_request) };
RETURN(-ENOMEM);
size[DLM_LOCKREPLY_OFF] = sizeof(*rep);
- size[DLM_REPLY_REC_OFF] = sizeof(lvb);
+ size[DLM_REPLY_REC_OFF] =
+ sizeof(oinfo->oi_md->lsm_oinfo->loi_lvb);
ptlrpc_req_set_repsize(req, 3, size);
}
/* users of osc_enqueue() can pass this flag for ldlm_lock_match() */
- *flags &= ~LDLM_FL_BLOCK_GRANTED;
-
- rc = ldlm_cli_enqueue(exp, req, obd->obd_namespace, res_id, type,
- policy, mode, flags, bl_cb, cp_cb, gl_cb, data,
- &lvb, sizeof(lvb), lustre_swab_ost_lvb, lockh);
-
- if (req != NULL) {
- if (rc == ELDLM_LOCK_ABORTED) {
- /* swabbed by ldlm_cli_enqueue() */
- LASSERT_REPSWABBED(req, DLM_LOCKREPLY_OFF);
- rep = lustre_msg_buf(req->rq_repmsg, DLM_LOCKREPLY_OFF,
- sizeof(*rep));
- LASSERT(rep != NULL);
- if (rep->lock_policy_res1)
- rc = rep->lock_policy_res1;
+ einfo->ei_flags &= ~LDLM_FL_BLOCK_GRANTED;
+
+ rc = ldlm_cli_enqueue(exp, &req, res_id, einfo->ei_type,
+ &oinfo->oi_policy, einfo->ei_mode,
+ &einfo->ei_flags, einfo->ei_cb_bl,
+ einfo->ei_cb_cp, einfo->ei_cb_gl,
+ einfo->ei_cbdata,
+ &oinfo->oi_md->lsm_oinfo->loi_lvb,
+ sizeof(oinfo->oi_md->lsm_oinfo->loi_lvb),
+ lustre_swab_ost_lvb, oinfo->oi_lockh,
+ einfo->ei_rqset ? 1 : 0);
+ if (einfo->ei_rqset) {
+ if (!rc) {
+ struct osc_enqueue_args *aa;
+ LASSERT (sizeof(*aa) <= sizeof(req->rq_async_args));
+ aa = (struct osc_enqueue_args *)&req->rq_async_args;
+ aa->oa_oi = oinfo;
+ aa->oa_ei = einfo;
+ aa->oa_exp = exp;
+
+ req->rq_interpret_reply = osc_enqueue_interpret;
+ ptlrpc_set_add_req(einfo->ei_rqset, req);
+ } else if (intent) {
+ ptlrpc_req_finished(req);
}
- ptlrpc_req_finished(req);
+ RETURN(rc);
}
- if ((*flags & LDLM_FL_HAS_INTENT && rc == ELDLM_LOCK_ABORTED) || !rc) {
- CDEBUG(D_INODE,"got kms "LPU64" blocks "LPU64" mtime "LPU64"\n",
- lvb.lvb_size, lvb.lvb_blocks, lvb.lvb_mtime);
- lsm->lsm_oinfo->loi_lvb = lvb;
- }
+ rc = osc_enqueue_fini(req, oinfo, intent, rc);
+ if (intent)
+ ptlrpc_req_finished(req);
RETURN(rc);
}
return ldlm_cli_join_lru(obd->obd_namespace, &res_id, join);
}
+static int osc_statfs_interpret(struct ptlrpc_request *req,
+ struct osc_async_args *aa, int rc)
+{
+ struct obd_statfs *msfs;
+ ENTRY;
+
+ if (rc != 0)
+ GOTO(out, rc);
+
+ msfs = lustre_swab_repbuf(req, REPLY_REC_OFF, sizeof(*msfs),
+ lustre_swab_obd_statfs);
+ if (msfs == NULL) {
+ CERROR("Can't unpack obd_statfs\n");
+ GOTO(out, rc = -EPROTO);
+ }
+
+ memcpy(aa->aa_oi->oi_osfs, msfs, sizeof(*msfs));
+out:
+ rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
+ RETURN(rc);
+}
+
+static int osc_statfs_async(struct obd_device *obd, struct obd_info *oinfo,
+ unsigned long max_age,
+ struct ptlrpc_request_set *rqset)
+{
+ struct ptlrpc_request *req;
+ struct osc_async_args *aa;
+ int size[2] = { sizeof(struct ptlrpc_body), sizeof(*oinfo->oi_osfs) };
+ ENTRY;
+
+ /* We could possibly pass max_age in the request (as an absolute
+ * timestamp or a "seconds.usec ago") so the target can avoid doing
+ * extra calls into the filesystem if that isn't necessary (e.g.
+ * during mount that would help a bit). Having relative timestamps
+ * is not so great if request processing is slow, while absolute
+ * timestamps are not ideal because they need time synchronization. */
+ req = ptlrpc_prep_req(obd->u.cli.cl_import, LUSTRE_OST_VERSION,
+ OST_STATFS, 1, NULL, NULL);
+ if (!req)
+ RETURN(-ENOMEM);
+
+ ptlrpc_req_set_repsize(req, 2, size);
+ req->rq_request_portal = OST_CREATE_PORTAL; //XXX FIXME bug 249
+
+ req->rq_interpret_reply = osc_statfs_interpret;
+ LASSERT (sizeof (*aa) <= sizeof (req->rq_async_args));
+ aa = (struct osc_async_args *)&req->rq_async_args;
+ aa->aa_oi = oinfo;
+
+ ptlrpc_set_add_req(rqset, req);
+ RETURN(0);
+}
+
static int osc_statfs(struct obd_device *obd, struct obd_statfs *osfs,
cfs_time_t max_age)
{
.o_reconnect = osc_reconnect,
.o_disconnect = osc_disconnect,
.o_statfs = osc_statfs,
+ .o_statfs_async = osc_statfs_async,
.o_packmd = osc_packmd,
.o_unpackmd = osc_unpackmd,
.o_create = osc_create,
.o_reconnect = osc_reconnect,
.o_disconnect = client_disconnect_export,
.o_statfs = osc_statfs,
+ .o_statfs_async = osc_statfs_async,
.o_packmd = osc_packmd,
.o_unpackmd = osc_unpackmd,
.o_create = osc_real_create,
.o_getattr = osc_getattr,
.o_getattr_async = osc_getattr_async,
.o_setattr = osc_setattr,
+ .o_setattr_async = osc_setattr_async,
.o_brw = sanosc_brw,
.o_punch = osc_punch,
.o_sync = osc_sync,
static int ost_getattr(struct obd_export *exp, struct ptlrpc_request *req)
{
struct ost_body *body, *repbody;
+ struct obd_info oinfo = { { { 0 } } };
int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
ENTRY;
repbody = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
sizeof(*repbody));
memcpy(&repbody->oa, &body->oa, sizeof(body->oa));
- req->rq_status = obd_getattr(exp, &repbody->oa, NULL);
+
+ oinfo.oi_oa = &repbody->oa;
+ req->rq_status = obd_getattr(exp, &oinfo);
RETURN(0);
}
else
policy.l_extent.end = finis | ~CFS_PAGE_MASK;
- RETURN(ldlm_cli_enqueue(NULL, NULL, exp->exp_obd->obd_namespace,
- res_id, LDLM_EXTENT, &policy, LCK_PW, &flags,
- ldlm_blocking_ast, ldlm_completion_ast,
- ldlm_glimpse_ast,
- NULL, NULL, 0, NULL, lh));
+ RETURN(ldlm_cli_enqueue_local(exp->exp_obd->obd_namespace, res_id,
+ LDLM_EXTENT, &policy, LCK_PW, &flags,
+ ldlm_blocking_ast, ldlm_completion_ast,
+ ldlm_glimpse_ast, NULL, 0, NULL, lh));
}
/*
static int ost_punch(struct obd_export *exp, struct ptlrpc_request *req,
struct obd_trans_info *oti)
{
- struct obdo *oa;
+ struct obd_info oinfo = { { { 0 } } };
struct ost_body *body, *repbody;
int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*repbody) };
struct lustre_handle lh = {0,};
if (body == NULL)
RETURN(-EFAULT);
- oa = &body->oa;
- if ((oa->o_valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS)) !=
+ oinfo.oi_oa = &body->oa;
+ oinfo.oi_policy.l_extent.start = oinfo.oi_oa->o_size;
+ oinfo.oi_policy.l_extent.end = oinfo.oi_oa->o_blocks;
+
+ if ((oinfo.oi_oa->o_valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS)) !=
(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS))
RETURN(-EINVAL);
repbody = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
sizeof(*repbody));
- repbody->oa = *oa;
- rc = ost_punch_lock_get(exp, oa, &lh);
+ repbody->oa = *oinfo.oi_oa;
+ rc = ost_punch_lock_get(exp, oinfo.oi_oa, &lh);
if (rc == 0) {
- if (oa->o_valid & OBD_MD_FLFLAGS &&
- oa->o_flags == OBD_FL_TRUNCLOCK)
+ if (oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
+ oinfo.oi_oa->o_flags == OBD_FL_TRUNCLOCK)
/*
* If OBD_FL_TRUNCLOCK is the only bit set in
* ->o_flags, clear OBD_MD_FLFLAGS to avoid falling
* through filter_setattr() to filter_iocontrol().
*/
- oa->o_valid &= ~OBD_MD_FLFLAGS;
+ oinfo.oi_oa->o_valid &= ~OBD_MD_FLFLAGS;
- req->rq_status = obd_punch(exp, oa, NULL,
- oa->o_size, oa->o_blocks, oti);
- ost_punch_lock_put(exp, oa, &lh);
+ req->rq_status = obd_punch(exp, &oinfo, oti, NULL);
+ ost_punch_lock_put(exp, oinfo.oi_oa, &lh);
}
RETURN(rc);
}
{
struct ost_body *body, *repbody;
int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*repbody) };
+ struct obd_info oinfo = { { { 0 } } };
ENTRY;
body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body),
sizeof(*repbody));
memcpy(&repbody->oa, &body->oa, sizeof(body->oa));
- req->rq_status = obd_setattr(exp, &repbody->oa, NULL, oti);
+ oinfo.oi_oa = &repbody->oa;
+ req->rq_status = obd_setattr(exp, &oinfo, oti);
RETURN(0);
}
policy.l_extent.end = (nb[nrbufs - 1].offset +
nb[nrbufs - 1].len - 1) | ~CFS_PAGE_MASK;
- RETURN(ldlm_cli_enqueue(NULL, NULL, exp->exp_obd->obd_namespace,
- res_id, LDLM_EXTENT, &policy, mode, &flags,
- ldlm_blocking_ast, ldlm_completion_ast,
- ldlm_glimpse_ast,
- NULL, NULL, 0, NULL, lh));
+ RETURN(ldlm_cli_enqueue_local(exp->exp_obd->obd_namespace, res_id,
+ LDLM_EXTENT, &policy, mode, &flags,
+ ldlm_blocking_ast, ldlm_completion_ast,
+ ldlm_glimpse_ast, NULL, 0, NULL, lh));
}
static void ost_brw_lock_put(int mode,
#!/bin/sh
-MNT=${MNT:-/mnt/lustre}
-DIR=${DIR:-$MNT/`hostname`}
+MOUNT=${MOUNT:-/mnt/lustre}
+DIR=${DIR:-$MOUNT/`hostname`}
#[ -e /proc/sys/lnet/debug ] && echo 0 > /proc/sys/lnet/debug
mkdir -p $DIR
TGT=$DIR/client.txt
#!/bin/sh
+DELAY=1
null() {
:
}
-if [ "$1" == "-q" ]; then
- echo "echo off"
+case "$1" in
+-q) echo "echo off"
ECHO="null"
- shift
-else
+ shift ;;
+[1-9]*)
+ DELAY=$1
+ ECHO=echo
+ shift ;;
+*)
echo "echo on"
ECHO=echo
-fi
-vmstat 1 | while read LINE ; do
+esac
+
+vmstat $DELAY | while read LINE ; do
LINE="`date +%s`: $LINE"
$ECHO "$LINE"
[ "$1" ] && echo "$LINE" >> $1
setup() {
echo -n "mnt.."
+ load_modules
setupall || exit 10
echo "done"
}
+
SETUP=${SETUP:-:}
log() {
done
AFTER_INODES=`num_inodes`
echo "after inodes: $AFTER_INODES"
- [ $AFTER_INODES -gt $((BEFORE_INODES + 10)) ] && \
+ [ $AFTER_INODES -gt $((BEFORE_INODES + 32)) ] && \
error "inode slab grew from $BEFORE_INODES to $AFTER_INODES"
true
}
-run_test 76 "destroy duplicate inodes in client inode cache"
+run_test 76 "destroy duplicate inodes in client inode cache ===="
test_77() {
sh qos.sh
export REFORMAT=""
export VERBOSE=false
export GMNALNID=${GMNALNID:-/usr/sbin/gmlndnid}
+export CATASTROPHE=${CATASTROPHE:-/proc/sys/lnet/catastrophe}
# eg, assert_env LUSTRE MDSNODES OSTNODES CLIENTS
assert_env() {
# verify that lustre actually cleaned up properly
cleanup_check() {
+ [ -e $CATASTROPHE -a "`cat $CATASTROPHE`" = "1" ] && echo "LBUG" && exit 206
BUSY=`dmesg | grep -i destruct || true`
if [ "$BUSY" ]; then
echo "$BUSY" 1>&2
#check_mds
test_${testnum} || error "test_$testnum failed with $?"
#check_mds
+ [ -f $CATASTROPHE ] && [ `cat $CATASTROPHE` -ne 0 ] && \
+ error "LBUG/LASSERT detected"
pass "($((`date +%s` - $BEFORE))s)"
}
lload_LDADD := $(LIBREADLINE) $(LIBPTLCTL)
lload_DEPENDENCIES := $(LIBPTLCTL)
-llverfs_LDADD := -lext2fs -le2p
+if EXT2FS_DEVEL
+EXT2FSLIB = -lext2fs
+E2PLIB = -le2p
+else
+E2PLIB =
+EXT2FSLIB =
+endif
+
if BLKID
-llverdev_LDADD := -lext2fs -lblkid
+BLKIDLIB = -lblkid
else
-llverdev_LDADD := -lext2fs
+BLKIDLIB =
endif
+llverfs_LDADD := $(EXT2FSLIB) $(E2PLIB)
+
+llverdev_LDADD := $(EXT2FSLIB) $(BLKIDLIB)
+
liblustreapi_a_SOURCES = liblustreapi.c
wirecheck_SOURCES = wirecheck.c
#include <sys/mount.h>
#include <sys/time.h>
#include <gnu/stubs.h>
-#include <ext2fs/ext2fs.h>
+
+#ifdef HAVE_EXT2FS_EXT2FS_H
+# include <ext2fs/ext2fs.h>
+#endif
#define ONE_MB (1024 * 1024)
#define ONE_GB (1024 * 1024 * 1024)
*/
static int open_dev(const char *devname, int mode)
{
+#ifdef HAVE_EXT2FS_EXT2FS_H
int mount_flags;
char mountpt[80] = "";
devname);
exit(1);
}
+#endif
fd = open(devname, mode | O_EXCL | O_LARGEFILE);
if (fd < 0) {
fprintf(stderr, "%s: Open failed: %s",progname,strerror(errno));
#include <sys/stat.h>
#include <sys/vfs.h>
#include <gnu/stubs.h>
-#include <ext2fs/ext2fs.h>
#include <gnu/stubs.h>
-#include <e2p/e2p.h>
+
+#ifdef HAVE_EXT2FS_EXT2FS_H
+# include <e2p/e2p.h>
+# include <ext2fs/ext2fs.h>
+#endif
#define ONE_MB (1024 * 1024)
#define ONE_GB ((unsigned long long)(1024 * 1024 * 1024))
int file_num = 999999999;
ino_t inode_st = 0;
+#ifdef HAVE_EXT2FS_EXT2FS_H
if (!full && fsetflags(testdir, EXT2_TOPDIR_FL))
fprintf(stderr,
"\n%s: can't set TOPDIR_FL on %s: %s (ignoring)",
progname, testdir, strerror(errno));
-
+#endif
for (; dir_num < num_dirs; num_files++, file_num++) {
if (file_num >= files_in_dir) {
if (dir_num == num_dirs - 1)
isatty_flag = isatty(STDOUT_FILENO);
if (!full) {
+#ifdef HAVE_EXT2FS_EXT2FS_H
struct mntent *tempmnt;
FILE *fp = NULL;
ext2_filsys fs;
num_dirs, fs->super->s_blocks_count,
fs->super->s_blocks_per_group);
ext2fs_close(fs);
+#else
+ goto guess;
+#endif
if (0) { /* ugh */
struct statfs64 statbuf;
guess: