From 5b144ed7bd89784bb1713a3020dc37d1aef72959 Mon Sep 17 00:00:00 2001 From: vitaly Date: Sun, 11 Jun 2006 16:54:36 +0000 Subject: [PATCH] Merge b1_5 from b1_4 (20060607_2142) --- .../patches/ext3-extents-2.6.12.patch | 10 +- .../patches/ext3-extents-2.6.5.patch | 10 +- .../patches/ext3-extents-2.6.9-rhel4.patch | 10 +- .../patches/ext3-mballoc2-2.6-suse.patch | 255 +++++--- .../patches/ext3-mballoc2-2.6.12.patch | 251 +++++--- .../patches/ext3-mballoc2-2.6.9-rhel4.patch | 522 ++++++++------- .../patches/ext3-sector_t-overflow-2.6.12.patch | 64 ++ .../ext3-sector_t-overflow-2.6.5-suse.patch | 44 ++ .../ext3-sector_t-overflow-2.6.9-rhel4.patch | 64 ++ .../kernel_patches/series/ldiskfs-2.6-rhel4.series | 1 + .../kernel_patches/series/ldiskfs-2.6-suse.series | 1 + .../series/ldiskfs-2.6.12-vanilla.series | 1 + lustre/ChangeLog | 64 +- lustre/autoMakefile.am | 19 +- lustre/autoconf/lustre-core.m4 | 5 +- lustre/doc/lfs.1 | 8 + lustre/doc/lfs.lyx | 16 + lustre/doc/lmc.1 | 2 +- lustre/doc/lmc.lyx | 3 +- lustre/include/linux/lustre_fsfilt.h | 23 +- lustre/include/linux/lustre_types.h | 4 +- lustre/include/lprocfs_status.h | 6 + lustre/include/lustre_dlm.h | 34 +- lustre/include/obd.h | 89 ++- lustre/include/obd_class.h | 213 +++++- lustre/include/obd_ost.h | 10 +- .../patches/ext3-extents-2.4.21-chaos.patch | 4 +- .../patches/ext3-extents-2.4.21-suse2.patch | 4 +- .../patches/ext3-extents-2.4.24.patch | 4 +- .../patches/ext3-extents-2.4.29.patch | 4 +- .../patches/ext3-extents-2.6.12.patch | 10 +- .../patches/ext3-extents-2.6.5.patch | 10 +- .../patches/ext3-extents-2.6.9-rhel4.patch | 10 +- .../patches/ext3-mballoc2-2.6-suse.patch | 255 +++++--- .../patches/ext3-mballoc2-2.6.12.patch | 251 +++++--- .../patches/ext3-mballoc2-2.6.9-rhel4.patch | 522 ++++++++------- .../patches/ext3-sector_t-overflow-2.4.patch | 41 ++ .../patches/ext3-sector_t-overflow-2.6.12.patch | 64 ++ .../ext3-sector_t-overflow-2.6.5-suse.patch | 44 ++ .../ext3-sector_t-overflow-2.6.9-rhel4.patch | 64 
++ lustre/kernel_patches/series/hp-pnnl-2.4.20 | 1 + .../kernel_patches/series/ldiskfs-2.6-rhel4.series | 1 + .../kernel_patches/series/ldiskfs-2.6-suse.series | 1 + .../series/ldiskfs-2.6.12-vanilla.series | 1 + lustre/kernel_patches/series/rhel-2.4.21 | 1 + lustre/kernel_patches/series/suse-2.4.21-cray | 1 + lustre/kernel_patches/series/suse-2.4.21-jvn | 31 - lustre/kernel_patches/series/vanilla-2.4.24 | 1 + lustre/kernel_patches/series/vanilla-2.4.29 | 1 + lustre/kernel_patches/series/vanilla-2.4.29-uml | 1 + lustre/kernel_patches/which_patch | 1 - lustre/ldlm/ldlm_lockd.c | 14 +- lustre/ldlm/ldlm_request.c | 288 +++++---- lustre/liblustre/file.c | 19 +- lustre/liblustre/rw.c | 44 +- lustre/liblustre/super.c | 31 +- lustre/llite/dir.c | 32 +- lustre/llite/file.c | 59 +- lustre/llite/llite_lib.c | 46 +- lustre/llite/lproc_llite.c | 56 +- lustre/llite/rw.c | 38 +- lustre/llite/rw24.c | 22 +- lustre/llite/rw26.c | 152 ++++- lustre/lov/lov_internal.h | 69 +- lustre/lov/lov_merge.c | 10 +- lustre/lov/lov_obd.c | 558 +++++++++------- lustre/lov/lov_pack.c | 2 +- lustre/lov/lov_qos.c | 43 +- lustre/lov/lov_request.c | 711 ++++++++++++++------- lustre/lvfs/fsfilt_ext3.c | 30 +- lustre/mdc/mdc_locks.c | 7 +- lustre/mds/handler.c | 19 +- lustre/mds/mds_open.c | 92 +-- lustre/mds/mds_reint.c | 86 +-- lustre/mgc/mgc_request.c | 5 +- lustre/mgs/mgs_handler.c | 9 +- lustre/obdclass/genops.c | 4 +- lustre/obdclass/lprocfs_status.c | 128 +++- lustre/obdclass/obd_config.c | 2 - lustre/obdecho/echo.c | 35 +- lustre/obdecho/echo_client.c | 47 +- lustre/obdfilter/filter.c | 75 ++- lustre/obdfilter/filter_internal.h | 9 +- lustre/obdfilter/filter_io.c | 22 +- lustre/obdfilter/filter_io_24.c | 6 +- lustre/obdfilter/filter_io_26.c | 15 +- lustre/obdfilter/filter_log.c | 28 +- lustre/osc/lproc_osc.c | 20 +- lustre/osc/osc_request.c | 502 ++++++++++----- lustre/ost/ost_handler.c | 51 +- lustre/tests/rundbench | 4 +- lustre/tests/runvmstat | 18 +- lustre/tests/sanity.sh | 6 +- 
lustre/tests/test-framework.sh | 4 + lustre/utils/Makefile.am | 17 +- lustre/utils/llverdev.c | 7 +- lustre/utils/llverfs.c | 14 +- 97 files changed, 4311 insertions(+), 2202 deletions(-) create mode 100644 ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.12.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.5-suse.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.9-rhel4.patch create mode 100644 lustre/kernel_patches/patches/ext3-sector_t-overflow-2.4.patch create mode 100644 lustre/kernel_patches/patches/ext3-sector_t-overflow-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/ext3-sector_t-overflow-2.6.5-suse.patch create mode 100644 lustre/kernel_patches/patches/ext3-sector_t-overflow-2.6.9-rhel4.patch delete mode 100644 lustre/kernel_patches/series/suse-2.4.21-jvn diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch index 520c031..2c65544 100644 --- a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch +++ b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch @@ -2523,26 +2523,30 @@ Index: linux-2.6.12-rc6/fs/ext3/super.c Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -+ Opt_extents, Opt_extdebug, ++ Opt_extents, Opt_noextents, Opt_extdebug, }; static match_table_t tokens = { -@@ -644,6 +647,8 @@ +@@ -644,6 +647,9 @@ {Opt_iopen, "iopen"}, {Opt_noiopen, "noiopen"}, {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, ++ {Opt_noextents, "noextents"}, + {Opt_extdebug, "extdebug"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL}, {Opt_resize, "resize"}, -@@ -953,6 +958,12 @@ +@@ -953,6 +958,15 @@ case Opt_nobh: set_opt(sbi->s_mount_opt, NOBH); break; + case Opt_extents: + set_opt (sbi->s_mount_opt, EXTENTS); + break; ++ case Opt_noextents: ++ clear_opt (sbi->s_mount_opt, EXTENTS); ++ break; + case Opt_extdebug: + 
set_opt (sbi->s_mount_opt, EXTDEBUG); + break; diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch index f829621..be0642f 100644 --- a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch +++ b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch @@ -2512,26 +2512,30 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c Opt_ignore, Opt_barrier, Opt_err, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -+ Opt_extents, Opt_extdebug, ++ Opt_extents, Opt_noextents, Opt_extdebug, }; static match_table_t tokens = { -@@ -582,6 +585,8 @@ +@@ -582,6 +585,9 @@ {Opt_iopen, "iopen"}, {Opt_noiopen, "noiopen"}, {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, ++ {Opt_noextents, "noextents"}, + {Opt_extdebug, "extdebug"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL} }; -@@ -797,6 +802,12 @@ +@@ -797,6 +802,15 @@ break; case Opt_ignore: break; + case Opt_extents: + set_opt (sbi->s_mount_opt, EXTENTS); + break; ++ case Opt_noextents: ++ clear_opt (sbi->s_mount_opt, EXTENTS); ++ break; + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch index 993b237..def228e 100644 --- a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch +++ b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch @@ -2507,26 +2507,30 @@ Index: linux-stage/fs/ext3/super.c Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -+ Opt_extents, Opt_extdebug, ++ Opt_extents, Opt_noextents, Opt_extdebug, }; static match_table_t tokens = { -@@ -639,6 +644,8 @@ +@@ -639,6 +644,9 @@ {Opt_iopen, "iopen"}, {Opt_noiopen, "noiopen"}, {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, ++ {Opt_noextents, "noextents"}, + {Opt_extdebug, "extdebug"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL}, 
{Opt_resize, "resize"}, -@@ -943,6 +950,12 @@ +@@ -943,6 +950,15 @@ match_int(&args[0], &option); *n_blocks_count = option; break; + case Opt_extents: + set_opt (sbi->s_mount_opt, EXTENTS); + break; ++ case Opt_noextents: ++ clear_opt (sbi->s_mount_opt, EXTENTS); ++ break; + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch index 33dc268..e0ee12f 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch @@ -31,8 +31,8 @@ Index: linux-2.6.5-7.252-full/include/linux/ext3_fs.h extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, - unsigned long); + unsigned long, int); -+extern void ext3_free_blocks_old (handle_t *, struct inode *, unsigned long, -+ unsigned long); ++extern void ext3_free_blocks_old(handle_t *, struct inode *, unsigned long, ++ unsigned long); extern unsigned long ext3_count_free_blocks (struct super_block *); extern void ext3_check_blocks_bitmap (struct super_block *); extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, @@ -74,13 +74,13 @@ Index: linux-2.6.5-7.252-full/include/linux/ext3_fs_sb.h /* * third extended-fs super-block data in memory -@@ -78,6 +84,38 @@ struct ext3_sb_info { +@@ -78,6 +84,43 @@ struct ext3_sb_info { struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ #endif + + /* for buddy allocator */ -+ struct ext3_group_info **s_group_info; ++ struct ext3_group_info ***s_group_info; + struct inode *s_buddy_cache; + long s_blocks_reserved; + spinlock_t s_reserve_lock; @@ -91,6 +91,7 @@ Index: linux-2.6.5-7.252-full/include/linux/ext3_fs_sb.h + tid_t s_last_transaction; + int s_mb_factor; + unsigned short *s_mb_offsets, *s_mb_maxs; ++ unsigned long 
s_stripe; + + /* history to debug policy */ + struct ext3_mb_history *s_mb_history; @@ -111,6 +112,10 @@ Index: linux-2.6.5-7.252-full/include/linux/ext3_fs_sb.h + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; }; ++ ++#define EXT3_GROUP_INFO(sb, group) \ ++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \ ++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] #endif /* _LINUX_EXT3_FS_SB */ Index: linux-2.6.5-7.252-full/fs/ext3/super.c @@ -125,29 +130,40 @@ Index: linux-2.6.5-7.252-full/fs/ext3/super.c ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); -@@ -545,7 +546,7 @@ enum { - Opt_ignore, Opt_barrier, +@@ -545,6 +546,7 @@ enum { Opt_err, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -- Opt_extents, Opt_extdebug, -+ Opt_extents, Opt_extdebug, Opt_mballoc, + Opt_extents, Opt_noextents, Opt_extdebug, ++ Opt_mballoc, Opt_nomballoc, Opt_stripe, }; static match_table_t tokens = { -@@ -591,6 +592,7 @@ static match_table_t tokens = { - {Opt_iopen_nopriv, "iopen_nopriv"}, +@@ -591,6 +592,9 @@ static match_table_t tokens = { {Opt_extents, "extents"}, + {Opt_noextents, "noextents"}, {Opt_extdebug, "extdebug"}, + {Opt_mballoc, "mballoc"}, ++ {Opt_nomballoc, "nomballoc"}, ++ {Opt_stripe, "stripe=%u"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL} }; -@@ -813,6 +815,9 @@ static int parse_options (char * options +@@ -813,6 +815,19 @@ static int parse_options (char * options case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; + case Opt_mballoc: -+ set_opt (sbi->s_mount_opt, MBALLOC); ++ set_opt(sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_nomballoc: ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_stripe: ++ if (match_int(&args[0], &option)) ++ return 0; ++ if (option < 0) ++ return 0; ++ sbi->s_stripe = option; + break; default: printk (KERN_ERR @@ -334,7 +350,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c 
=================================================================== --- linux-2.6.5-7.252-full.orig/fs/ext3/mballoc.c 2006-04-22 17:31:47.543334750 +0400 +++ linux-2.6.5-7.252-full/fs/ext3/mballoc.c 2006-04-26 23:42:45.000000000 +0400 -@@ -0,0 +1,2616 @@ +@@ -0,0 +1,2703 @@ +/* + * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -469,10 +485,10 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + + /* search goals */ + struct ext3_free_extent ac_g_ex; -+ ++ + /* the best found extent */ + struct ext3_free_extent ac_b_ex; -+ ++ + /* number of iterations done. we have to track to limit searching */ + unsigned long ac_ex_scanned; + __u16 ac_groups_scanned; @@ -639,7 +655,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + if (mb_check_counter++ % 300 != 0) + return; + } -+ ++ + while (order > 1) { + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); @@ -820,7 +836,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + sb = inode->i_sb; + blocksize = 1 << inode->i_blkbits; + blocks_per_page = PAGE_CACHE_SIZE / blocksize; -+ ++ + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; @@ -835,9 +851,9 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + memset(bh, 0, i); + } else + bh = &bhs; -+ ++ + first_group = page->index * blocks_per_page / 2; -+ ++ + /* read all groups the page covers into the cache */ + for (i = 0; i < groups_per_page; i++) { + struct ext3_group_desc * desc; @@ -892,11 +908,11 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + mb_debug("put buddy for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + memset(data, 0xff, blocksize); -+ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0; -+ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0, ++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; ++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, + sizeof(unsigned short)*(sb->s_blocksize_bits+2)); + ext3_mb_generate_buddy(sb, data, 
bitmap, -+ EXT3_SB(sb)->s_group_info[group]); ++ EXT3_GROUP_INFO(sb, group)); + } else { + /* this is block of bitmap */ + mb_debug("put bitmap for group %u in page %lu/%x\n", @@ -929,7 +945,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + + e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_info = sbi->s_group_info[group]; ++ e3b->bd_info = EXT3_GROUP_INFO(sb, group); + e3b->bd_sb = sb; + e3b->bd_group = group; + e3b->bd_buddy_page = NULL; @@ -1005,14 +1021,14 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c +ext3_lock_group(struct super_block *sb, int group) +{ + bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++ &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static inline void +ext3_unlock_group(struct super_block *sb, int group) +{ + bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++ &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) @@ -1367,7 +1383,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); -+ ++ + if (max > 0) { + ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); @@ -1384,6 +1400,8 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + struct ext3_buddy *e3b) +{ + int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_super_block *es = sbi->s_es; + struct ext3_free_extent ex; + + err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); @@ -1392,9 +1410,18 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, -+ ac->ac_g_ex.fe_len, &ex); -+ -+ if (max >= ac->ac_g_ex.fe_len) { ++ ac->ac_g_ex.fe_len, &ex); ++ ++ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ++ unsigned long start; ++ 
start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + ++ ex.fe_start + le32_to_cpu(es->s_first_data_block)); ++ if (start % sbi->s_stripe == 0) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ } else if (max >= ac->ac_g_ex.fe_len) { + J_ASSERT(ex.fe_len > 0); + J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); + J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); @@ -1496,11 +1523,46 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + } +} + ++/* ++ * This is a special case for storages like raid5 ++ * we try to find stripe-aligned chunks for stripe-size requests ++ */ ++static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ unsigned long i, max; ++ ++ J_ASSERT(sbi->s_stripe != 0); ++ ++ /* find first stripe-aligned block */ ++ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + le32_to_cpu(sbi->s_es->s_first_data_block); ++ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; ++ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block)) ++ % EXT3_BLOCKS_PER_GROUP(sb); ++ ++ while (i < sb->s_blocksize * 8) { ++ if (!mb_test_bit(i, bitmap)) { ++ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex); ++ if (max >= sbi->s_stripe) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ break; ++ } ++ } ++ i += sbi->s_stripe; ++ } ++} ++ +static int ext3_mb_good_group(struct ext3_allocation_context *ac, + int group, int cr) +{ -+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); -+ struct ext3_group_info *grp = sbi->s_group_info[group]; ++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group); + unsigned free, fragments, i, bits; + + J_ASSERT(cr >= 0 && cr < 4); @@ -1629,6 +1691,13 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + ac.ac_2order = 0; + ac.ac_criteria = 0; + ++ if (*len == 1 && 
sbi->s_stripe) { ++ /* looks like a metadata, let's use a dirty hack for raid5 ++ * move all metadata in first groups in hope to hit cached ++ * sectors and thus avoid read-modify cycles in raid5 */ ++ ac.ac_g_ex.fe_group = group = 0; ++ } ++ + /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ + i = ffs(*len); + if (i >= ext3_mb_order2_reqs) { @@ -1653,7 +1722,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + if (group == EXT3_SB(sb)->s_groups_count) + group = 0; + -+ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) { ++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { + /* we need full data about the group + * to make a good selection */ + err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); @@ -1681,6 +1750,8 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + ac.ac_groups_scanned++; + if (cr == 0) + ext3_mb_simple_scan_group(&ac, &e3b); ++ else if (cr == 1 && *len == sbi->s_stripe) ++ ext3_mb_scan_aligned(&ac, &e3b); + else + ext3_mb_complex_scan_group(&ac, &e3b); + @@ -1694,7 +1765,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + } + + if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && -+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { + /* + * We've been searching too long. 
Let's try to allocate + * the best chunk we've found so far @@ -1739,8 +1810,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + sbi->s_blocks_reserved, ac.ac_found); + printk("EXT3-fs: groups: "); + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ printk("%d: %d ", i, -+ sbi->s_group_info[i]->bb_free); ++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free); + printk("\n"); +#endif + goto out; @@ -1778,7 +1848,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + *errp = -EIO; + goto out_err; + } -+ ++ + err = ext3_journal_get_write_access(handle, gdp_bh); + if (err) + goto out_err; @@ -1847,7 +1917,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + * path only, here is single block always */ + ext3_mb_release_blocks(sb, 1); + } -+ ++ + if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { + atomic_inc(&sbi->s_bal_reqs); + atomic_add(*len, &sbi->s_bal_allocated); @@ -1972,7 +2042,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + s->max = sbi->s_mb_history_max; + s->start = sbi->s_mb_history_cur % s->max; + spin_unlock(&sbi->s_mb_history_lock); -+ ++ + rc = seq_open(file, &ext3_mb_seq_history_ops); + if (rc == 0) { + struct seq_file *m = (struct seq_file *)file->private_data; @@ -1996,10 +2066,10 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + +static struct file_operations ext3_mb_seq_history_fops = { + .owner = THIS_MODULE, -+ .open = ext3_mb_seq_history_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = ext3_mb_seq_history_release, ++ .open = ext3_mb_seq_history_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = ext3_mb_seq_history_release, +}; + +static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) @@ -2048,7 +2118,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + sizeof(struct ext3_group_info); + ext3_lock_group(sb, group); -+ memcpy(&sg, sbi->s_group_info[group], i); ++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); + 
ext3_unlock_group(sb, group); + + if (EXT3_MB_GRP_NEED_INIT(&sg.info)) @@ -2091,10 +2161,10 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + +static struct file_operations ext3_mb_seq_groups_fops = { + .owner = THIS_MODULE, -+ .open = ext3_mb_seq_groups_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, ++ .open = ext3_mb_seq_groups_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, +}; + +static void ext3_mb_history_release(struct super_block *sb) @@ -2181,21 +2251,40 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c +int ext3_mb_init_backend(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, len; -+ -+ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; -+ sbi->s_group_info = kmalloc(len, GFP_KERNEL); ++ int i, j, len, metalen; ++ int num_meta_group_infos = ++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ struct ext3_group_info **meta_group_info; ++ ++ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte ++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. ++ * So a two level scheme suffices for now. 
*/ ++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * ++ num_meta_group_infos, GFP_KERNEL); + if (sbi->s_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n"); ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n"); + return -ENOMEM; + } -+ memset(sbi->s_group_info, 0, len); -+ + sbi->s_buddy_cache = new_inode(sb); + if (sbi->s_buddy_cache == NULL) { + printk(KERN_ERR "EXT3-fs: can't get new inode\n"); -+ kfree(sbi->s_group_info); -+ return -ENOMEM; ++ goto err_freesgi; ++ } ++ ++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) { ++ if ((i + 1) == num_meta_group_infos) ++ metalen = sizeof(*meta_group_info) * ++ (sbi->s_groups_count - ++ (i << EXT3_DESC_PER_BLOCK_BITS(sb))); ++ meta_group_info = kmalloc(metalen, GFP_KERNEL); ++ if (meta_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a " ++ "buddy group\n"); ++ goto err_freemeta; ++ } ++ sbi->s_group_info[i] = meta_group_info; + } + + /* @@ -2207,30 +2296,42 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext3_group_desc * desc; + -+ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL); -+ if (sbi->s_group_info[i] == NULL) { ++ meta_group_info = ++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)]; ++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1); ++ ++ meta_group_info[j] = kmalloc(len, GFP_KERNEL); ++ if (meta_group_info[j] == NULL) { + printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); -+ goto err_out; ++ i--; ++ goto err_freebuddy; + } + desc = ext3_get_group_desc(sb, i, NULL); + if (desc == NULL) { + printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i); -+ goto err_out; ++ goto err_freebuddy; + } -+ memset(sbi->s_group_info[i], 0, len); ++ memset(meta_group_info[j], 0, len); + set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, -+ &sbi->s_group_info[i]->bb_state); -+ sbi->s_group_info[i]->bb_free = ++ 
&meta_group_info[j]->bb_state); ++ meta_group_info[j]->bb_free = + le16_to_cpu(desc->bg_free_blocks_count); + } + + return 0; + -+err_out: ++err_freebuddy: ++ while (i >= 0) { ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ i--; ++ } ++ i = num_meta_group_infos; ++err_freemeta: + while (--i >= 0) + kfree(sbi->s_group_info[i]); + iput(sbi->s_buddy_cache); -+ ++err_freesgi: ++ kfree(sbi->s_group_info); + return -ENOMEM; +} + @@ -2272,7 +2373,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + max = max >> 1; + i++; + } while (i <= sb->s_blocksize_bits + 1); -+ ++ + + /* init file for buddy data */ + if ((i = ext3_mb_init_backend(sb))) { @@ -2309,8 +2410,8 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c +int ext3_mb_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i; -+ ++ int i, num_meta_group_infos; ++ + if (!test_opt(sb, MBALLOC)) + return 0; + @@ -2324,11 +2425,13 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + ext3_mb_free_committed_blocks(sb); + + if (sbi->s_group_info) { -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ if (sbi->s_group_info[i] == NULL) -+ continue; ++ for (i = 0; i < sbi->s_groups_count; i++) ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ num_meta_group_infos = (sbi->s_groups_count + ++ EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) + kfree(sbi->s_group_info[i]); -+ } + kfree(sbi->s_group_info); + } + if (sbi->s_mb_offsets) @@ -2622,7 +2725,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_mod(&sbi->s_freeblocks_counter, count); -+ ++ + ext3_mb_release_desc(&e3b); + + *freed = count; @@ -2706,11 +2809,11 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + return; +} + -+#define EXT3_ROOT "ext3" -+#define EXT3_MB_STATS_NAME "mb_stats" ++#define EXT3_ROOT "ext3" ++#define EXT3_MB_STATS_NAME "mb_stats" +#define 
EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" +#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" -+#define EXT3_MB_ORDER2_REQ "mb_order2_req" ++#define EXT3_MB_ORDER2_REQ "mb_order2_req" + +static int ext3_mb_stats_read(char *page, char **start, off_t off, + int count, int *eof, void *data) diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch index 0297609..eade9a8 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch @@ -71,13 +71,13 @@ Index: linux-2.6.12.6-bull/include/linux/ext3_fs_sb.h /* * third extended-fs super-block data in memory -@@ -78,6 +84,38 @@ struct ext3_sb_info { +@@ -78,6 +84,43 @@ struct ext3_sb_info { char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ #endif + + /* for buddy allocator */ -+ struct ext3_group_info **s_group_info; ++ struct ext3_group_info ***s_group_info; + struct inode *s_buddy_cache; + long s_blocks_reserved; + spinlock_t s_reserve_lock; @@ -88,6 +88,7 @@ Index: linux-2.6.12.6-bull/include/linux/ext3_fs_sb.h + tid_t s_last_transaction; + int s_mb_factor; + unsigned short *s_mb_offsets, *s_mb_maxs; ++ unsigned long s_stripe; + + /* history to debug policy */ + struct ext3_mb_history *s_mb_history; @@ -108,6 +109,10 @@ Index: linux-2.6.12.6-bull/include/linux/ext3_fs_sb.h + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; }; ++ ++#define EXT3_GROUP_INFO(sb, group) \ ++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \ ++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] #endif /* _LINUX_EXT3_FS_SB */ Index: linux-2.6.12.6-bull/fs/ext3/super.c @@ -122,29 +127,40 @@ Index: linux-2.6.12.6-bull/fs/ext3/super.c ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); -@@ -597,7 +598,7 @@ enum { - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, +@@ -597,6 
+598,7 @@ enum { Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -- Opt_extents, Opt_extdebug, -+ Opt_extents, Opt_extdebug, Opt_mballoc, + Opt_extents, Opt_noextents, Opt_extdebug, ++ Opt_mballoc, Opt_nomballoc, Opt_stripe, }; static match_table_t tokens = { -@@ -650,6 +651,7 @@ static match_table_t tokens = { - {Opt_iopen_nopriv, "iopen_nopriv"}, +@@ -650,6 +651,9 @@ static match_table_t tokens = { {Opt_extents, "extents"}, + {Opt_noextents, "noextents"}, {Opt_extdebug, "extdebug"}, + {Opt_mballoc, "mballoc"}, ++ {Opt_nomballoc, "nomballoc"}, ++ {Opt_stripe, "stripe=%u"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL}, {Opt_resize, "resize"}, -@@ -965,6 +967,9 @@ clear_qf_name: +@@ -965,6 +967,19 @@ clear_qf_name: case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; + case Opt_mballoc: -+ set_opt (sbi->s_mount_opt, MBALLOC); ++ set_opt(sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_nomballoc: ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_stripe: ++ if (match_int(&args[0], &option)) ++ return 0; ++ if (option < 0) ++ return 0; ++ sbi->s_stripe = option; + break; default: printk (KERN_ERR @@ -329,7 +345,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c =================================================================== --- linux-2.6.12.6-bull.orig/fs/ext3/mballoc.c 2006-04-22 17:31:47.543334750 +0400 +++ linux-2.6.12.6-bull/fs/ext3/mballoc.c 2006-04-30 01:24:11.000000000 +0400 -@@ -0,0 +1,2615 @@ +@@ -0,0 +1,2702 @@ +/* + * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -464,10 +480,10 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + + /* search goals */ + struct ext3_free_extent ac_g_ex; -+ ++ + /* the best found extent */ + struct ext3_free_extent ac_b_ex; -+ ++ + /* number of iterations done. 
we have to track to limit searching */ + unsigned long ac_ex_scanned; + __u16 ac_groups_scanned; @@ -634,7 +650,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + if (mb_check_counter++ % 300 != 0) + return; + } -+ ++ + while (order > 1) { + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); @@ -815,7 +831,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + sb = inode->i_sb; + blocksize = 1 << inode->i_blkbits; + blocks_per_page = PAGE_CACHE_SIZE / blocksize; -+ ++ + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; @@ -830,9 +846,9 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + memset(bh, 0, i); + } else + bh = &bhs; -+ ++ + first_group = page->index * blocks_per_page / 2; -+ ++ + /* read all groups the page covers into the cache */ + for (i = 0; i < groups_per_page; i++) { + struct ext3_group_desc * desc; @@ -887,11 +903,11 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + mb_debug("put buddy for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + memset(data, 0xff, blocksize); -+ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0; -+ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0, ++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; ++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, + sizeof(unsigned short)*(sb->s_blocksize_bits+2)); + ext3_mb_generate_buddy(sb, data, bitmap, -+ EXT3_SB(sb)->s_group_info[group]); ++ EXT3_GROUP_INFO(sb, group)); + } else { + /* this is block of bitmap */ + mb_debug("put bitmap for group %u in page %lu/%x\n", @@ -924,7 +940,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + + e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_info = sbi->s_group_info[group]; ++ e3b->bd_info = EXT3_GROUP_INFO(sb, group); + e3b->bd_sb = sb; + e3b->bd_group = group; + e3b->bd_buddy_page = NULL; @@ -1000,14 +1016,14 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c +ext3_lock_group(struct super_block *sb, int group) +{ + 
bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++ &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static inline void +ext3_unlock_group(struct super_block *sb, int group) +{ + bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++ &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) @@ -1362,7 +1378,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); -+ ++ + if (max > 0) { + ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); @@ -1379,6 +1395,8 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + struct ext3_buddy *e3b) +{ + int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_super_block *es = sbi->s_es; + struct ext3_free_extent ex; + + err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); @@ -1387,9 +1405,18 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, -+ ac->ac_g_ex.fe_len, &ex); -+ -+ if (max >= ac->ac_g_ex.fe_len) { ++ ac->ac_g_ex.fe_len, &ex); ++ ++ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ++ unsigned long start; ++ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + ++ ex.fe_start + le32_to_cpu(es->s_first_data_block)); ++ if (start % sbi->s_stripe == 0) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ } else if (max >= ac->ac_g_ex.fe_len) { + J_ASSERT(ex.fe_len > 0); + J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); + J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); @@ -1491,11 +1518,46 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + } +} + ++/* ++ * This is a special case for storages like raid5 ++ * we try to find stripe-aligned chunks for stripe-size requests ++ */ ++static void 
ext3_mb_scan_aligned(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ unsigned long i, max; ++ ++ J_ASSERT(sbi->s_stripe != 0); ++ ++ /* find first stripe-aligned block */ ++ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + le32_to_cpu(sbi->s_es->s_first_data_block); ++ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; ++ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block)) ++ % EXT3_BLOCKS_PER_GROUP(sb); ++ ++ while (i < sb->s_blocksize * 8) { ++ if (!mb_test_bit(i, bitmap)) { ++ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex); ++ if (max >= sbi->s_stripe) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ break; ++ } ++ } ++ i += sbi->s_stripe; ++ } ++} ++ +static int ext3_mb_good_group(struct ext3_allocation_context *ac, + int group, int cr) +{ -+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); -+ struct ext3_group_info *grp = sbi->s_group_info[group]; ++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group); + unsigned free, fragments, i, bits; + + J_ASSERT(cr >= 0 && cr < 4); @@ -1624,6 +1686,13 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + ac.ac_2order = 0; + ac.ac_criteria = 0; + ++ if (*len == 1 && sbi->s_stripe) { ++ /* looks like a metadata, let's use a dirty hack for raid5 ++ * move all metadata in first groups in hope to hit cached ++ * sectors and thus avoid read-modify cycles in raid5 */ ++ ac.ac_g_ex.fe_group = group = 0; ++ } ++ + /* probably, the request is for 2^8+ blocks (1/2/3/... 
MB) */ + i = ffs(*len); + if (i >= ext3_mb_order2_reqs) { @@ -1648,7 +1717,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + if (group == EXT3_SB(sb)->s_groups_count) + group = 0; + -+ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) { ++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { + /* we need full data about the group + * to make a good selection */ + err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); @@ -1676,6 +1745,8 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + ac.ac_groups_scanned++; + if (cr == 0) + ext3_mb_simple_scan_group(&ac, &e3b); ++ else if (cr == 1 && *len == sbi->s_stripe) ++ ext3_mb_scan_aligned(&ac, &e3b); + else + ext3_mb_complex_scan_group(&ac, &e3b); + @@ -1689,7 +1760,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + } + + if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && -+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { + /* + * We've been searching too long. Let's try to allocate + * the best chunk we've found so far @@ -1734,8 +1805,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + sbi->s_blocks_reserved, ac.ac_found); + printk("EXT3-fs: groups: "); + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ printk("%d: %d ", i, -+ sbi->s_group_info[i]->bb_free); ++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free); + printk("\n"); +#endif + goto out; @@ -1773,7 +1843,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + *errp = -EIO; + goto out_err; + } -+ ++ + err = ext3_journal_get_write_access(handle, gdp_bh); + if (err) + goto out_err; @@ -1842,7 +1912,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + * path only, here is single block always */ + ext3_mb_release_blocks(sb, 1); + } -+ ++ + if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { + atomic_inc(&sbi->s_bal_reqs); + atomic_add(*len, &sbi->s_bal_allocated); @@ -1967,7 +2037,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + s->max = sbi->s_mb_history_max; + s->start = sbi->s_mb_history_cur % s->max; + 
spin_unlock(&sbi->s_mb_history_lock); -+ ++ + rc = seq_open(file, &ext3_mb_seq_history_ops); + if (rc == 0) { + struct seq_file *m = (struct seq_file *)file->private_data; @@ -1991,10 +2061,10 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + +static struct file_operations ext3_mb_seq_history_fops = { + .owner = THIS_MODULE, -+ .open = ext3_mb_seq_history_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = ext3_mb_seq_history_release, ++ .open = ext3_mb_seq_history_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = ext3_mb_seq_history_release, +}; + +static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) @@ -2043,7 +2113,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + sizeof(struct ext3_group_info); + ext3_lock_group(sb, group); -+ memcpy(&sg, sbi->s_group_info[group], i); ++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); + ext3_unlock_group(sb, group); + + if (EXT3_MB_GRP_NEED_INIT(&sg.info)) @@ -2086,10 +2156,10 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + +static struct file_operations ext3_mb_seq_groups_fops = { + .owner = THIS_MODULE, -+ .open = ext3_mb_seq_groups_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, ++ .open = ext3_mb_seq_groups_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, +}; + +static void ext3_mb_history_release(struct super_block *sb) @@ -2176,21 +2246,40 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c +int ext3_mb_init_backend(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, len; -+ -+ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; -+ sbi->s_group_info = kmalloc(len, GFP_KERNEL); ++ int i, j, len, metalen; ++ int num_meta_group_infos = ++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ struct ext3_group_info **meta_group_info; ++ ++ /* An 8TB filesystem with 64-bit 
pointers requires a 4096 byte ++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. ++ * So a two level scheme suffices for now. */ ++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * ++ num_meta_group_infos, GFP_KERNEL); + if (sbi->s_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n"); ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n"); + return -ENOMEM; + } -+ memset(sbi->s_group_info, 0, len); -+ + sbi->s_buddy_cache = new_inode(sb); + if (sbi->s_buddy_cache == NULL) { + printk(KERN_ERR "EXT3-fs: can't get new inode\n"); -+ kfree(sbi->s_group_info); -+ return -ENOMEM; ++ goto err_freesgi; ++ } ++ ++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) { ++ if ((i + 1) == num_meta_group_infos) ++ metalen = sizeof(*meta_group_info) * ++ (sbi->s_groups_count - ++ (i << EXT3_DESC_PER_BLOCK_BITS(sb))); ++ meta_group_info = kmalloc(metalen, GFP_KERNEL); ++ if (meta_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a " ++ "buddy group\n"); ++ goto err_freemeta; ++ } ++ sbi->s_group_info[i] = meta_group_info; + } + + /* @@ -2202,30 +2291,42 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext3_group_desc * desc; + -+ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL); -+ if (sbi->s_group_info[i] == NULL) { ++ meta_group_info = ++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)]; ++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1); ++ ++ meta_group_info[j] = kmalloc(len, GFP_KERNEL); ++ if (meta_group_info[j] == NULL) { + printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); -+ goto err_out; ++ i--; ++ goto err_freebuddy; + } + desc = ext3_get_group_desc(sb, i, NULL); + if (desc == NULL) { + printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i); -+ goto err_out; ++ goto err_freebuddy; + } -+ memset(sbi->s_group_info[i], 0, len); ++ memset(meta_group_info[j], 0, len); + 
set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, -+ &sbi->s_group_info[i]->bb_state); -+ sbi->s_group_info[i]->bb_free = ++ &meta_group_info[j]->bb_state); ++ meta_group_info[j]->bb_free = + le16_to_cpu(desc->bg_free_blocks_count); + } + + return 0; + -+err_out: ++err_freebuddy: ++ while (i >= 0) { ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ i--; ++ } ++ i = num_meta_group_infos; ++err_freemeta: + while (--i >= 0) + kfree(sbi->s_group_info[i]); + iput(sbi->s_buddy_cache); -+ ++err_freesgi: ++ kfree(sbi->s_group_info); + return -ENOMEM; +} + @@ -2267,7 +2368,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + max = max >> 1; + i++; + } while (i <= sb->s_blocksize_bits + 1); -+ ++ + + /* init file for buddy data */ + if ((i = ext3_mb_init_backend(sb))) { @@ -2304,8 +2405,8 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c +int ext3_mb_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i; -+ ++ int i, num_meta_group_infos; ++ + if (!test_opt(sb, MBALLOC)) + return 0; + @@ -2319,11 +2420,13 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + ext3_mb_free_committed_blocks(sb); + + if (sbi->s_group_info) { -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ if (sbi->s_group_info[i] == NULL) -+ continue; ++ for (i = 0; i < sbi->s_groups_count; i++) ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ num_meta_group_infos = (sbi->s_groups_count + ++ EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) + kfree(sbi->s_group_info[i]); -+ } + kfree(sbi->s_group_info); + } + if (sbi->s_mb_offsets) @@ -2617,7 +2720,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_mod(&sbi->s_freeblocks_counter, count); -+ ++ + ext3_mb_release_desc(&e3b); + + *freed = count; @@ -2700,11 +2803,11 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + return; +} + -+#define EXT3_ROOT "ext3" -+#define EXT3_MB_STATS_NAME 
"mb_stats" ++#define EXT3_ROOT "ext3" ++#define EXT3_MB_STATS_NAME "mb_stats" +#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" +#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" -+#define EXT3_MB_ORDER2_REQ "mb_order2_req" ++#define EXT3_MB_ORDER2_REQ "mb_order2_req" + +static int ext3_mb_stats_read(char *page, char **start, off_t off, + int count, int *eof, void *data) diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch index ced267d..43fc776 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch @@ -1,7 +1,61 @@ -Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h +Index: linux-stage/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs_sb.h 2006-05-18 23:57:04.000000000 +0400 -+++ linux-2.6.9-full/include/linux/ext3_fs_sb.h 2006-05-22 21:45:08.000000000 +0400 +--- linux-stage.orig/include/linux/ext3_fs.h 2006-05-25 10:36:04.000000000 -0600 ++++ linux-stage/include/linux/ext3_fs.h 2006-05-25 10:36:04.000000000 -0600 +@@ -57,6 +57,14 @@ struct statfs; + #define ext3_debug(f, a...) 
do {} while (0) + #endif + ++#define EXT3_MULTIBLOCK_ALLOCATOR 1 ++ ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ + /* + * Special inodes numbers + */ +@@ -365,6 +373,7 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -726,7 +735,7 @@ extern int ext3_bg_has_super(struct supe + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); + extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); + extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, +- unsigned long); ++ unsigned long, int); + extern void ext3_free_blocks_sb (handle_t *, struct super_block *, + unsigned long, unsigned long, int *); + extern unsigned long ext3_count_free_blocks (struct super_block *); +@@ -857,6 +866,17 @@ extern void ext3_extents_initialize_bloc + extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); + ++/* mballoc.c */ ++extern long ext3_mb_stats; ++extern long ext3_mb_max_to_scan; ++extern int ext3_mb_init(struct super_block *, int); ++extern int ext3_mb_release(struct super_block *); ++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); ++extern int ext3_mb_reserve_blocks(struct super_block *, int); ++extern void ext3_mb_release_blocks(struct super_block *, int); ++int __init init_ext3_proc(void); ++void exit_ext3_proc(void); ++ + #endif /* __KERNEL__ */ + + /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). 
*/ +Index: linux-stage/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_sb.h 2006-05-25 10:36:04.000000000 -0600 ++++ linux-stage/include/linux/ext3_fs_sb.h 2006-05-25 10:59:14.000000000 -0600 @@ -23,9 +23,15 @@ #define EXT_INCLUDE #include @@ -18,13 +72,13 @@ Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h /* * third extended-fs super-block data in memory -@@ -81,6 +87,39 @@ struct ext3_sb_info { +@@ -81,6 +87,43 @@ struct ext3_sb_info { char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ #endif + + /* for buddy allocator */ -+ struct ext3_group_info **s_group_info; ++ struct ext3_group_info ***s_group_info; + struct inode *s_buddy_cache; + long s_blocks_reserved; + spinlock_t s_reserve_lock; @@ -56,67 +110,17 @@ Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; }; - - #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-2.6.9-full/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2006-05-18 23:57:04.000000000 +0400 -+++ linux-2.6.9-full/include/linux/ext3_fs.h 2006-05-22 21:44:37.000000000 +0400 -@@ -57,6 +57,14 @@ struct statfs; - #define ext3_debug(f, a...) 
do {} while (0) - #endif - -+#define EXT3_MULTIBLOCK_ALLOCATOR 1 -+ -+#define EXT3_MB_HINT_MERGE 1 -+#define EXT3_MB_HINT_RESERVED 2 -+#define EXT3_MB_HINT_METADATA 4 -+#define EXT3_MB_HINT_FIRST 8 -+#define EXT3_MB_HINT_BEST 16 + - /* - * Special inodes numbers - */ -@@ -365,6 +373,7 @@ struct ext3_inode { - #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ - #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ - #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ ++#define EXT3_GROUP_INFO(sb, group) \ ++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \ ++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef clear_opt -@@ -726,7 +735,7 @@ extern int ext3_bg_has_super(struct supe - extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); - extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); - extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, -- unsigned long); -+ unsigned long, int); - extern void ext3_free_blocks_sb (handle_t *, struct super_block *, - unsigned long, unsigned long, int *); - extern unsigned long ext3_count_free_blocks (struct super_block *); -@@ -857,6 +866,17 @@ extern void ext3_extents_initialize_bloc - extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg); - -+/* mballoc.c */ -+extern long ext3_mb_stats; -+extern long ext3_mb_max_to_scan; -+extern int ext3_mb_init(struct super_block *, int); -+extern int ext3_mb_release(struct super_block *); -+extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); -+extern int ext3_mb_reserve_blocks(struct super_block *, int); -+extern void ext3_mb_release_blocks(struct super_block *, int); -+int __init init_ext3_proc(void); -+void exit_ext3_proc(void); -+ - #endif 
/* __KERNEL__ */ - - /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ -Index: linux-2.6.9-full/fs/ext3/super.c + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-stage/fs/ext3/super.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/super.c 2006-05-18 23:57:04.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/super.c 2006-05-22 21:52:54.000000000 +0400 -@@ -394,6 +394,7 @@ void ext3_put_super (struct super_block +--- linux-stage.orig/fs/ext3/super.c 2006-05-25 10:36:04.000000000 -0600 ++++ linux-stage/fs/ext3/super.c 2006-05-25 10:36:04.000000000 -0600 +@@ -394,6 +394,7 @@ void ext3_put_super (struct super_block struct ext3_super_block *es = sbi->s_es; int i; @@ -124,30 +128,33 @@ Index: linux-2.6.9-full/fs/ext3/super.c ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); -@@ -596,7 +597,7 @@ enum { - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, +@@ -597,6 +598,7 @@ enum { Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -- Opt_extents, Opt_extdebug, -+ Opt_extents, Opt_extdebug, Opt_mballoc, Opt_stripe + Opt_extents, Opt_noextents, Opt_extdebug, ++ Opt_mballoc, Opt_nomballoc, Opt_stripe, }; static match_table_t tokens = { -@@ -648,6 +649,8 @@ static match_table_t tokens = { - {Opt_iopen_nopriv, "iopen_nopriv"}, +@@ -649,6 +651,9 @@ static match_table_t tokens = { {Opt_extents, "extents"}, + {Opt_noextents, "noextents"}, {Opt_extdebug, "extdebug"}, + {Opt_mballoc, "mballoc"}, ++ {Opt_nomballoc, "nomballoc"}, + {Opt_stripe, "stripe=%u"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL}, {Opt_resize, "resize"}, -@@ -958,6 +961,16 @@ clear_qf_name: +@@ -962,6 +967,19 @@ static int parse_options (char * options case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; + case Opt_mballoc: -+ set_opt (sbi->s_mount_opt, MBALLOC); ++ set_opt(sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_nomballoc: ++ clear_opt(sbi->s_mount_opt, MBALLOC); + 
break; + case Opt_stripe: + if (match_int(&args[0], &option)) @@ -159,7 +166,7 @@ Index: linux-2.6.9-full/fs/ext3/super.c default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1647,6 +1660,7 @@ static int ext3_fill_super (struct super +@@ -1651,6 +1669,7 @@ static int ext3_fill_super (struct super ext3_count_dirs(sb)); ext3_ext_init(sb); @@ -167,7 +174,7 @@ Index: linux-2.6.9-full/fs/ext3/super.c return 0; -@@ -2429,7 +2443,13 @@ static struct file_system_type ext3_fs_t +@@ -2433,7 +2452,13 @@ static struct file_system_type ext3_fs_t static int __init init_ext3_fs(void) { @@ -182,7 +189,7 @@ Index: linux-2.6.9-full/fs/ext3/super.c if (err) return err; err = init_inodecache(); -@@ -2451,6 +2471,7 @@ static void __exit exit_ext3_fs(void) +@@ -2455,6 +2480,7 @@ static void __exit exit_ext3_fs(void) unregister_filesystem(&ext3_fs_type); destroy_inodecache(); exit_ext3_xattr(); @@ -190,10 +197,10 @@ Index: linux-2.6.9-full/fs/ext3/super.c } int ext3_prep_san_write(struct inode *inode, long *blocks, -Index: linux-2.6.9-full/fs/ext3/extents.c +Index: linux-stage/fs/ext3/extents.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/extents.c 2006-05-18 23:57:04.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/extents.c 2006-05-22 21:44:37.000000000 +0400 +--- linux-stage.orig/fs/ext3/extents.c 2006-05-25 10:36:04.000000000 -0600 ++++ linux-stage/fs/ext3/extents.c 2006-05-25 10:36:04.000000000 -0600 @@ -777,7 +777,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) @@ -235,23 +242,97 @@ Index: linux-2.6.9-full/fs/ext3/extents.c } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", from, to, ex->ee_block, ex->ee_len); -Index: linux-2.6.9-full/fs/ext3/Makefile +Index: linux-stage/fs/ext3/inode.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/Makefile 2006-05-18 
23:57:04.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/Makefile 2006-05-22 21:44:37.000000000 +0400 -@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o +--- linux-stage.orig/fs/ext3/inode.c 2006-05-25 10:36:04.000000000 -0600 ++++ linux-stage/fs/ext3/inode.c 2006-05-25 10:36:04.000000000 -0600 +@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o \ -- extents.o -+ extents.o mballoc.o +@@ -673,7 +673,7 @@ err_out: + if (err == -EAGAIN) + for (i = 0; i < num; i++) + ext3_free_blocks(handle, inode, +- le32_to_cpu(where[i].key), 1); ++ le32_to_cpu(where[i].key), 1, 1); + return err; + } - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o -Index: linux-2.6.9-full/fs/ext3/xattr.c +@@ -1831,7 +1831,7 @@ ext3_clear_blocks(handle_t *handle, stru + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -2004,7 +2004,7 @@ static void ext3_free_branches(handle_t + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* +Index: linux-stage/fs/ext3/balloc.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/xattr.c 2006-05-18 23:57:04.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/xattr.c 2006-05-22 21:44:37.000000000 +0400 +--- linux-stage.orig/fs/ext3/balloc.c 2006-05-25 10:36:02.000000000 -0600 ++++ linux-stage/fs/ext3/balloc.c 2006-05-25 10:36:04.000000000 -0600 +@@ -79,7 +79,7 @@ struct ext3_group_desc * 
ext3_get_group_ + * + * Return buffer_head on success or NULL in case of failure. + */ +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) + { + struct ext3_group_desc * desc; +@@ -451,24 +451,6 @@ + return; + } + +-/* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, +- unsigned long block, unsigned long count) +-{ +- struct super_block * sb; +- int dquot_freed_blocks; +- +- sb = inode->i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This +@@ -1131,7 +1113,7 @@ + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. 
+ */ +-int ext3_new_block(handle_t *handle, struct inode *inode, ++int ext3_new_block_old(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) + { + struct buffer_head *bitmap_bh = NULL; +Index: linux-stage/fs/ext3/xattr.c +=================================================================== +--- linux-stage.orig/fs/ext3/xattr.c 2006-05-25 10:36:04.000000000 -0600 ++++ linux-stage/fs/ext3/xattr.c 2006-05-25 10:36:04.000000000 -0600 @@ -1281,7 +1281,7 @@ ext3_xattr_set_handle2(handle_t *handle, new_bh = sb_getblk(sb, block); if (!new_bh) { @@ -279,11 +360,11 @@ Index: linux-2.6.9-full/fs/ext3/xattr.c get_bh(bh); ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); } else { -Index: linux-2.6.9-full/fs/ext3/mballoc.c +Index: linux-stage/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/mballoc.c 2006-05-12 23:14:51.200000000 +0400 -+++ linux-2.6.9-full/fs/ext3/mballoc.c 2006-05-22 21:51:30.000000000 +0400 -@@ -0,0 +1,2671 @@ +--- linux-stage.orig/fs/ext3/mballoc.c 2006-05-23 17:33:37.579436680 -0600 ++++ linux-stage/fs/ext3/mballoc.c 2006-05-25 10:59:14.000000000 -0600 +@@ -0,0 +1,2702 @@ +/* + * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -418,10 +499,10 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + + /* search goals */ + struct ext3_free_extent ac_g_ex; -+ ++ + /* the best found extent */ + struct ext3_free_extent ac_b_ex; -+ ++ + /* number of iterations done. 
we have to track to limit searching */ + unsigned long ac_ex_scanned; + __u16 ac_groups_scanned; @@ -588,7 +669,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + if (mb_check_counter++ % 300 != 0) + return; + } -+ ++ + while (order > 1) { + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); @@ -769,7 +850,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + sb = inode->i_sb; + blocksize = 1 << inode->i_blkbits; + blocks_per_page = PAGE_CACHE_SIZE / blocksize; -+ ++ + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; @@ -784,9 +865,9 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + memset(bh, 0, i); + } else + bh = &bhs; -+ ++ + first_group = page->index * blocks_per_page / 2; -+ ++ + /* read all groups the page covers into the cache */ + for (i = 0; i < groups_per_page; i++) { + struct ext3_group_desc * desc; @@ -841,11 +922,11 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + mb_debug("put buddy for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + memset(data, 0xff, blocksize); -+ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0; -+ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0, ++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; ++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, + sizeof(unsigned short)*(sb->s_blocksize_bits+2)); + ext3_mb_generate_buddy(sb, data, bitmap, -+ EXT3_SB(sb)->s_group_info[group]); ++ EXT3_GROUP_INFO(sb, group)); + } else { + /* this is block of bitmap */ + mb_debug("put bitmap for group %u in page %lu/%x\n", @@ -878,7 +959,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + + e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_info = sbi->s_group_info[group]; ++ e3b->bd_info = EXT3_GROUP_INFO(sb, group); + e3b->bd_sb = sb; + e3b->bd_group = group; + e3b->bd_buddy_page = NULL; @@ -954,14 +1035,14 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +ext3_lock_group(struct super_block *sb, int group) +{ + 
bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++ &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static inline void +ext3_unlock_group(struct super_block *sb, int group) +{ + bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++ &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) @@ -1316,7 +1397,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); -+ ++ + if (max > 0) { + ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); @@ -1343,12 +1424,12 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, -+ ac->ac_g_ex.fe_len, &ex); ++ ac->ac_g_ex.fe_len, &ex); + + if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { + unsigned long start; + start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + -+ ex.fe_start + le32_to_cpu(es->s_first_data_block)); ++ ex.fe_start + le32_to_cpu(es->s_first_data_block)); + if (start % sbi->s_stripe == 0) { + ac->ac_found++; + ac->ac_b_ex = ex; @@ -1461,7 +1542,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + * we try to find stripe-aligned chunks for stripe-size requests + */ +static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) ++ struct ext3_buddy *e3b) +{ + struct super_block *sb = ac->ac_sb; + struct ext3_sb_info *sbi = EXT3_SB(sb); @@ -1495,8 +1576,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +static int ext3_mb_good_group(struct ext3_allocation_context *ac, + int group, int cr) +{ -+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); -+ struct ext3_group_info *grp = sbi->s_group_info[group]; ++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group); + unsigned free, fragments, i, bits; + + J_ASSERT(cr >= 0 && cr < 4); @@ -1627,7 +1707,7 @@ 
Index: linux-2.6.9-full/fs/ext3/mballoc.c + + if (*len == 1 && sbi->s_stripe) { + /* looks like a metadata, let's use a dirty hack for raid5 -+ * move all metadata in first groups in hope to hit cached ++ * move all metadata in first groups in hope to hit cached + * sectors and thus avoid read-modify cycles in raid5 */ + ac.ac_g_ex.fe_group = group = 0; + } @@ -1656,7 +1736,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + if (group == EXT3_SB(sb)->s_groups_count) + group = 0; + -+ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) { ++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { + /* we need full data about the group + * to make a good selection */ + err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); @@ -1699,7 +1779,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + } + + if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && -+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { + /* + * We've been searching too long. Let's try to allocate + * the best chunk we've found so far @@ -1744,8 +1824,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + sbi->s_blocks_reserved, ac.ac_found); + printk("EXT3-fs: groups: "); + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ printk("%d: %d ", i, -+ sbi->s_group_info[i]->bb_free); ++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free); + printk("\n"); +#endif + goto out; @@ -1783,7 +1862,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + *errp = -EIO; + goto out_err; + } -+ ++ + err = ext3_journal_get_write_access(handle, gdp_bh); + if (err) + goto out_err; @@ -1852,7 +1931,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + * path only, here is single block always */ + ext3_mb_release_blocks(sb, 1); + } -+ ++ + if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { + atomic_inc(&sbi->s_bal_reqs); + atomic_add(*len, &sbi->s_bal_allocated); @@ -1977,7 +2056,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + s->max = sbi->s_mb_history_max; + s->start = sbi->s_mb_history_cur % s->max; 
+ spin_unlock(&sbi->s_mb_history_lock); -+ ++ + rc = seq_open(file, &ext3_mb_seq_history_ops); + if (rc == 0) { + struct seq_file *m = (struct seq_file *)file->private_data; @@ -2001,10 +2080,10 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + +static struct file_operations ext3_mb_seq_history_fops = { + .owner = THIS_MODULE, -+ .open = ext3_mb_seq_history_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = ext3_mb_seq_history_release, ++ .open = ext3_mb_seq_history_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = ext3_mb_seq_history_release, +}; + +static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) @@ -2053,7 +2132,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + sizeof(struct ext3_group_info); + ext3_lock_group(sb, group); -+ memcpy(&sg, sbi->s_group_info[group], i); ++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); + ext3_unlock_group(sb, group); + + if (EXT3_MB_GRP_NEED_INIT(&sg.info)) @@ -2096,10 +2175,10 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + +static struct file_operations ext3_mb_seq_groups_fops = { + .owner = THIS_MODULE, -+ .open = ext3_mb_seq_groups_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, ++ .open = ext3_mb_seq_groups_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, +}; + +static void ext3_mb_history_release(struct super_block *sb) @@ -2186,21 +2265,40 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +int ext3_mb_init_backend(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, len; -+ -+ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; -+ sbi->s_group_info = kmalloc(len, GFP_KERNEL); ++ int i, j, len, metalen; ++ int num_meta_group_infos = ++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ struct ext3_group_info **meta_group_info; ++ ++ /* An 8TB filesystem with 64-bit pointers 
requires a 4096 byte ++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. ++ * So a two level scheme suffices for now. */ ++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * ++ num_meta_group_infos, GFP_KERNEL); + if (sbi->s_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n"); ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n"); + return -ENOMEM; + } -+ memset(sbi->s_group_info, 0, len); -+ + sbi->s_buddy_cache = new_inode(sb); + if (sbi->s_buddy_cache == NULL) { + printk(KERN_ERR "EXT3-fs: can't get new inode\n"); -+ kfree(sbi->s_group_info); -+ return -ENOMEM; ++ goto err_freesgi; ++ } ++ ++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) { ++ if ((i + 1) == num_meta_group_infos) ++ metalen = sizeof(*meta_group_info) * ++ (sbi->s_groups_count - ++ (i << EXT3_DESC_PER_BLOCK_BITS(sb))); ++ meta_group_info = kmalloc(metalen, GFP_KERNEL); ++ if (meta_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a " ++ "buddy group\n"); ++ goto err_freemeta; ++ } ++ sbi->s_group_info[i] = meta_group_info; + } + + /* @@ -2212,30 +2310,42 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext3_group_desc * desc; + -+ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL); -+ if (sbi->s_group_info[i] == NULL) { ++ meta_group_info = ++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)]; ++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1); ++ ++ meta_group_info[j] = kmalloc(len, GFP_KERNEL); ++ if (meta_group_info[j] == NULL) { + printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); -+ goto err_out; ++ i--; ++ goto err_freebuddy; + } + desc = ext3_get_group_desc(sb, i, NULL); + if (desc == NULL) { + printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i); -+ goto err_out; ++ goto err_freebuddy; + } -+ memset(sbi->s_group_info[i], 0, len); ++ memset(meta_group_info[j], 0, len); + 
set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, -+ &sbi->s_group_info[i]->bb_state); -+ sbi->s_group_info[i]->bb_free = ++ &meta_group_info[j]->bb_state); ++ meta_group_info[j]->bb_free = + le16_to_cpu(desc->bg_free_blocks_count); + } + + return 0; + -+err_out: ++err_freebuddy: ++ while (i >= 0) { ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ i--; ++ } ++ i = num_meta_group_infos; ++err_freemeta: + while (--i >= 0) + kfree(sbi->s_group_info[i]); + iput(sbi->s_buddy_cache); -+ ++err_freesgi: ++ kfree(sbi->s_group_info); + return -ENOMEM; +} + @@ -2277,7 +2387,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + max = max >> 1; + i++; + } while (i <= sb->s_blocksize_bits + 1); -+ ++ + + /* init file for buddy data */ + if ((i = ext3_mb_init_backend(sb))) { @@ -2314,8 +2424,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +int ext3_mb_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i; -+ ++ int i, num_meta_group_infos; ++ + if (!test_opt(sb, MBALLOC)) + return 0; + @@ -2329,11 +2439,13 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + ext3_mb_free_committed_blocks(sb); + + if (sbi->s_group_info) { -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ if (sbi->s_group_info[i] == NULL) -+ continue; ++ for (i = 0; i < sbi->s_groups_count; i++) ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ num_meta_group_infos = (sbi->s_groups_count + ++ EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) + kfree(sbi->s_group_info[i]); -+ } + kfree(sbi->s_group_info); + } + if (sbi->s_mb_offsets) @@ -2627,7 +2739,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_mod(&sbi->s_freeblocks_counter, count); -+ ++ + ext3_mb_release_desc(&e3b); + + *freed = count; @@ -2710,11 +2822,11 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + return; +} + -+#define EXT3_ROOT "ext3" -+#define EXT3_MB_STATS_NAME "mb_stats" ++#define 
EXT3_ROOT "ext3" ++#define EXT3_MB_STATS_NAME "mb_stats" +#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" +#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" -+#define EXT3_MB_ORDER2_REQ "mb_order2_req" ++#define EXT3_MB_ORDER2_REQ "mb_order2_req" + +static int ext3_mb_stats_read(char *page, char **start, off_t off, + int count, int *eof, void *data) @@ -2955,90 +3067,16 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); +} -Index: linux-2.6.9-full/fs/ext3/balloc.c -=================================================================== ---- linux-2.6.9-full.orig/fs/ext3/balloc.c 2006-03-10 18:20:03.000000000 +0300 -+++ linux-2.6.9-full/fs/ext3/balloc.c 2006-05-22 21:44:37.000000000 +0400 -@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ - * - * Return buffer_head on success or NULL in case of failure. - */ --static struct buffer_head * -+struct buffer_head * - read_block_bitmap(struct super_block *sb, unsigned int block_group) - { - struct ext3_group_desc * desc; -@@ -451,24 +451,6 @@ error_return: - return; - } - --/* Free given blocks, update quota and i_blocks field */ --void ext3_free_blocks(handle_t *handle, struct inode *inode, -- unsigned long block, unsigned long count) --{ -- struct super_block * sb; -- int dquot_freed_blocks; -- -- sb = inode->i_sb; -- if (!sb) { -- printk ("ext3_free_blocks: nonexistent device"); -- return; -- } -- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); -- if (dquot_freed_blocks) -- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); -- return; --} -- - /* - * For ext3 allocations, we must not reuse any blocks which are - * allocated in the bitmap buffer's "last committed data" copy. This -@@ -1131,7 +1113,7 @@ int ext3_should_retry_alloc(struct super - * bitmap, and then for any free bit if that fails. - * This function also updates quota and i_blocks field. 
- */ --int ext3_new_block(handle_t *handle, struct inode *inode, -+int ext3_new_block_old(handle_t *handle, struct inode *inode, - unsigned long goal, int *errp) - { - struct buffer_head *bitmap_bh = NULL; -Index: linux-2.6.9-full/fs/ext3/inode.c +Index: linux-stage/fs/ext3/Makefile =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/inode.c 2006-05-18 23:57:04.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/inode.c 2006-05-22 21:44:37.000000000 +0400 -@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h - ext3_journal_forget(handle, branch[i].bh); - } - for (i = 0; i < keys; i++) -- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); -+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); - return err; - } - -@@ -673,7 +673,7 @@ err_out: - if (err == -EAGAIN) - for (i = 0; i < num; i++) - ext3_free_blocks(handle, inode, -- le32_to_cpu(where[i].key), 1); -+ le32_to_cpu(where[i].key), 1, 1); - return err; - } - -@@ -1831,7 +1831,7 @@ ext3_clear_blocks(handle_t *handle, stru - } - } +--- linux-stage.orig/fs/ext3/Makefile 2006-05-25 10:36:04.000000000 -0600 ++++ linux-stage/fs/ext3/Makefile 2006-05-25 10:36:04.000000000 -0600 +@@ -6,7 +6,7 @@ -- ext3_free_blocks(handle, inode, block_to_free, count); -+ ext3_free_blocks(handle, inode, block_to_free, count, 1); - } - - /** -@@ -2004,7 +2004,7 @@ static void ext3_free_branches(handle_t - ext3_journal_test_restart(handle, inode); - } - -- ext3_free_blocks(handle, inode, nr, 1); -+ ext3_free_blocks(handle, inode, nr, 1, 1); + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o \ +- extents.o ++ extents.o mballoc.o - if (parent_bh) { - /* + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff --git a/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.12.patch 
b/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.12.patch new file mode 100644 index 0000000..ef0f4a4 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.12.patch @@ -0,0 +1,64 @@ +Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem +From: Mingming Cao + + +If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e. +CONFIG_LBD not defined in the kernel), the calculation of the disk sector +will overflow. Add check at ext3_fill_super() and ext3_group_extend() to +prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4 +bytes. + +Verified this patch on a 32 bit platform without CONFIG_LBD defined +(sector_t is 32 bits long), mount refuse to mount a 10TB ext3. + +Signed-off-by: Mingming Cao +Acked-by: Andreas Dilger +Signed-off-by: Andrew Morton +--- + + fs/ext3/resize.c | 10 ++++++++++ + fs/ext3/super.c | 10 ++++++++++ + 2 files changed, 20 insertions(+) + +diff -puN fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/resize.c +--- devel/fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700 ++++ devel-akpm/fs/ext3/resize.c 2006-05-22 14:10:56.000000000 -0700 +@@ -926,6 +926,16 @@ int ext3_group_extend(struct super_block + if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) + return 0; + ++ if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { ++ printk(KERN_ERR "EXT3-fs: filesystem on %s: " ++ "too large to resize to %lu blocks safely\n", ++ sb->s_id, n_blocks_count); ++ if (sizeof(sector_t) < 8) ++ ext3_warning(sb, __FUNCTION__, ++ "CONFIG_LBD not enabled\n"); ++ return -EINVAL; ++ } ++ + if (n_blocks_count < o_blocks_count) { + ext3_warning(sb, __FUNCTION__, + "can't shrink FS - resize aborted"); +diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c +--- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 
14:09:53.000000000 -0700 ++++ devel-akpm/fs/ext3/super.c 2006-05-22 14:11:10.000000000 -0700 +@@ -1565,6 +1565,17 @@ static int ext3_fill_super (struct super + goto failed_mount; + } + ++ if (le32_to_cpu(es->s_blocks_count) > ++ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { ++ printk(KERN_ERR "EXT3-fs: filesystem on %s: " ++ "too large to mount safely - %u blocks\n", sb->s_id, ++ le32_to_cpu(es->s_blocks_count)); ++ if (sizeof(sector_t) < 8) ++ printk(KERN_WARNING ++ "EXT3-fs: CONFIG_LBD not enabled\n"); ++ goto failed_mount; ++ } ++ + if (EXT3_BLOCKS_PER_GROUP(sb) == 0) + goto cantfind_ext3; + sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - +_ diff --git a/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.5-suse.patch b/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.5-suse.patch new file mode 100644 index 0000000..fe655da --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.5-suse.patch @@ -0,0 +1,44 @@ +Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem +From: Mingming Cao + + +If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e. +CONFIG_LBD not defined in the kernel), the calculation of the disk sector +will overflow. Add check at ext3_fill_super() and ext3_group_extend() to +prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4 +bytes. + +Verified this patch on a 32 bit platform without CONFIG_LBD defined +(sector_t is 32 bits long), mount refuse to mount a 10TB ext3. 
+ +Signed-off-by: Mingming Cao +Acked-by: Andreas Dilger +Signed-off-by: Andrew Morton +--- + + fs/ext3/resize.c | 10 ++++++++++ + fs/ext3/super.c | 10 ++++++++++ + 2 files changed, 20 insertions(+) + +diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c +--- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700 ++++ devel-akpm/fs/ext3/super.c 2006-05-22 14:11:10.000000000 -0700 +@@ -1565,6 +1565,17 @@ static int ext3_fill_super (struct super + goto failed_mount; + } + ++ if (le32_to_cpu(es->s_blocks_count) > ++ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { ++ printk(KERN_ERR "EXT3-fs: filesystem on %s: " ++ "too large to mount safely - %u blocks\n", sb->s_id, ++ le32_to_cpu(es->s_blocks_count)); ++ if (sizeof(sector_t) < 8) ++ printk(KERN_WARNING ++ "EXT3-fs: CONFIG_LBD not enabled\n"); ++ goto failed_mount; ++ } ++ + sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) + + EXT3_BLOCKS_PER_GROUP(sb) - 1) / +_ diff --git a/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.9-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.9-rhel4.patch new file mode 100644 index 0000000..9bfdf80 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.9-rhel4.patch @@ -0,0 +1,64 @@ +Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem +From: Mingming Cao + + +If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e. +CONFIG_LBD not defined in the kernel), the calculation of the disk sector +will overflow. Add check at ext3_fill_super() and ext3_group_extend() to +prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4 +bytes. + +Verified this patch on a 32 bit platform without CONFIG_LBD defined +(sector_t is 32 bits long), mount refuse to mount a 10TB ext3. 
+ +Signed-off-by: Mingming Cao +Acked-by: Andreas Dilger +Signed-off-by: Andrew Morton +--- + + fs/ext3/resize.c | 10 ++++++++++ + fs/ext3/super.c | 10 ++++++++++ + 2 files changed, 20 insertions(+) + +diff -puN fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/resize.c +--- devel/fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700 ++++ devel-akpm/fs/ext3/resize.c 2006-05-22 14:10:56.000000000 -0700 +@@ -926,6 +926,16 @@ int ext3_group_extend(struct super_block + if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) + return 0; + ++ if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { ++ printk(KERN_ERR "EXT3-fs: filesystem on %s: " ++ "too large to resize to %lu blocks safely\n", ++ sb->s_id, n_blocks_count); ++ if (sizeof(sector_t) < 8) ++ ext3_warning(sb, __FUNCTION__, ++ "CONFIG_LBD not enabled\n"); ++ return -EINVAL; ++ } ++ + if (n_blocks_count < o_blocks_count) { + ext3_warning(sb, __FUNCTION__, + "can't shrink FS - resize aborted"); +diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c +--- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700 ++++ devel-akpm/fs/ext3/super.c 2006-05-22 14:11:10.000000000 -0700 +@@ -1565,6 +1565,17 @@ static int ext3_fill_super (struct super + goto failed_mount; + } + ++ if (le32_to_cpu(es->s_blocks_count) > ++ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { ++ printk(KERN_ERR "EXT3-fs: filesystem on %s: " ++ "too large to mount safely - %u blocks\n", sb->s_id, ++ le32_to_cpu(es->s_blocks_count)); ++ if (sizeof(sector_t) < 8) ++ printk(KERN_WARNING ++ "EXT3-fs: CONFIG_LBD not enabled\n"); ++ goto failed_mount; ++ } ++ + sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) + + EXT3_BLOCKS_PER_GROUP(sb) - 1) / +_ diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series 
b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series index 3661023..ea1389d 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series @@ -11,3 +11,4 @@ ext3-mballoc2-2.6.9-rhel4.patch ext3-nlinks-2.6.9.patch ext3-ialloc-2.6.patch ext3-lookup-dotdot-2.6.9.patch +ext3-sector_t-overflow-2.6.9-rhel4.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series index efa7700..8fbb715 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series @@ -13,3 +13,4 @@ ext3-rename-reserve-2.6-suse.patch ext3-htree-dot-2.6.5-suse.patch ext3-ialloc-2.6.patch ext3-lookup-dotdot-2.6.9.patch +ext3-sector_t-overflow-2.6.5-suse.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series index b44e35e..53c060b 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series @@ -12,3 +12,4 @@ ext3-remove-cond_resched-calls-2.6.12.patch ext3-htree-dot-2.6.patch ext3-external-journal-2.6.12.patch ext3-lookup-dotdot-2.6.9.patch +ext3-sector_t-overflow-2.6.12.patch diff --git a/lustre/ChangeLog b/lustre/ChangeLog index b630b73..a628800 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -60,15 +60,12 @@ Details : Clients can be started with a list of OSTs that should be tbd Cluster File Systems, Inc. * version 1.4.7 + * Support for kernels: + 2.6.9-34.EL (RHEL 4) + 2.6.5-7.252 (SLES 9) + 2.6.12.6 vanilla (kernel.org) * bug fixes -Severity : enhancement -Bugzilla : 9292 -Description: Getattr by fid -Details : Getting a file attributes by its fid, obtaining UPDATE|LOOKUP - locks, avoids extra getattr rpc requests to MDS, allows '/' to - have locks and avoids getattr rpc requests for it on every stat. 
- Severity : major Frequency : rare Bugzilla : 5719, 9635, 9792, 9684, @@ -304,11 +301,54 @@ Details : In lustre/contrib/ or /usr/share/lustre in RPM a patch for Severity : minor Frequency : Always Bugzilla : 9486 -Description: extended inode attributes work improperly for the case of 2.4/2.6 - kernels used on client/server or the other way around. +Description: extended inode attributes (immutable, append-only) work improperly + when 2.4 and 2.6 kernels are used on client/server or vice versa Details : Introduce kernel-independent values for these flags. +Severity : enhancement +Frequency : Always +Bugzilla : 10248 +Description: Allow fractional MB tunings for lustre in /proc/ filesystem. +Details : Many of the /proc/ tunables can only be tuned at a megabyte + granularity. Now fractional MB granularity is supported, + which is very useful for low-memory systems. + +Severity : enhancement +Bugzilla : 9292 +Description: Getattr by fid +Details : Getting a file attributes by its fid, obtaining UPDATE|LOOKUP + locks, avoids extra getattr rpc requests to MDS, allows '/' to + have locks and avoids getattr rpc requests for it on every stat. + +Severity : major +Frequency : Always, for filesystems larger than 2TB +Bugzilla : 6191 +Description: ldiskfs crash at mount for filesystem larger than 2TB with mballoc +Details : Kernel kmalloc limits allocations to 128kB and this prevents + filesystems larger than 2TB from being mounted with mballoc enabled. + +Severity : critical +Frequency : Always, for 32-bit kernel without CONFIG_LBD and filesystem > 2TB +Bugzilla : 6191 +Description: ldiskfs crash at mount for filesystem larger than 2TB with mballoc +Details : If a 32-bit kernel is compiled without CONFIG_LBD enabled and a + filesystem larger than 2TB is mounted then the kernel will + silently corrupt the start of the filesystem. CONFIG_LBD is + enabled for all CFS-supported kernels, but the possibility of + this happening with a modified kernel config exists.
+Severity : enhancement +Bugzilla : 10462 +Description: add client O_DIRECT support for 2.6 kernels +Details : It is now possible to do O_DIRECT reads and writes to files + in the Lustre client mountpoint on 2.6 kernel clients. + +Severity : enhancement +Bugzilla : 10446 +Description: parallel glimpse, setattr, statfs, punch, destroy requests +Details : Sends glimpse, setattr, statfs, punch, destroy requests to OSTs in + parallel, not waiting for response from every OST before sending + a rpc to the next OST. ------------------------------------------------------------------------------ 02-14-2006 Cluster File Systems, Inc. @@ -320,9 +360,9 @@ Details : Introduce kernel-independent values for these flags. this release. See https://bugzilla.clusterfs.com/show_bug.cgi?id=10052 for details. * bug fixes - * Support for newer kernels: - 2.6.9-22.0.2.EL (RHEL 4), - 2.6.5-7.244 (SLES 9) - same as 1.4.5.2. + * Support for kernels: + 2.6.9-22.0.2.EL (RHEL 4) + 2.6.5-7.244 (SLES 9) 2.6.12.6 vanilla (kernel.org) diff --git a/lustre/autoMakefile.am b/lustre/autoMakefile.am index fad4aee..f0531e8 100644 --- a/lustre/autoMakefile.am +++ b/lustre/autoMakefile.am @@ -66,21 +66,4 @@ lustre_build_version: echo "#define LUSTRE_RELEASE @RELEASE@" >> tmpver cmp -s $(BUILD_VER_H) tmpver > tmpdiff 2> /dev/null && \ $(RM) tmpver tmpdiff || \ - mv tmpver $(BUILD_VER_H) - -CSTK=/tmp/checkstack -CSTKO=/tmp/checkstack.orig - -checkstack: - [ -f ${CSTK} -a ! -s ${CSTKO} ] && mv ${CSTK} ${CSTKO} || true - for i in ${SUBDIRS} lnet/klnds/*; do \ - MOD=$$i/`basename $$i`.o; \ - [ -f $$MOD ] && objdump -d $$MOD | perl tests/checkstack.pl; \ - done | sort -nr > ${CSTK} - [ -f ${CSTKO} ] && ! 
diff -u ${CSTKO} ${CSTK} || head -30 ${CSTK} - -checkstack-update: - [ -f ${CSTK} ] && mv ${CSTK} ${CSTKO} - -checkstack-clean: - rm -f ${CSTK} ${CSTKO} + mv -f tmpver $(BUILD_VER_H) diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 index dae895f..f1d257b 100644 --- a/lustre/autoconf/lustre-core.m4 +++ b/lustre/autoconf/lustre-core.m4 @@ -641,6 +641,9 @@ AC_CHECK_FUNCS([inet_ntoa]) # llite/xattr.c AC_CHECK_HEADERS([linux/xattr_acl.h]) +# utils/llverfs.c +AC_CHECK_HEADERS([ext2fs/ext2fs.h]) + # use universal lustre headers # i.e: include/obd.h instead of include/linux/obd.h AC_CHECK_FILE($PWD/lustre/include/obd.h, [AC_DEFINE(UNIV_LUSTRE_HEADERS, 1, [Use universal lustre headers])]) @@ -671,7 +674,7 @@ AM_CONDITIONAL(CLIENT, test x$enable_client = xyes) AM_CONDITIONAL(SERVER, test x$enable_server = xyes) AM_CONDITIONAL(QUOTA, test x$enable_quota = xyes) AM_CONDITIONAL(BLKID, test x$ac_cv_header_blkid_blkid_h = xyes) -AM_CONDITIONAL(EXT2FS, test x$ac_cv_header_ext2fs_ext2fs_h = xyes) +AM_CONDITIONAL(EXT2FS_DEVEL, test x$ac_cv_header_ext2fs_ext2fs_h = xyes) ]) # diff --git a/lustre/doc/lfs.1 b/lustre/doc/lfs.1 index b31f5ff..2601a64 100644 --- a/lustre/doc/lfs.1 +++ b/lustre/doc/lfs.1 @@ -26,6 +26,8 @@ lfs \- Lustre utility to create a file with specific striping pattern, find the .B lfs setstripe .br .B lfs check +.br +.B lfs df [-i] [-h] [path] .SH DESCRIPTION .B lfs can be used to create a new file with a specific striping pattern, determine the default striping pattern, gather the extended attributes (object numbers and @@ -67,6 +69,9 @@ Display the status of MDS or OSTs (as specified in the command) or all the serve .B osts List all the OSTs for the filesystem .TP +.B df +Report filesystem disk space usage or inodes usage of each MDS/OSD. 
+.TP .B help Provides brief help on the various arguments .TP @@ -114,5 +119,8 @@ Check the status of all servers(mds, osts) .TP .B $lfs osts List all the OSTs +.TP +.B $lfs df -i +Lists inode consumption per OST and MDS .SH BUGS None are known. diff --git a/lustre/doc/lfs.lyx b/lustre/doc/lfs.lyx index 8e08a86..d9e4889 100644 --- a/lustre/doc/lfs.lyx +++ b/lustre/doc/lfs.lyx @@ -108,6 +108,16 @@ setquota [-u|-g] \series bold lfs\SpecialChar ~ quota [-o obd_uuid] [-u|-g] +\layout Standard + +\series bold +lfs\SpecialChar ~ +df [-i] [-h] [path] +\layout Standard + +\series bold +lfs\SpecialChar ~ +help \layout Subsection DESCRIPTION @@ -215,6 +225,12 @@ quota \layout List \labelwidthstring 00.00.0000 +\series bold +df +\series default + Report filesystem disk space usage or inodes usage of each MDS/OSD. +\layout List +\labelwidthstring 00.00.0000 \series bold help diff --git a/lustre/doc/lmc.1 b/lustre/doc/lmc.1 index d755de8..9197ead 100644 --- a/lustre/doc/lmc.1 +++ b/lustre/doc/lmc.1 @@ -207,7 +207,7 @@ Optional argument to mount fs. Mount options will be passed by this argument. Fo Optional arguement to specify the journal size for the ext3 file system. The size should be in the units expected by mkfs, so for ext3 it should be in MB. If this is option is not used, the ext3 filesystem will be configured with a journal size dependent upon how large the filesystem is. .PP .B --add mtpt -Creates a mount-point on the specified node. Either an LOV or OSC name can be used. +Creates a mount-point on the specified node for the given LOV. .TP --node node Node that will use the mtpt. diff --git a/lustre/doc/lmc.lyx b/lustre/doc/lmc.lyx index e42e64b..48fcb4b 100644 --- a/lustre/doc/lmc.lyx +++ b/lustre/doc/lmc.lyx @@ -504,8 +504,7 @@ mkfs \layout Description --add\SpecialChar ~ -mtpt Creates a mount-point on the specified node. - Either an LOV or OSC name can be used. +mtpt Creates a mount-point on the specified node for the given LOV.
\begin_deeper \layout Description diff --git a/lustre/include/linux/lustre_fsfilt.h b/lustre/include/linux/lustre_fsfilt.h index 84e9af9..ce99320 100644 --- a/lustre/include/linux/lustre_fsfilt.h +++ b/lustre/include/linux/lustre_fsfilt.h @@ -154,16 +154,19 @@ static inline __u8 *fsfilt_uuid(struct obd_device *obd, struct super_block *sb) #define FSFILT_OP_JOIN 11 #define FSFILT_OP_NOOP 15 -#define fsfilt_check_slow(start, timeout, msg) \ +#define fsfilt_check_slow(obd, start, timeout, msg) \ do { \ if (time_before(jiffies, start + 15 * HZ)) \ break; \ else if (time_before(jiffies, start + 30 * HZ)) \ - CDEBUG(D_VFSTRACE,"slow %s %lus\n", msg,(jiffies-start)/HZ);\ + CDEBUG(D_VFSTRACE, "%s: slow %s %lus\n", obd->obd_name, \ + msg, (jiffies-start) / HZ); \ else if (time_before(jiffies, start + timeout / 2 * HZ)) \ - CWARN("slow %s %lus\n", msg, (jiffies - start) / HZ); \ + CWARN("%s: slow %s %lus\n", obd->obd_name, msg, \ + (jiffies - start) / HZ); \ else \ - CERROR("slow %s %lus\n", msg, (jiffies - start) / HZ); \ + CERROR("%s: slow %s %lus\n", obd->obd_name, msg, \ + (jiffies - start) / HZ); \ } while (0) static inline void *fsfilt_start_log(struct obd_device *obd, @@ -189,7 +192,7 @@ static inline void *fsfilt_start_log(struct obd_device *obd, LBUG(); } } - fsfilt_check_slow(now, obd_timeout, "journal start"); + fsfilt_check_slow(obd, now, obd_timeout, "journal start"); return handle; } @@ -224,7 +227,7 @@ static inline void *fsfilt_brw_start_log(struct obd_device *obd, int objcount, LBUG(); } } - fsfilt_check_slow(now, obd_timeout, "journal start"); + fsfilt_check_slow(obd, now, obd_timeout, "journal start"); return handle; } @@ -244,7 +247,7 @@ static inline int fsfilt_commit(struct obd_device *obd, struct inode *inode, int rc = obd->obd_fsops->fs_commit(inode, handle, force_sync); CDEBUG(D_INFO, "committing handle %p\n", handle); - fsfilt_check_slow(now, obd_timeout, "journal start"); + fsfilt_check_slow(obd, now, obd_timeout, "journal start"); return rc; } @@ 
-257,7 +260,7 @@ static inline int fsfilt_commit_async(struct obd_device *obd, int rc = obd->obd_fsops->fs_commit_async(inode, handle, wait_handle); CDEBUG(D_INFO, "committing handle %p (async)\n", *wait_handle); - fsfilt_check_slow(now, obd_timeout, "journal start"); + fsfilt_check_slow(obd, now, obd_timeout, "journal start"); return rc; } @@ -268,7 +271,7 @@ static inline int fsfilt_commit_wait(struct obd_device *obd, unsigned long now = jiffies; int rc = obd->obd_fsops->fs_commit_wait(inode, handle); CDEBUG(D_INFO, "waiting for completion %p\n", handle); - fsfilt_check_slow(now, obd_timeout, "journal start"); + fsfilt_check_slow(obd, now, obd_timeout, "journal start"); return rc; } @@ -278,7 +281,7 @@ static inline int fsfilt_setattr(struct obd_device *obd, struct dentry *dentry, unsigned long now = jiffies; int rc; rc = obd->obd_fsops->fs_setattr(dentry, handle, iattr, do_trunc); - fsfilt_check_slow(now, obd_timeout, "setattr"); + fsfilt_check_slow(obd, now, obd_timeout, "setattr"); return rc; } diff --git a/lustre/include/linux/lustre_types.h b/lustre/include/linux/lustre_types.h index f99051b..99890fc 100644 --- a/lustre/include/linux/lustre_types.h +++ b/lustre/include/linux/lustre_types.h @@ -18,8 +18,8 @@ #endif #endif -#if (!defined(_LINUX_TYPES_H) && !defined(_BLKID_TYPES_H) && \ - !defined(_EXT2_TYPES_H) && !defined(_I386_TYPES_H)) && \ +#if !defined(_LINUX_TYPES_H) && !defined(_BLKID_TYPES_H) && \ + !defined(_EXT2_TYPES_H) && !defined(_I386_TYPES_H) && \ !defined(_ASM_IA64_TYPES_H) && !defined(_X86_64_TYPES_H) && \ !defined(_PPC_TYPES_H) && !defined(_PPC64_TYPES_H) /* yuck, would be nicer with _ASM_TYPES_H */ diff --git a/lustre/include/lprocfs_status.h b/lustre/include/lprocfs_status.h index c6b8005..c34ea2d 100644 --- a/lustre/include/lprocfs_status.h +++ b/lustre/include/lprocfs_status.h @@ -248,8 +248,14 @@ extern int lprocfs_rd_filegroups(char *page, char **start, off_t off, extern int lprocfs_write_helper(const char *buffer, unsigned long count, 
int *val); +extern int lprocfs_write_frac_helper(const char *buffer, unsigned long count, + int *val, int mult); +extern int lprocfs_read_frac_helper(char *buffer, unsigned long count, + long val, int mult); extern int lprocfs_write_u64_helper(const char *buffer, unsigned long count, __u64 *val); +extern int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count, + __u64 *val, int mult); int lprocfs_obd_seq_create(struct obd_device *dev, char *name, mode_t mode, struct file_operations *seq_fops, void *data); void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value); diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h index 9298f31..c08a379 100644 --- a/lustre/include/lustre_dlm.h +++ b/lustre/include/lustre_dlm.h @@ -549,30 +549,34 @@ int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, void *data, int flag); int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp); int ldlm_completion_ast(struct ldlm_lock *lock, int flags, void *data); -int ldlm_cli_enqueue(struct obd_export *exp, - struct ptlrpc_request *req, - struct ldlm_namespace *ns, - struct ldlm_res_id, - ldlm_type_t type, - ldlm_policy_data_t *, - ldlm_mode_t mode, - int *flags, +int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **req, + struct ldlm_res_id res_id, ldlm_type_t type, + ldlm_policy_data_t *policy, ldlm_mode_t mode, int *flags, ldlm_blocking_callback blocking, ldlm_completion_callback completion, ldlm_glimpse_callback glimpse, - void *data, - void *lvb, - __u32 lvb_len, - void *lvb_swabber, - struct lustre_handle *lockh); + void *data, void *lvb, __u32 lvb_len, void *lvb_swabber, + struct lustre_handle *lockh, int async); +int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, + ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode, + int *flags, void *lvb, __u32 lvb_len, + void *lvb_swabber, struct lustre_handle *lockh, + int rc); +int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, 
struct ldlm_res_id res_id, + ldlm_type_t type, ldlm_policy_data_t *policy, + ldlm_mode_t mode, int *flags, + ldlm_blocking_callback blocking, + ldlm_completion_callback completion, + ldlm_glimpse_callback glimpse, + void *data, __u32 lvb_len, void *lvb_swabber, + struct lustre_handle *lockh); int ldlm_server_ast(struct lustre_handle *lockh, struct ldlm_lock_desc *new, void *data, __u32 data_len); int ldlm_cli_convert(struct lustre_handle *, int new_mode, int *flags); int ldlm_cli_cancel(struct lustre_handle *lockh); int ldlm_cli_cancel_unused(struct ldlm_namespace *, struct ldlm_res_id *, int flags, void *opaque); -int ldlm_cli_join_lru(struct ldlm_namespace *, struct ldlm_res_id *, - int join); +int ldlm_cli_join_lru(struct ldlm_namespace *, struct ldlm_res_id *, int join); /* mds/handler.c */ /* This has to be here because recursive inclusion sucks. */ diff --git a/lustre/include/obd.h b/lustre/include/obd.h index a3df3ad..ac22989 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -125,6 +125,50 @@ struct lov_stripe_md { #define lsm_pattern lsm_wire.lw_pattern #define lsm_stripe_count lsm_wire.lw_stripe_count +struct obd_info; + +typedef int (*obd_enqueue_update_f)(struct obd_info *oinfo, int rc); + +/* obd_enqueue parameters common for all levels (lov, osc). */ +struct obd_enqueue_info { + /* Flags used while lock handling. */ + int ei_flags; + /* Type of the lock being enqueued. */ + __u32 ei_type; + /* Mode of the lock being enqueued. */ + __u32 ei_mode; + /* Different callbacks for lock handling (blocking, completion, + glimpse */ + void *ei_cb_bl; + void *ei_cb_cp; + void *ei_cb_gl; + /* Data to be passed into callbacks. */ + void *ei_cbdata; + /* Request set for OSC async requests. */ + struct ptlrpc_request_set *ei_rqset; +}; + +/* obd info for a particular level (lov, osc). */ +struct obd_info { + /* Lock policy. It keeps an extent which is specific for a particular + * OSC. (e.g. 
lov_prep_enqueue_set initialises extent of the policy, + * and osc_enqueue passes it into ldlm_lock_match & ldlm_cli_enqueue. */ + ldlm_policy_data_t oi_policy; + /* Lock handle specific for every OSC lock. */ + struct lustre_handle *oi_lockh; + /* lsm data specific for every OSC. */ + struct lov_stripe_md *oi_md; + /* obdo data specific for every OSC, if needed at all. */ + struct obdo *oi_oa; + /* statfs data specific for every OSC, if needed at all. */ + struct obd_statfs *oi_osfs; + /* An update callback which is called to update some data on upper + * level. E.g. it is used for update lsm->lsm_oinfo at every recieved + * request in osc level for enqueue requests. It is also possible to + * update some caller data from LOV layer if needed. */ + obd_enqueue_update_f oi_cb_up; +}; + /* compare all relevant fields. */ static inline int lov_stripe_md_cmp(struct lov_stripe_md *m1, struct lov_stripe_md *m2) @@ -805,6 +849,9 @@ struct obd_ops { int (*o_statfs)(struct obd_device *obd, struct obd_statfs *osfs, cfs_time_t max_age); + int (*o_statfs_async)(struct obd_device *obd, struct obd_info *oinfo, + unsigned long max_age, + struct ptlrpc_request_set *set); int (*o_packmd)(struct obd_export *exp, struct lov_mds_md **disk_tgt, struct lov_stripe_md *mem_src); int (*o_unpackmd)(struct obd_export *exp,struct lov_stripe_md **mem_tgt, @@ -818,22 +865,21 @@ struct obd_ops { int (*o_destroy)(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md *ea, struct obd_trans_info *oti, struct obd_export *md_exp); - int (*o_setattr)(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *ea, struct obd_trans_info *oti); - int (*o_setattr_async)(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *ea, struct obd_trans_info *oti); - int (*o_getattr)(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *ea); - int (*o_getattr_async)(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *ea, + int (*o_setattr)(struct obd_export *exp, 
struct obd_info *oinfo, + struct obd_trans_info *oti); + int (*o_setattr_async)(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset); + int (*o_getattr)(struct obd_export *exp, struct obd_info *oinfo); + int (*o_getattr_async)(struct obd_export *exp, struct obd_info *oinfo, struct ptlrpc_request_set *set); - int (*o_brw)(int rw, struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *ea, obd_count oa_bufs, - struct brw_page *pgarr, struct obd_trans_info *oti); - int (*o_brw_async)(int rw, struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *ea, obd_count oa_bufs, - struct brw_page *pgarr, struct ptlrpc_request_set *, - struct obd_trans_info *oti); + int (*o_brw)(int rw, struct obd_export *exp, struct obd_info *oinfo, + obd_count oa_bufs, struct brw_page *pgarr, + struct obd_trans_info *oti); + int (*o_brw_async)(int rw, struct obd_export *exp, + struct obd_info *oinfo, obd_count oa_bufs, + struct brw_page *pgarr, struct obd_trans_info *oti, + struct ptlrpc_request_set *); int (*o_prep_async_page)(struct obd_export *exp, struct lov_stripe_md *lsm, struct lov_oinfo *loi, @@ -866,9 +912,9 @@ struct obd_ops { struct ost_lvb *lvb, int kms_only); int (*o_adjust_kms)(struct obd_export *exp, struct lov_stripe_md *lsm, obd_off size, int shrink); - int (*o_punch)(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *ea, obd_size start, - obd_size end, struct obd_trans_info *oti); + int (*o_punch)(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset); int (*o_sync)(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md *ea, obd_size start, obd_size end); int (*o_migrate)(struct lustre_handle *conn, struct lov_stripe_md *dst, @@ -888,11 +934,8 @@ struct obd_ops { int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_local *local, struct obd_trans_info *oti, int rc); - int (*o_enqueue)(struct obd_export *, 
struct lov_stripe_md *, - __u32 type, ldlm_policy_data_t *, __u32 mode, - int *flags, void *bl_cb, void *cp_cb, void *gl_cb, - void *data, __u32 lvb_len, void *lvb_swabber, - struct lustre_handle *lockh); + int (*o_enqueue)(struct obd_export *, struct obd_info *oinfo, + struct obd_enqueue_info *einfo); int (*o_match)(struct obd_export *, struct lov_stripe_md *, __u32 type, ldlm_policy_data_t *, __u32 mode, int *flags, void *data, struct lustre_handle *lockh); diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h index d78cb10..59edbce 100644 --- a/lustre/include/obd_class.h +++ b/lustre/include/obd_class.h @@ -467,8 +467,7 @@ static inline int obd_destroy(struct obd_export *exp, struct obdo *obdo, RETURN(rc); } -static inline int obd_getattr(struct obd_export *exp, struct obdo *obdo, - struct lov_stripe_md *ea) +static inline int obd_getattr(struct obd_export *exp, struct obd_info *oinfo) { int rc; ENTRY; @@ -476,26 +475,25 @@ static inline int obd_getattr(struct obd_export *exp, struct obdo *obdo, EXP_CHECK_OP(exp, getattr); OBD_COUNTER_INCREMENT(exp->exp_obd, getattr); - rc = OBP(exp->exp_obd, getattr)(exp, obdo, ea); + rc = OBP(exp->exp_obd, getattr)(exp, oinfo); RETURN(rc); } static inline int obd_getattr_async(struct obd_export *exp, - struct obdo *obdo, struct lov_stripe_md *ea, + struct obd_info *oinfo, struct ptlrpc_request_set *set) { int rc; ENTRY; - EXP_CHECK_OP(exp, getattr); - OBD_COUNTER_INCREMENT(exp->exp_obd, getattr); + EXP_CHECK_OP(exp, getattr_async); + OBD_COUNTER_INCREMENT(exp->exp_obd, getattr_async); - rc = OBP(exp->exp_obd, getattr_async)(exp, obdo, ea, set); + rc = OBP(exp->exp_obd, getattr_async)(exp, oinfo, set); RETURN(rc); } -static inline int obd_setattr(struct obd_export *exp, struct obdo *obdo, - struct lov_stripe_md *ea, +static inline int obd_setattr(struct obd_export *exp, struct obd_info *oinfo, struct obd_trans_info *oti) { int rc; @@ -504,22 +502,47 @@ static inline int obd_setattr(struct obd_export *exp, struct 
obdo *obdo, EXP_CHECK_OP(exp, setattr); OBD_COUNTER_INCREMENT(exp->exp_obd, setattr); - rc = OBP(exp->exp_obd, setattr)(exp, obdo, ea, oti); + rc = OBP(exp->exp_obd, setattr)(exp, oinfo, oti); RETURN(rc); } -static inline int obd_setattr_async(struct obd_export *exp, - struct obdo *obdo, - struct lov_stripe_md *ea, +/* This performs all the requests set init/wait/destroy actions. */ +static inline int obd_setattr_rqset(struct obd_export *exp, + struct obd_info *oinfo, struct obd_trans_info *oti) { + struct ptlrpc_request_set *set = NULL; int rc; ENTRY; EXP_CHECK_OP(exp, setattr_async); OBD_COUNTER_INCREMENT(exp->exp_obd, setattr_async); - rc = OBP(exp->exp_obd, setattr_async)(exp, obdo, ea, oti); + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); + + rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + RETURN(rc); +} + +/* This adds all the requests into @set if @set != NULL, otherwise + all requests are sent asynchronously without waiting for response. */ +static inline int obd_setattr_async(struct obd_export *exp, + struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *set) +{ + int rc; + ENTRY; + + EXP_CHECK_OP(exp, setattr_async); + OBD_COUNTER_INCREMENT(exp->exp_obd, setattr_async); + + rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set); RETURN(rc); } @@ -639,6 +662,58 @@ obd_lvfs_fid2dentry(struct obd_export *exp, __u64 id_ino, __u32 gen, __u64 gr) /* @max_age is the oldest time in jiffies that we accept using a cached data. * If the cache is older than @max_age we will get a new value from the * target. Use a value of "jiffies + HZ" to guarantee freshness. 
*/ +static inline int obd_statfs_async(struct obd_device *obd, + struct obd_info *oinfo, + unsigned long max_age, + struct ptlrpc_request_set *rqset) +{ + int rc = 0; + ENTRY; + + if (obd == NULL) + RETURN(-EINVAL); + + OBD_CHECK_OP(obd, statfs, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, statfs); + + CDEBUG(D_SUPER, "osfs %lu, max_age %lu\n", obd->obd_osfs_age, max_age); + if (time_before(obd->obd_osfs_age, max_age)) { + rc = OBP(obd, statfs_async)(obd, oinfo, max_age, rqset); + } else { + CDEBUG(D_SUPER, "using cached obd_statfs data\n"); + spin_lock(&obd->obd_osfs_lock); + memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs)); + spin_unlock(&obd->obd_osfs_lock); + if (oinfo->oi_cb_up) + oinfo->oi_cb_up(oinfo, 0); + } + RETURN(rc); +} + +static inline int obd_statfs_rqset(struct obd_device *obd, + struct obd_statfs *osfs, + unsigned long max_age) +{ + struct ptlrpc_request_set *set = NULL; + struct obd_info oinfo = { { { 0 } } }; + int rc = 0; + ENTRY; + + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); + + oinfo.oi_osfs = osfs; + rc = obd_statfs_async(obd, &oinfo, max_age, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + RETURN(rc); +} + +/* @max_age is the oldest time in jiffies that we accept using a cached data. + * If the cache is older than @max_age we will get a new value from the + * target. Use a value of "jiffies + HZ" to guarantee freshness. 
*/ static inline int obd_statfs(struct obd_device *obd, struct obd_statfs *osfs, cfs_time_t max_age) { @@ -684,22 +759,44 @@ static inline int obd_sync(struct obd_export *exp, struct obdo *oa, RETURN(rc); } -static inline int obd_punch(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *ea, obd_size start, - obd_size end, struct obd_trans_info *oti) +static inline int obd_punch_rqset(struct obd_export *exp, + struct obd_info *oinfo, + struct obd_trans_info *oti) { + struct ptlrpc_request_set *set = NULL; int rc; ENTRY; EXP_CHECK_OP(exp, punch); OBD_COUNTER_INCREMENT(exp->exp_obd, punch); - rc = OBP(exp->exp_obd, punch)(exp, oa, ea, start, end, oti); + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); + + rc = OBP(exp->exp_obd, punch)(exp, oinfo, oti, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); RETURN(rc); } -static inline int obd_brw(int cmd, struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *ea, obd_count oa_bufs, +static inline int obd_punch(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset) +{ + int rc; + ENTRY; + + EXP_CHECK_OP(exp, punch); + OBD_COUNTER_INCREMENT(exp->exp_obd, punch); + + rc = OBP(exp->exp_obd, punch)(exp, oinfo, oti, rqset); + RETURN(rc); +} + +static inline int obd_brw(int cmd, struct obd_export *exp, + struct obd_info *oinfo, obd_count oa_bufs, struct brw_page *pg, struct obd_trans_info *oti) { int rc; @@ -714,15 +811,14 @@ static inline int obd_brw(int cmd, struct obd_export *exp, struct obdo *oa, LBUG(); } - rc = OBP(exp->exp_obd, brw)(cmd, exp, oa, ea, oa_bufs, pg, oti); + rc = OBP(exp->exp_obd, brw)(cmd, exp, oinfo, oa_bufs, pg, oti); RETURN(rc); } static inline int obd_brw_async(int cmd, struct obd_export *exp, - struct obdo *oa, struct lov_stripe_md *ea, - obd_count oa_bufs, struct brw_page *pg, - struct ptlrpc_request_set *set, - struct obd_trans_info *oti) + struct obd_info *oinfo, obd_count 
oa_bufs, + struct brw_page *pg, struct obd_trans_info *oti, + struct ptlrpc_request_set *set) { int rc; ENTRY; @@ -735,8 +831,36 @@ static inline int obd_brw_async(int cmd, struct obd_export *exp, LBUG(); } - rc = OBP(exp->exp_obd, brw_async)(cmd, exp, oa, ea, oa_bufs, pg, set, - oti); + rc = OBP(exp->exp_obd, brw_async)(cmd, exp, oinfo, oa_bufs, pg,oti,set); + RETURN(rc); +} + +static inline int obd_brw_rqset(int cmd, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md *lsm, + obd_count oa_bufs, struct brw_page *pg, + struct obd_trans_info *oti) +{ + struct ptlrpc_request_set *set = NULL; + struct obd_info oinfo = { { { 0 } } }; + int rc = 0; + ENTRY; + + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); + + oinfo.oi_oa = oa; + oinfo.oi_md = lsm; + rc = obd_brw_async(cmd, exp, &oinfo, oa_bufs, pg, oti, set); + if (rc == 0) { + rc = ptlrpc_set_wait(set); + if (rc) + CERROR("error from callback: rc = %d\n", rc); + } else { + CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR, + "error from obd_brw_async: rc = %d\n", rc); + } + ptlrpc_set_destroy(set); RETURN(rc); } @@ -915,11 +1039,32 @@ static inline int obd_iocontrol(unsigned int cmd, struct obd_export *exp, RETURN(rc); } -static inline int obd_enqueue(struct obd_export *exp, struct lov_stripe_md *ea, - __u32 type, ldlm_policy_data_t *policy, - __u32 mode, int *flags, void *bl_cb, void *cp_cb, - void *gl_cb, void *data, __u32 lvb_len, - void *lvb_swabber, struct lustre_handle *lockh) +static inline int obd_enqueue_rqset(struct obd_export *exp, + struct obd_info *oinfo, + struct obd_enqueue_info *einfo) +{ + int rc; + ENTRY; + + EXP_CHECK_OP(exp, enqueue); + OBD_COUNTER_INCREMENT(exp->exp_obd, enqueue); + + einfo->ei_rqset = ptlrpc_prep_set(); + if (einfo->ei_rqset == NULL) + RETURN(-ENOMEM); + + rc = OBP(exp->exp_obd, enqueue)(exp, oinfo, einfo); + if (rc == 0) + rc = ptlrpc_set_wait(einfo->ei_rqset); + ptlrpc_set_destroy(einfo->ei_rqset); + einfo->ei_rqset = NULL; + + RETURN(rc); +} + +static 
inline int obd_enqueue(struct obd_export *exp, + struct obd_info *oinfo, + struct obd_enqueue_info *einfo) { int rc; ENTRY; @@ -927,9 +1072,7 @@ static inline int obd_enqueue(struct obd_export *exp, struct lov_stripe_md *ea, EXP_CHECK_OP(exp, enqueue); OBD_COUNTER_INCREMENT(exp->exp_obd, enqueue); - rc = OBP(exp->exp_obd, enqueue)(exp, ea, type, policy, mode, flags, - bl_cb, cp_cb, gl_cb, data, lvb_len, - lvb_swabber, lockh); + rc = OBP(exp->exp_obd, enqueue)(exp, oinfo, einfo); RETURN(rc); } diff --git a/lustre/include/obd_ost.h b/lustre/include/obd_ost.h index deb963b..48526ad 100644 --- a/lustre/include/obd_ost.h +++ b/lustre/include/obd_ost.h @@ -23,8 +23,14 @@ struct osc_brw_async_args { struct list_head aa_oaps; }; -struct osc_getattr_async_args { - struct obdo *aa_oa; +struct osc_async_args { + struct obd_info *aa_oi; +}; + +struct osc_enqueue_args { + struct obd_export *oa_exp; + struct obd_info *oa_oi; + struct obd_enqueue_info *oa_ei; }; #endif diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.21-chaos.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.21-chaos.patch index 0d9a5b8..8f4c675 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.4.21-chaos.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.21-chaos.patch @@ -2478,12 +2478,14 @@ Index: linux-2.4.21-rhel/fs/ext3/super.c ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); if (!(sb->s_flags & MS_RDONLY)) { -@@ -755,6 +756,10 @@ +@@ -755,6 +756,12 @@ return 0; } } + else if (!strcmp (this_char, "extents")) + set_opt (*mount_options, EXTENTS); ++ else if (!strcmp (this_char, "noextents")) ++ clear_opt (*mount_options, EXTENTS); + else if (!strcmp (this_char, "extdebug")) + set_opt (*mount_options, EXTDEBUG); else if (!strcmp (this_char, "grpid") || diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.21-suse2.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.21-suse2.patch index 374bae9..1031470 100644 --- 
a/lustre/kernel_patches/patches/ext3-extents-2.4.21-suse2.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.21-suse2.patch @@ -2477,12 +2477,14 @@ Index: linux-2.4.21-suse2/fs/ext3/super.c ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); if (!(sb->s_flags & MS_RDONLY)) { -@@ -733,6 +734,10 @@ +@@ -733,6 +734,12 @@ return 0; } } + else if (!strcmp (this_char, "extents")) + set_opt (*mount_options, EXTENTS); ++ else if (!strcmp (this_char, "noextents")) ++ clear_opt (*mount_options, EXTENTS); + else if (!strcmp (this_char, "extdebug")) + set_opt (*mount_options, EXTDEBUG); else if (!strcmp (this_char, "grpid") || diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.24.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.24.patch index 8d4de9c..9ee2417 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.4.24.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.24.patch @@ -2465,12 +2465,14 @@ Index: linux-2.4.24/fs/ext3/super.c ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); if (!(sb->s_flags & MS_RDONLY)) { -@@ -704,6 +705,10 @@ +@@ -704,6 +705,12 @@ return 0; } } + else if (!strcmp (this_char, "extents")) + set_opt (*mount_options, EXTENTS); ++ else if (!strcmp (this_char, "noextents")) ++ clear_opt (*mount_options, EXTENTS); + else if (!strcmp (this_char, "extdebug")) + set_opt (*mount_options, EXTDEBUG); else if (!strcmp (this_char, "grpid") || diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.29.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.29.patch index 84b9a12..786ccd6 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.4.29.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.29.patch @@ -2465,12 +2465,14 @@ Index: linux-2.4.29/fs/ext3/super.c ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); if (!(sb->s_flags & MS_RDONLY)) { -@@ -702,6 +703,10 @@ +@@ -702,6 +703,12 @@ return 0; } } + else if (!strcmp (this_char, "extents")) + set_opt (*mount_options, EXTENTS); ++ else 
if (!strcmp (this_char, "noextents")) ++ clear_opt (*mount_options, EXTENTS); + else if (!strcmp (this_char, "extdebug")) + set_opt (*mount_options, EXTDEBUG); else if (!strcmp (this_char, "grpid") || diff --git a/lustre/kernel_patches/patches/ext3-extents-2.6.12.patch b/lustre/kernel_patches/patches/ext3-extents-2.6.12.patch index 520c031..2c65544 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.6.12.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.6.12.patch @@ -2523,26 +2523,30 @@ Index: linux-2.6.12-rc6/fs/ext3/super.c Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -+ Opt_extents, Opt_extdebug, ++ Opt_extents, Opt_noextents, Opt_extdebug, }; static match_table_t tokens = { -@@ -644,6 +647,8 @@ +@@ -644,6 +647,9 @@ {Opt_iopen, "iopen"}, {Opt_noiopen, "noiopen"}, {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, ++ {Opt_noextents, "noextents"}, + {Opt_extdebug, "extdebug"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL}, {Opt_resize, "resize"}, -@@ -953,6 +958,12 @@ +@@ -953,6 +958,15 @@ case Opt_nobh: set_opt(sbi->s_mount_opt, NOBH); break; + case Opt_extents: + set_opt (sbi->s_mount_opt, EXTENTS); + break; ++ case Opt_noextents: ++ clear_opt (sbi->s_mount_opt, EXTENTS); ++ break; + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; diff --git a/lustre/kernel_patches/patches/ext3-extents-2.6.5.patch b/lustre/kernel_patches/patches/ext3-extents-2.6.5.patch index f829621..be0642f 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.6.5.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.6.5.patch @@ -2512,26 +2512,30 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c Opt_ignore, Opt_barrier, Opt_err, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -+ Opt_extents, Opt_extdebug, ++ Opt_extents, Opt_noextents, Opt_extdebug, }; static match_table_t tokens = { -@@ -582,6 +585,8 @@ +@@ -582,6 +585,9 @@ {Opt_iopen, "iopen"}, {Opt_noiopen, "noiopen"}, 
{Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, ++ {Opt_noextents, "noextents"}, + {Opt_extdebug, "extdebug"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL} }; -@@ -797,6 +802,12 @@ +@@ -797,6 +802,15 @@ break; case Opt_ignore: break; + case Opt_extents: + set_opt (sbi->s_mount_opt, EXTENTS); + break; ++ case Opt_noextents: ++ clear_opt (sbi->s_mount_opt, EXTENTS); ++ break; + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; diff --git a/lustre/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch b/lustre/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch index 993b237..def228e 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch @@ -2507,26 +2507,30 @@ Index: linux-stage/fs/ext3/super.c Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -+ Opt_extents, Opt_extdebug, ++ Opt_extents, Opt_noextents, Opt_extdebug, }; static match_table_t tokens = { -@@ -639,6 +644,8 @@ +@@ -639,6 +644,9 @@ {Opt_iopen, "iopen"}, {Opt_noiopen, "noiopen"}, {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, ++ {Opt_noextents, "noextents"}, + {Opt_extdebug, "extdebug"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL}, {Opt_resize, "resize"}, -@@ -943,6 +950,12 @@ +@@ -943,6 +950,15 @@ match_int(&args[0], &option); *n_blocks_count = option; break; + case Opt_extents: + set_opt (sbi->s_mount_opt, EXTENTS); + break; ++ case Opt_noextents: ++ clear_opt (sbi->s_mount_opt, EXTENTS); ++ break; + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch index 33dc268..e0ee12f 100644 --- a/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch @@ -31,8 +31,8 @@ Index: 
linux-2.6.5-7.252-full/include/linux/ext3_fs.h extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, - unsigned long); + unsigned long, int); -+extern void ext3_free_blocks_old (handle_t *, struct inode *, unsigned long, -+ unsigned long); ++extern void ext3_free_blocks_old(handle_t *, struct inode *, unsigned long, ++ unsigned long); extern unsigned long ext3_count_free_blocks (struct super_block *); extern void ext3_check_blocks_bitmap (struct super_block *); extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, @@ -74,13 +74,13 @@ Index: linux-2.6.5-7.252-full/include/linux/ext3_fs_sb.h /* * third extended-fs super-block data in memory -@@ -78,6 +84,38 @@ struct ext3_sb_info { +@@ -78,6 +84,43 @@ struct ext3_sb_info { struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ #endif + + /* for buddy allocator */ -+ struct ext3_group_info **s_group_info; ++ struct ext3_group_info ***s_group_info; + struct inode *s_buddy_cache; + long s_blocks_reserved; + spinlock_t s_reserve_lock; @@ -91,6 +91,7 @@ Index: linux-2.6.5-7.252-full/include/linux/ext3_fs_sb.h + tid_t s_last_transaction; + int s_mb_factor; + unsigned short *s_mb_offsets, *s_mb_maxs; ++ unsigned long s_stripe; + + /* history to debug policy */ + struct ext3_mb_history *s_mb_history; @@ -111,6 +112,10 @@ Index: linux-2.6.5-7.252-full/include/linux/ext3_fs_sb.h + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; }; ++ ++#define EXT3_GROUP_INFO(sb, group) \ ++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \ ++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] #endif /* _LINUX_EXT3_FS_SB */ Index: linux-2.6.5-7.252-full/fs/ext3/super.c @@ -125,29 +130,40 @@ Index: linux-2.6.5-7.252-full/fs/ext3/super.c ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); -@@ -545,7 +546,7 @@ enum { - 
Opt_ignore, Opt_barrier, +@@ -545,6 +546,7 @@ enum { Opt_err, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -- Opt_extents, Opt_extdebug, -+ Opt_extents, Opt_extdebug, Opt_mballoc, + Opt_extents, Opt_noextents, Opt_extdebug, ++ Opt_mballoc, Opt_nomballoc, Opt_stripe, }; static match_table_t tokens = { -@@ -591,6 +592,7 @@ static match_table_t tokens = { - {Opt_iopen_nopriv, "iopen_nopriv"}, +@@ -591,6 +592,9 @@ static match_table_t tokens = { {Opt_extents, "extents"}, + {Opt_noextents, "noextents"}, {Opt_extdebug, "extdebug"}, + {Opt_mballoc, "mballoc"}, ++ {Opt_nomballoc, "nomballoc"}, ++ {Opt_stripe, "stripe=%u"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL} }; -@@ -813,6 +815,9 @@ static int parse_options (char * options +@@ -813,6 +815,19 @@ static int parse_options (char * options case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; + case Opt_mballoc: -+ set_opt (sbi->s_mount_opt, MBALLOC); ++ set_opt(sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_nomballoc: ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_stripe: ++ if (match_int(&args[0], &option)) ++ return 0; ++ if (option < 0) ++ return 0; ++ sbi->s_stripe = option; + break; default: printk (KERN_ERR @@ -334,7 +350,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c =================================================================== --- linux-2.6.5-7.252-full.orig/fs/ext3/mballoc.c 2006-04-22 17:31:47.543334750 +0400 +++ linux-2.6.5-7.252-full/fs/ext3/mballoc.c 2006-04-26 23:42:45.000000000 +0400 -@@ -0,0 +1,2616 @@ +@@ -0,0 +1,2703 @@ +/* + * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -469,10 +485,10 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + + /* search goals */ + struct ext3_free_extent ac_g_ex; -+ ++ + /* the best found extent */ + struct ext3_free_extent ac_b_ex; -+ ++ + /* number of iterations done. 
we have to track to limit searching */ + unsigned long ac_ex_scanned; + __u16 ac_groups_scanned; @@ -639,7 +655,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + if (mb_check_counter++ % 300 != 0) + return; + } -+ ++ + while (order > 1) { + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); @@ -820,7 +836,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + sb = inode->i_sb; + blocksize = 1 << inode->i_blkbits; + blocks_per_page = PAGE_CACHE_SIZE / blocksize; -+ ++ + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; @@ -835,9 +851,9 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + memset(bh, 0, i); + } else + bh = &bhs; -+ ++ + first_group = page->index * blocks_per_page / 2; -+ ++ + /* read all groups the page covers into the cache */ + for (i = 0; i < groups_per_page; i++) { + struct ext3_group_desc * desc; @@ -892,11 +908,11 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + mb_debug("put buddy for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + memset(data, 0xff, blocksize); -+ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0; -+ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0, ++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; ++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, + sizeof(unsigned short)*(sb->s_blocksize_bits+2)); + ext3_mb_generate_buddy(sb, data, bitmap, -+ EXT3_SB(sb)->s_group_info[group]); ++ EXT3_GROUP_INFO(sb, group)); + } else { + /* this is block of bitmap */ + mb_debug("put bitmap for group %u in page %lu/%x\n", @@ -929,7 +945,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + + e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_info = sbi->s_group_info[group]; ++ e3b->bd_info = EXT3_GROUP_INFO(sb, group); + e3b->bd_sb = sb; + e3b->bd_group = group; + e3b->bd_buddy_page = NULL; @@ -1005,14 +1021,14 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c +ext3_lock_group(struct super_block 
*sb, int group) +{ + bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++ &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static inline void +ext3_unlock_group(struct super_block *sb, int group) +{ + bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++ &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) @@ -1367,7 +1383,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); -+ ++ + if (max > 0) { + ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); @@ -1384,6 +1400,8 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + struct ext3_buddy *e3b) +{ + int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_super_block *es = sbi->s_es; + struct ext3_free_extent ex; + + err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); @@ -1392,9 +1410,18 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, -+ ac->ac_g_ex.fe_len, &ex); -+ -+ if (max >= ac->ac_g_ex.fe_len) { ++ ac->ac_g_ex.fe_len, &ex); ++ ++ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ++ unsigned long start; ++ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + ++ ex.fe_start + le32_to_cpu(es->s_first_data_block)); ++ if (start % sbi->s_stripe == 0) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ } else if (max >= ac->ac_g_ex.fe_len) { + J_ASSERT(ex.fe_len > 0); + J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); + J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); @@ -1496,11 +1523,46 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + } +} + ++/* ++ * This is a special case for storages like raid5 ++ * we try to find stripe-aligned chunks for stripe-size requests ++ */ 
++static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ unsigned long i, max; ++ ++ J_ASSERT(sbi->s_stripe != 0); ++ ++ /* find first stripe-aligned block */ ++ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + le32_to_cpu(sbi->s_es->s_first_data_block); ++ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; ++ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block)) ++ % EXT3_BLOCKS_PER_GROUP(sb); ++ ++ while (i < sb->s_blocksize * 8) { ++ if (!mb_test_bit(i, bitmap)) { ++ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex); ++ if (max >= sbi->s_stripe) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ break; ++ } ++ } ++ i += sbi->s_stripe; ++ } ++} ++ +static int ext3_mb_good_group(struct ext3_allocation_context *ac, + int group, int cr) +{ -+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); -+ struct ext3_group_info *grp = sbi->s_group_info[group]; ++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group); + unsigned free, fragments, i, bits; + + J_ASSERT(cr >= 0 && cr < 4); @@ -1629,6 +1691,13 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + ac.ac_2order = 0; + ac.ac_criteria = 0; + ++ if (*len == 1 && sbi->s_stripe) { ++ /* looks like a metadata, let's use a dirty hack for raid5 ++ * move all metadata in first groups in hope to hit cached ++ * sectors and thus avoid read-modify cycles in raid5 */ ++ ac.ac_g_ex.fe_group = group = 0; ++ } ++ + /* probably, the request is for 2^8+ blocks (1/2/3/... 
MB) */ + i = ffs(*len); + if (i >= ext3_mb_order2_reqs) { @@ -1653,7 +1722,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + if (group == EXT3_SB(sb)->s_groups_count) + group = 0; + -+ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) { ++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { + /* we need full data about the group + * to make a good selection */ + err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); @@ -1681,6 +1750,8 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + ac.ac_groups_scanned++; + if (cr == 0) + ext3_mb_simple_scan_group(&ac, &e3b); ++ else if (cr == 1 && *len == sbi->s_stripe) ++ ext3_mb_scan_aligned(&ac, &e3b); + else + ext3_mb_complex_scan_group(&ac, &e3b); + @@ -1694,7 +1765,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + } + + if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && -+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { + /* + * We've been searching too long. Let's try to allocate + * the best chunk we've found so far @@ -1739,8 +1810,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + sbi->s_blocks_reserved, ac.ac_found); + printk("EXT3-fs: groups: "); + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ printk("%d: %d ", i, -+ sbi->s_group_info[i]->bb_free); ++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free); + printk("\n"); +#endif + goto out; @@ -1778,7 +1848,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + *errp = -EIO; + goto out_err; + } -+ ++ + err = ext3_journal_get_write_access(handle, gdp_bh); + if (err) + goto out_err; @@ -1847,7 +1917,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + * path only, here is single block always */ + ext3_mb_release_blocks(sb, 1); + } -+ ++ + if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { + atomic_inc(&sbi->s_bal_reqs); + atomic_add(*len, &sbi->s_bal_allocated); @@ -1972,7 +2042,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + s->max = sbi->s_mb_history_max; + s->start = 
sbi->s_mb_history_cur % s->max; + spin_unlock(&sbi->s_mb_history_lock); -+ ++ + rc = seq_open(file, &ext3_mb_seq_history_ops); + if (rc == 0) { + struct seq_file *m = (struct seq_file *)file->private_data; @@ -1996,10 +2066,10 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + +static struct file_operations ext3_mb_seq_history_fops = { + .owner = THIS_MODULE, -+ .open = ext3_mb_seq_history_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = ext3_mb_seq_history_release, ++ .open = ext3_mb_seq_history_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = ext3_mb_seq_history_release, +}; + +static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) @@ -2048,7 +2118,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + sizeof(struct ext3_group_info); + ext3_lock_group(sb, group); -+ memcpy(&sg, sbi->s_group_info[group], i); ++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); + ext3_unlock_group(sb, group); + + if (EXT3_MB_GRP_NEED_INIT(&sg.info)) @@ -2091,10 +2161,10 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + +static struct file_operations ext3_mb_seq_groups_fops = { + .owner = THIS_MODULE, -+ .open = ext3_mb_seq_groups_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, ++ .open = ext3_mb_seq_groups_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, +}; + +static void ext3_mb_history_release(struct super_block *sb) @@ -2181,21 +2251,40 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c +int ext3_mb_init_backend(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, len; -+ -+ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; -+ sbi->s_group_info = kmalloc(len, GFP_KERNEL); ++ int i, j, len, metalen; ++ int num_meta_group_infos = ++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ struct ext3_group_info 
**meta_group_info; ++ ++ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte ++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. ++ * So a two level scheme suffices for now. */ ++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * ++ num_meta_group_infos, GFP_KERNEL); + if (sbi->s_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n"); ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n"); + return -ENOMEM; + } -+ memset(sbi->s_group_info, 0, len); -+ + sbi->s_buddy_cache = new_inode(sb); + if (sbi->s_buddy_cache == NULL) { + printk(KERN_ERR "EXT3-fs: can't get new inode\n"); -+ kfree(sbi->s_group_info); -+ return -ENOMEM; ++ goto err_freesgi; ++ } ++ ++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) { ++ if ((i + 1) == num_meta_group_infos) ++ metalen = sizeof(*meta_group_info) * ++ (sbi->s_groups_count - ++ (i << EXT3_DESC_PER_BLOCK_BITS(sb))); ++ meta_group_info = kmalloc(metalen, GFP_KERNEL); ++ if (meta_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a " ++ "buddy group\n"); ++ goto err_freemeta; ++ } ++ sbi->s_group_info[i] = meta_group_info; + } + + /* @@ -2207,30 +2296,42 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext3_group_desc * desc; + -+ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL); -+ if (sbi->s_group_info[i] == NULL) { ++ meta_group_info = ++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)]; ++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1); ++ ++ meta_group_info[j] = kmalloc(len, GFP_KERNEL); ++ if (meta_group_info[j] == NULL) { + printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); -+ goto err_out; ++ i--; ++ goto err_freebuddy; + } + desc = ext3_get_group_desc(sb, i, NULL); + if (desc == NULL) { + printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i); -+ goto err_out; ++ goto err_freebuddy; + } -+ 
memset(sbi->s_group_info[i], 0, len); ++ memset(meta_group_info[j], 0, len); + set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, -+ &sbi->s_group_info[i]->bb_state); -+ sbi->s_group_info[i]->bb_free = ++ &meta_group_info[j]->bb_state); ++ meta_group_info[j]->bb_free = + le16_to_cpu(desc->bg_free_blocks_count); + } + + return 0; + -+err_out: ++err_freebuddy: ++ while (i >= 0) { ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ i--; ++ } ++ i = num_meta_group_infos; ++err_freemeta: + while (--i >= 0) + kfree(sbi->s_group_info[i]); + iput(sbi->s_buddy_cache); -+ ++err_freesgi: ++ kfree(sbi->s_group_info); + return -ENOMEM; +} + @@ -2272,7 +2373,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + max = max >> 1; + i++; + } while (i <= sb->s_blocksize_bits + 1); -+ ++ + + /* init file for buddy data */ + if ((i = ext3_mb_init_backend(sb))) { @@ -2309,8 +2410,8 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c +int ext3_mb_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i; -+ ++ int i, num_meta_group_infos; ++ + if (!test_opt(sb, MBALLOC)) + return 0; + @@ -2324,11 +2425,13 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + ext3_mb_free_committed_blocks(sb); + + if (sbi->s_group_info) { -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ if (sbi->s_group_info[i] == NULL) -+ continue; ++ for (i = 0; i < sbi->s_groups_count; i++) ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ num_meta_group_infos = (sbi->s_groups_count + ++ EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) + kfree(sbi->s_group_info[i]); -+ } + kfree(sbi->s_group_info); + } + if (sbi->s_mb_offsets) @@ -2622,7 +2725,7 @@ Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_mod(&sbi->s_freeblocks_counter, count); -+ ++ + ext3_mb_release_desc(&e3b); + + *freed = count; @@ -2706,11 +2809,11 @@ Index: 
linux-2.6.5-7.252-full/fs/ext3/mballoc.c + return; +} + -+#define EXT3_ROOT "ext3" -+#define EXT3_MB_STATS_NAME "mb_stats" ++#define EXT3_ROOT "ext3" ++#define EXT3_MB_STATS_NAME "mb_stats" +#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" +#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" -+#define EXT3_MB_ORDER2_REQ "mb_order2_req" ++#define EXT3_MB_ORDER2_REQ "mb_order2_req" + +static int ext3_mb_stats_read(char *page, char **start, off_t off, + int count, int *eof, void *data) diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch index 0297609..eade9a8 100644 --- a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch @@ -71,13 +71,13 @@ Index: linux-2.6.12.6-bull/include/linux/ext3_fs_sb.h /* * third extended-fs super-block data in memory -@@ -78,6 +84,38 @@ struct ext3_sb_info { +@@ -78,6 +84,43 @@ struct ext3_sb_info { char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ #endif + + /* for buddy allocator */ -+ struct ext3_group_info **s_group_info; ++ struct ext3_group_info ***s_group_info; + struct inode *s_buddy_cache; + long s_blocks_reserved; + spinlock_t s_reserve_lock; @@ -88,6 +88,7 @@ Index: linux-2.6.12.6-bull/include/linux/ext3_fs_sb.h + tid_t s_last_transaction; + int s_mb_factor; + unsigned short *s_mb_offsets, *s_mb_maxs; ++ unsigned long s_stripe; + + /* history to debug policy */ + struct ext3_mb_history *s_mb_history; @@ -108,6 +109,10 @@ Index: linux-2.6.12.6-bull/include/linux/ext3_fs_sb.h + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; }; ++ ++#define EXT3_GROUP_INFO(sb, group) \ ++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \ ++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] #endif /* _LINUX_EXT3_FS_SB */ Index: linux-2.6.12.6-bull/fs/ext3/super.c @@ -122,29 +127,40 @@ Index: 
linux-2.6.12.6-bull/fs/ext3/super.c ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); -@@ -597,7 +598,7 @@ enum { - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, +@@ -597,6 +598,7 @@ enum { Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -- Opt_extents, Opt_extdebug, -+ Opt_extents, Opt_extdebug, Opt_mballoc, + Opt_extents, Opt_noextents, Opt_extdebug, ++ Opt_mballoc, Opt_nomballoc, Opt_stripe, }; static match_table_t tokens = { -@@ -650,6 +651,7 @@ static match_table_t tokens = { - {Opt_iopen_nopriv, "iopen_nopriv"}, +@@ -650,6 +651,9 @@ static match_table_t tokens = { {Opt_extents, "extents"}, + {Opt_noextents, "noextents"}, {Opt_extdebug, "extdebug"}, + {Opt_mballoc, "mballoc"}, ++ {Opt_nomballoc, "nomballoc"}, ++ {Opt_stripe, "stripe=%u"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL}, {Opt_resize, "resize"}, -@@ -965,6 +967,9 @@ clear_qf_name: +@@ -965,6 +967,19 @@ clear_qf_name: case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; + case Opt_mballoc: -+ set_opt (sbi->s_mount_opt, MBALLOC); ++ set_opt(sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_nomballoc: ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_stripe: ++ if (match_int(&args[0], &option)) ++ return 0; ++ if (option < 0) ++ return 0; ++ sbi->s_stripe = option; + break; default: printk (KERN_ERR @@ -329,7 +345,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c =================================================================== --- linux-2.6.12.6-bull.orig/fs/ext3/mballoc.c 2006-04-22 17:31:47.543334750 +0400 +++ linux-2.6.12.6-bull/fs/ext3/mballoc.c 2006-04-30 01:24:11.000000000 +0400 -@@ -0,0 +1,2615 @@ +@@ -0,0 +1,2702 @@ +/* + * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -464,10 +480,10 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + + /* search goals */ + struct ext3_free_extent ac_g_ex; -+ ++ + /* the best found extent */ + struct ext3_free_extent 
ac_b_ex; -+ ++ + /* number of iterations done. we have to track to limit searching */ + unsigned long ac_ex_scanned; + __u16 ac_groups_scanned; @@ -634,7 +650,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + if (mb_check_counter++ % 300 != 0) + return; + } -+ ++ + while (order > 1) { + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); @@ -815,7 +831,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + sb = inode->i_sb; + blocksize = 1 << inode->i_blkbits; + blocks_per_page = PAGE_CACHE_SIZE / blocksize; -+ ++ + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; @@ -830,9 +846,9 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + memset(bh, 0, i); + } else + bh = &bhs; -+ ++ + first_group = page->index * blocks_per_page / 2; -+ ++ + /* read all groups the page covers into the cache */ + for (i = 0; i < groups_per_page; i++) { + struct ext3_group_desc * desc; @@ -887,11 +903,11 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + mb_debug("put buddy for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + memset(data, 0xff, blocksize); -+ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0; -+ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0, ++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; ++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, + sizeof(unsigned short)*(sb->s_blocksize_bits+2)); + ext3_mb_generate_buddy(sb, data, bitmap, -+ EXT3_SB(sb)->s_group_info[group]); ++ EXT3_GROUP_INFO(sb, group)); + } else { + /* this is block of bitmap */ + mb_debug("put bitmap for group %u in page %lu/%x\n", @@ -924,7 +940,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + + e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_info = sbi->s_group_info[group]; ++ e3b->bd_info = EXT3_GROUP_INFO(sb, group); + e3b->bd_sb = sb; + e3b->bd_group = group; + e3b->bd_buddy_page = NULL; @@ -1000,14 +1016,14 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c 
+ext3_lock_group(struct super_block *sb, int group) +{ + bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++ &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static inline void +ext3_unlock_group(struct super_block *sb, int group) +{ + bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++ &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) @@ -1362,7 +1378,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); -+ ++ + if (max > 0) { + ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); @@ -1379,6 +1395,8 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + struct ext3_buddy *e3b) +{ + int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_super_block *es = sbi->s_es; + struct ext3_free_extent ex; + + err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); @@ -1387,9 +1405,18 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, -+ ac->ac_g_ex.fe_len, &ex); -+ -+ if (max >= ac->ac_g_ex.fe_len) { ++ ac->ac_g_ex.fe_len, &ex); ++ ++ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ++ unsigned long start; ++ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + ++ ex.fe_start + le32_to_cpu(es->s_first_data_block)); ++ if (start % sbi->s_stripe == 0) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ } else if (max >= ac->ac_g_ex.fe_len) { + J_ASSERT(ex.fe_len > 0); + J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); + J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); @@ -1491,11 +1518,46 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + } +} + ++/* ++ * This is a special case for storages like raid5 ++ * we try to find stripe-aligned chunks for 
stripe-size requests ++ */ ++static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ unsigned long i, max; ++ ++ J_ASSERT(sbi->s_stripe != 0); ++ ++ /* find first stripe-aligned block */ ++ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + le32_to_cpu(sbi->s_es->s_first_data_block); ++ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; ++ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block)) ++ % EXT3_BLOCKS_PER_GROUP(sb); ++ ++ while (i < sb->s_blocksize * 8) { ++ if (!mb_test_bit(i, bitmap)) { ++ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex); ++ if (max >= sbi->s_stripe) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ break; ++ } ++ } ++ i += sbi->s_stripe; ++ } ++} ++ +static int ext3_mb_good_group(struct ext3_allocation_context *ac, + int group, int cr) +{ -+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); -+ struct ext3_group_info *grp = sbi->s_group_info[group]; ++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group); + unsigned free, fragments, i, bits; + + J_ASSERT(cr >= 0 && cr < 4); @@ -1624,6 +1686,13 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + ac.ac_2order = 0; + ac.ac_criteria = 0; + ++ if (*len == 1 && sbi->s_stripe) { ++ /* looks like a metadata, let's use a dirty hack for raid5 ++ * move all metadata in first groups in hope to hit cached ++ * sectors and thus avoid read-modify cycles in raid5 */ ++ ac.ac_g_ex.fe_group = group = 0; ++ } ++ + /* probably, the request is for 2^8+ blocks (1/2/3/... 
MB) */ + i = ffs(*len); + if (i >= ext3_mb_order2_reqs) { @@ -1648,7 +1717,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + if (group == EXT3_SB(sb)->s_groups_count) + group = 0; + -+ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) { ++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { + /* we need full data about the group + * to make a good selection */ + err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); @@ -1676,6 +1745,8 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + ac.ac_groups_scanned++; + if (cr == 0) + ext3_mb_simple_scan_group(&ac, &e3b); ++ else if (cr == 1 && *len == sbi->s_stripe) ++ ext3_mb_scan_aligned(&ac, &e3b); + else + ext3_mb_complex_scan_group(&ac, &e3b); + @@ -1689,7 +1760,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + } + + if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && -+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { + /* + * We've been searching too long. Let's try to allocate + * the best chunk we've found so far @@ -1734,8 +1805,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + sbi->s_blocks_reserved, ac.ac_found); + printk("EXT3-fs: groups: "); + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ printk("%d: %d ", i, -+ sbi->s_group_info[i]->bb_free); ++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free); + printk("\n"); +#endif + goto out; @@ -1773,7 +1843,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + *errp = -EIO; + goto out_err; + } -+ ++ + err = ext3_journal_get_write_access(handle, gdp_bh); + if (err) + goto out_err; @@ -1842,7 +1912,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + * path only, here is single block always */ + ext3_mb_release_blocks(sb, 1); + } -+ ++ + if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { + atomic_inc(&sbi->s_bal_reqs); + atomic_add(*len, &sbi->s_bal_allocated); @@ -1967,7 +2037,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + s->max = sbi->s_mb_history_max; + s->start = sbi->s_mb_history_cur % s->max; + 
spin_unlock(&sbi->s_mb_history_lock); -+ ++ + rc = seq_open(file, &ext3_mb_seq_history_ops); + if (rc == 0) { + struct seq_file *m = (struct seq_file *)file->private_data; @@ -1991,10 +2061,10 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + +static struct file_operations ext3_mb_seq_history_fops = { + .owner = THIS_MODULE, -+ .open = ext3_mb_seq_history_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = ext3_mb_seq_history_release, ++ .open = ext3_mb_seq_history_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = ext3_mb_seq_history_release, +}; + +static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) @@ -2043,7 +2113,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + sizeof(struct ext3_group_info); + ext3_lock_group(sb, group); -+ memcpy(&sg, sbi->s_group_info[group], i); ++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); + ext3_unlock_group(sb, group); + + if (EXT3_MB_GRP_NEED_INIT(&sg.info)) @@ -2086,10 +2156,10 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + +static struct file_operations ext3_mb_seq_groups_fops = { + .owner = THIS_MODULE, -+ .open = ext3_mb_seq_groups_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, ++ .open = ext3_mb_seq_groups_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, +}; + +static void ext3_mb_history_release(struct super_block *sb) @@ -2176,21 +2246,40 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c +int ext3_mb_init_backend(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, len; -+ -+ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; -+ sbi->s_group_info = kmalloc(len, GFP_KERNEL); ++ int i, j, len, metalen; ++ int num_meta_group_infos = ++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ struct ext3_group_info **meta_group_info; ++ ++ /* An 8TB filesystem with 64-bit 
pointers requires a 4096 byte ++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. ++ * So a two level scheme suffices for now. */ ++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * ++ num_meta_group_infos, GFP_KERNEL); + if (sbi->s_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n"); ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n"); + return -ENOMEM; + } -+ memset(sbi->s_group_info, 0, len); -+ + sbi->s_buddy_cache = new_inode(sb); + if (sbi->s_buddy_cache == NULL) { + printk(KERN_ERR "EXT3-fs: can't get new inode\n"); -+ kfree(sbi->s_group_info); -+ return -ENOMEM; ++ goto err_freesgi; ++ } ++ ++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) { ++ if ((i + 1) == num_meta_group_infos) ++ metalen = sizeof(*meta_group_info) * ++ (sbi->s_groups_count - ++ (i << EXT3_DESC_PER_BLOCK_BITS(sb))); ++ meta_group_info = kmalloc(metalen, GFP_KERNEL); ++ if (meta_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a " ++ "buddy group\n"); ++ goto err_freemeta; ++ } ++ sbi->s_group_info[i] = meta_group_info; + } + + /* @@ -2202,30 +2291,42 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext3_group_desc * desc; + -+ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL); -+ if (sbi->s_group_info[i] == NULL) { ++ meta_group_info = ++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)]; ++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1); ++ ++ meta_group_info[j] = kmalloc(len, GFP_KERNEL); ++ if (meta_group_info[j] == NULL) { + printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); -+ goto err_out; ++ i--; ++ goto err_freebuddy; + } + desc = ext3_get_group_desc(sb, i, NULL); + if (desc == NULL) { + printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i); -+ goto err_out; ++ goto err_freebuddy; + } -+ memset(sbi->s_group_info[i], 0, len); ++ memset(meta_group_info[j], 0, len); + 
set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, -+ &sbi->s_group_info[i]->bb_state); -+ sbi->s_group_info[i]->bb_free = ++ &meta_group_info[j]->bb_state); ++ meta_group_info[j]->bb_free = + le16_to_cpu(desc->bg_free_blocks_count); + } + + return 0; + -+err_out: ++err_freebuddy: ++ while (i >= 0) { ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ i--; ++ } ++ i = num_meta_group_infos; ++err_freemeta: + while (--i >= 0) + kfree(sbi->s_group_info[i]); + iput(sbi->s_buddy_cache); -+ ++err_freesgi: ++ kfree(sbi->s_group_info); + return -ENOMEM; +} + @@ -2267,7 +2368,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + max = max >> 1; + i++; + } while (i <= sb->s_blocksize_bits + 1); -+ ++ + + /* init file for buddy data */ + if ((i = ext3_mb_init_backend(sb))) { @@ -2304,8 +2405,8 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c +int ext3_mb_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i; -+ ++ int i, num_meta_group_infos; ++ + if (!test_opt(sb, MBALLOC)) + return 0; + @@ -2319,11 +2420,13 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + ext3_mb_free_committed_blocks(sb); + + if (sbi->s_group_info) { -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ if (sbi->s_group_info[i] == NULL) -+ continue; ++ for (i = 0; i < sbi->s_groups_count; i++) ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ num_meta_group_infos = (sbi->s_groups_count + ++ EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) + kfree(sbi->s_group_info[i]); -+ } + kfree(sbi->s_group_info); + } + if (sbi->s_mb_offsets) @@ -2617,7 +2720,7 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_mod(&sbi->s_freeblocks_counter, count); -+ ++ + ext3_mb_release_desc(&e3b); + + *freed = count; @@ -2700,11 +2803,11 @@ Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c + return; +} + -+#define EXT3_ROOT "ext3" -+#define EXT3_MB_STATS_NAME 
"mb_stats" ++#define EXT3_ROOT "ext3" ++#define EXT3_MB_STATS_NAME "mb_stats" +#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" +#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" -+#define EXT3_MB_ORDER2_REQ "mb_order2_req" ++#define EXT3_MB_ORDER2_REQ "mb_order2_req" + +static int ext3_mb_stats_read(char *page, char **start, off_t off, + int count, int *eof, void *data) diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch index ced267d..43fc776 100644 --- a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch @@ -1,7 +1,61 @@ -Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h +Index: linux-stage/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs_sb.h 2006-05-18 23:57:04.000000000 +0400 -+++ linux-2.6.9-full/include/linux/ext3_fs_sb.h 2006-05-22 21:45:08.000000000 +0400 +--- linux-stage.orig/include/linux/ext3_fs.h 2006-05-25 10:36:04.000000000 -0600 ++++ linux-stage/include/linux/ext3_fs.h 2006-05-25 10:36:04.000000000 -0600 +@@ -57,6 +57,14 @@ struct statfs; + #define ext3_debug(f, a...) 
do {} while (0) + #endif + ++#define EXT3_MULTIBLOCK_ALLOCATOR 1 ++ ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ + /* + * Special inodes numbers + */ +@@ -365,6 +373,7 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -726,7 +735,7 @@ extern int ext3_bg_has_super(struct supe + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); + extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); + extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, +- unsigned long); ++ unsigned long, int); + extern void ext3_free_blocks_sb (handle_t *, struct super_block *, + unsigned long, unsigned long, int *); + extern unsigned long ext3_count_free_blocks (struct super_block *); +@@ -857,6 +866,17 @@ extern void ext3_extents_initialize_bloc + extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); + ++/* mballoc.c */ ++extern long ext3_mb_stats; ++extern long ext3_mb_max_to_scan; ++extern int ext3_mb_init(struct super_block *, int); ++extern int ext3_mb_release(struct super_block *); ++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); ++extern int ext3_mb_reserve_blocks(struct super_block *, int); ++extern void ext3_mb_release_blocks(struct super_block *, int); ++int __init init_ext3_proc(void); ++void exit_ext3_proc(void); ++ + #endif /* __KERNEL__ */ + + /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). 
*/ +Index: linux-stage/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_sb.h 2006-05-25 10:36:04.000000000 -0600 ++++ linux-stage/include/linux/ext3_fs_sb.h 2006-05-25 10:59:14.000000000 -0600 @@ -23,9 +23,15 @@ #define EXT_INCLUDE #include @@ -18,13 +72,13 @@ Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h /* * third extended-fs super-block data in memory -@@ -81,6 +87,39 @@ struct ext3_sb_info { +@@ -81,6 +87,43 @@ struct ext3_sb_info { char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ #endif + + /* for buddy allocator */ -+ struct ext3_group_info **s_group_info; ++ struct ext3_group_info ***s_group_info; + struct inode *s_buddy_cache; + long s_blocks_reserved; + spinlock_t s_reserve_lock; @@ -56,67 +110,17 @@ Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; }; - - #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-2.6.9-full/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2006-05-18 23:57:04.000000000 +0400 -+++ linux-2.6.9-full/include/linux/ext3_fs.h 2006-05-22 21:44:37.000000000 +0400 -@@ -57,6 +57,14 @@ struct statfs; - #define ext3_debug(f, a...) 
do {} while (0) - #endif - -+#define EXT3_MULTIBLOCK_ALLOCATOR 1 -+ -+#define EXT3_MB_HINT_MERGE 1 -+#define EXT3_MB_HINT_RESERVED 2 -+#define EXT3_MB_HINT_METADATA 4 -+#define EXT3_MB_HINT_FIRST 8 -+#define EXT3_MB_HINT_BEST 16 + - /* - * Special inodes numbers - */ -@@ -365,6 +373,7 @@ struct ext3_inode { - #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ - #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ - #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ ++#define EXT3_GROUP_INFO(sb, group) \ ++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \ ++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef clear_opt -@@ -726,7 +735,7 @@ extern int ext3_bg_has_super(struct supe - extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); - extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); - extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, -- unsigned long); -+ unsigned long, int); - extern void ext3_free_blocks_sb (handle_t *, struct super_block *, - unsigned long, unsigned long, int *); - extern unsigned long ext3_count_free_blocks (struct super_block *); -@@ -857,6 +866,17 @@ extern void ext3_extents_initialize_bloc - extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg); - -+/* mballoc.c */ -+extern long ext3_mb_stats; -+extern long ext3_mb_max_to_scan; -+extern int ext3_mb_init(struct super_block *, int); -+extern int ext3_mb_release(struct super_block *); -+extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); -+extern int ext3_mb_reserve_blocks(struct super_block *, int); -+extern void ext3_mb_release_blocks(struct super_block *, int); -+int __init init_ext3_proc(void); -+void exit_ext3_proc(void); -+ - #endif 
/* __KERNEL__ */ - - /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ -Index: linux-2.6.9-full/fs/ext3/super.c + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-stage/fs/ext3/super.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/super.c 2006-05-18 23:57:04.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/super.c 2006-05-22 21:52:54.000000000 +0400 -@@ -394,6 +394,7 @@ void ext3_put_super (struct super_block +--- linux-stage.orig/fs/ext3/super.c 2006-05-25 10:36:04.000000000 -0600 ++++ linux-stage/fs/ext3/super.c 2006-05-25 10:36:04.000000000 -0600 +@@ -394,6 +394,7 @@ void ext3_put_super (struct super_block struct ext3_super_block *es = sbi->s_es; int i; @@ -124,30 +128,33 @@ Index: linux-2.6.9-full/fs/ext3/super.c ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); -@@ -596,7 +597,7 @@ enum { - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, +@@ -597,6 +598,7 @@ enum { Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -- Opt_extents, Opt_extdebug, -+ Opt_extents, Opt_extdebug, Opt_mballoc, Opt_stripe + Opt_extents, Opt_noextents, Opt_extdebug, ++ Opt_mballoc, Opt_nomballoc, Opt_stripe, }; static match_table_t tokens = { -@@ -648,6 +649,8 @@ static match_table_t tokens = { - {Opt_iopen_nopriv, "iopen_nopriv"}, +@@ -649,6 +651,9 @@ static match_table_t tokens = { {Opt_extents, "extents"}, + {Opt_noextents, "noextents"}, {Opt_extdebug, "extdebug"}, + {Opt_mballoc, "mballoc"}, ++ {Opt_nomballoc, "nomballoc"}, + {Opt_stripe, "stripe=%u"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL}, {Opt_resize, "resize"}, -@@ -958,6 +961,16 @@ clear_qf_name: +@@ -962,6 +967,19 @@ static int parse_options (char * options case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; + case Opt_mballoc: -+ set_opt (sbi->s_mount_opt, MBALLOC); ++ set_opt(sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_nomballoc: ++ clear_opt(sbi->s_mount_opt, MBALLOC); + 
break; + case Opt_stripe: + if (match_int(&args[0], &option)) @@ -159,7 +166,7 @@ Index: linux-2.6.9-full/fs/ext3/super.c default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1647,6 +1660,7 @@ static int ext3_fill_super (struct super +@@ -1651,6 +1669,7 @@ static int ext3_fill_super (struct super ext3_count_dirs(sb)); ext3_ext_init(sb); @@ -167,7 +174,7 @@ Index: linux-2.6.9-full/fs/ext3/super.c return 0; -@@ -2429,7 +2443,13 @@ static struct file_system_type ext3_fs_t +@@ -2433,7 +2452,13 @@ static struct file_system_type ext3_fs_t static int __init init_ext3_fs(void) { @@ -182,7 +189,7 @@ Index: linux-2.6.9-full/fs/ext3/super.c if (err) return err; err = init_inodecache(); -@@ -2451,6 +2471,7 @@ static void __exit exit_ext3_fs(void) +@@ -2455,6 +2480,7 @@ static void __exit exit_ext3_fs(void) unregister_filesystem(&ext3_fs_type); destroy_inodecache(); exit_ext3_xattr(); @@ -190,10 +197,10 @@ Index: linux-2.6.9-full/fs/ext3/super.c } int ext3_prep_san_write(struct inode *inode, long *blocks, -Index: linux-2.6.9-full/fs/ext3/extents.c +Index: linux-stage/fs/ext3/extents.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/extents.c 2006-05-18 23:57:04.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/extents.c 2006-05-22 21:44:37.000000000 +0400 +--- linux-stage.orig/fs/ext3/extents.c 2006-05-25 10:36:04.000000000 -0600 ++++ linux-stage/fs/ext3/extents.c 2006-05-25 10:36:04.000000000 -0600 @@ -777,7 +777,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) @@ -235,23 +242,97 @@ Index: linux-2.6.9-full/fs/ext3/extents.c } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", from, to, ex->ee_block, ex->ee_len); -Index: linux-2.6.9-full/fs/ext3/Makefile +Index: linux-stage/fs/ext3/inode.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/Makefile 2006-05-18 
23:57:04.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/Makefile 2006-05-22 21:44:37.000000000 +0400 -@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o +--- linux-stage.orig/fs/ext3/inode.c 2006-05-25 10:36:04.000000000 -0600 ++++ linux-stage/fs/ext3/inode.c 2006-05-25 10:36:04.000000000 -0600 +@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o \ -- extents.o -+ extents.o mballoc.o +@@ -673,7 +673,7 @@ err_out: + if (err == -EAGAIN) + for (i = 0; i < num; i++) + ext3_free_blocks(handle, inode, +- le32_to_cpu(where[i].key), 1); ++ le32_to_cpu(where[i].key), 1, 1); + return err; + } - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o -Index: linux-2.6.9-full/fs/ext3/xattr.c +@@ -1831,7 +1831,7 @@ ext3_clear_blocks(handle_t *handle, stru + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -2004,7 +2004,7 @@ static void ext3_free_branches(handle_t + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* +Index: linux-stage/fs/ext3/balloc.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/xattr.c 2006-05-18 23:57:04.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/xattr.c 2006-05-22 21:44:37.000000000 +0400 +--- linux-stage.orig/fs/ext3/balloc.c 2006-05-25 10:36:02.000000000 -0600 ++++ linux-stage/fs/ext3/balloc.c 2006-05-25 10:36:04.000000000 -0600 +@@ -79,7 +79,7 @@ struct ext3_group_desc * 
ext3_get_group_ + * + * Return buffer_head on success or NULL in case of failure. + */ +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) + { + struct ext3_group_desc * desc; +@@ -451,24 +451,6 @@ + return; + } + +-/* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, +- unsigned long block, unsigned long count) +-{ +- struct super_block * sb; +- int dquot_freed_blocks; +- +- sb = inode->i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This +@@ -1131,7 +1113,7 @@ + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. 
+ */ +-int ext3_new_block(handle_t *handle, struct inode *inode, ++int ext3_new_block_old(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) + { + struct buffer_head *bitmap_bh = NULL; +Index: linux-stage/fs/ext3/xattr.c +=================================================================== +--- linux-stage.orig/fs/ext3/xattr.c 2006-05-25 10:36:04.000000000 -0600 ++++ linux-stage/fs/ext3/xattr.c 2006-05-25 10:36:04.000000000 -0600 @@ -1281,7 +1281,7 @@ ext3_xattr_set_handle2(handle_t *handle, new_bh = sb_getblk(sb, block); if (!new_bh) { @@ -279,11 +360,11 @@ Index: linux-2.6.9-full/fs/ext3/xattr.c get_bh(bh); ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); } else { -Index: linux-2.6.9-full/fs/ext3/mballoc.c +Index: linux-stage/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/mballoc.c 2006-05-12 23:14:51.200000000 +0400 -+++ linux-2.6.9-full/fs/ext3/mballoc.c 2006-05-22 21:51:30.000000000 +0400 -@@ -0,0 +1,2671 @@ +--- linux-stage.orig/fs/ext3/mballoc.c 2006-05-23 17:33:37.579436680 -0600 ++++ linux-stage/fs/ext3/mballoc.c 2006-05-25 10:59:14.000000000 -0600 +@@ -0,0 +1,2702 @@ +/* + * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -418,10 +499,10 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + + /* search goals */ + struct ext3_free_extent ac_g_ex; -+ ++ + /* the best found extent */ + struct ext3_free_extent ac_b_ex; -+ ++ + /* number of iterations done. 
we have to track to limit searching */ + unsigned long ac_ex_scanned; + __u16 ac_groups_scanned; @@ -588,7 +669,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + if (mb_check_counter++ % 300 != 0) + return; + } -+ ++ + while (order > 1) { + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); @@ -769,7 +850,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + sb = inode->i_sb; + blocksize = 1 << inode->i_blkbits; + blocks_per_page = PAGE_CACHE_SIZE / blocksize; -+ ++ + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; @@ -784,9 +865,9 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + memset(bh, 0, i); + } else + bh = &bhs; -+ ++ + first_group = page->index * blocks_per_page / 2; -+ ++ + /* read all groups the page covers into the cache */ + for (i = 0; i < groups_per_page; i++) { + struct ext3_group_desc * desc; @@ -841,11 +922,11 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + mb_debug("put buddy for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + memset(data, 0xff, blocksize); -+ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0; -+ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0, ++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; ++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, + sizeof(unsigned short)*(sb->s_blocksize_bits+2)); + ext3_mb_generate_buddy(sb, data, bitmap, -+ EXT3_SB(sb)->s_group_info[group]); ++ EXT3_GROUP_INFO(sb, group)); + } else { + /* this is block of bitmap */ + mb_debug("put bitmap for group %u in page %lu/%x\n", @@ -878,7 +959,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + + e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_info = sbi->s_group_info[group]; ++ e3b->bd_info = EXT3_GROUP_INFO(sb, group); + e3b->bd_sb = sb; + e3b->bd_group = group; + e3b->bd_buddy_page = NULL; @@ -954,14 +1035,14 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +ext3_lock_group(struct super_block *sb, int group) +{ + 
bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++ &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static inline void +ext3_unlock_group(struct super_block *sb, int group) +{ + bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++ &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) @@ -1316,7 +1397,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); -+ ++ + if (max > 0) { + ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); @@ -1343,12 +1424,12 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, -+ ac->ac_g_ex.fe_len, &ex); ++ ac->ac_g_ex.fe_len, &ex); + + if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { + unsigned long start; + start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + -+ ex.fe_start + le32_to_cpu(es->s_first_data_block)); ++ ex.fe_start + le32_to_cpu(es->s_first_data_block)); + if (start % sbi->s_stripe == 0) { + ac->ac_found++; + ac->ac_b_ex = ex; @@ -1461,7 +1542,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + * we try to find stripe-aligned chunks for stripe-size requests + */ +static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) ++ struct ext3_buddy *e3b) +{ + struct super_block *sb = ac->ac_sb; + struct ext3_sb_info *sbi = EXT3_SB(sb); @@ -1495,8 +1576,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +static int ext3_mb_good_group(struct ext3_allocation_context *ac, + int group, int cr) +{ -+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); -+ struct ext3_group_info *grp = sbi->s_group_info[group]; ++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group); + unsigned free, fragments, i, bits; + + J_ASSERT(cr >= 0 && cr < 4); @@ -1627,7 +1707,7 @@ 
Index: linux-2.6.9-full/fs/ext3/mballoc.c + + if (*len == 1 && sbi->s_stripe) { + /* looks like a metadata, let's use a dirty hack for raid5 -+ * move all metadata in first groups in hope to hit cached ++ * move all metadata in first groups in hope to hit cached + * sectors and thus avoid read-modify cycles in raid5 */ + ac.ac_g_ex.fe_group = group = 0; + } @@ -1656,7 +1736,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + if (group == EXT3_SB(sb)->s_groups_count) + group = 0; + -+ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) { ++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { + /* we need full data about the group + * to make a good selection */ + err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); @@ -1699,7 +1779,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + } + + if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && -+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { + /* + * We've been searching too long. Let's try to allocate + * the best chunk we've found so far @@ -1744,8 +1824,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + sbi->s_blocks_reserved, ac.ac_found); + printk("EXT3-fs: groups: "); + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ printk("%d: %d ", i, -+ sbi->s_group_info[i]->bb_free); ++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free); + printk("\n"); +#endif + goto out; @@ -1783,7 +1862,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + *errp = -EIO; + goto out_err; + } -+ ++ + err = ext3_journal_get_write_access(handle, gdp_bh); + if (err) + goto out_err; @@ -1852,7 +1931,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + * path only, here is single block always */ + ext3_mb_release_blocks(sb, 1); + } -+ ++ + if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { + atomic_inc(&sbi->s_bal_reqs); + atomic_add(*len, &sbi->s_bal_allocated); @@ -1977,7 +2056,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + s->max = sbi->s_mb_history_max; + s->start = sbi->s_mb_history_cur % s->max; 
+ spin_unlock(&sbi->s_mb_history_lock); -+ ++ + rc = seq_open(file, &ext3_mb_seq_history_ops); + if (rc == 0) { + struct seq_file *m = (struct seq_file *)file->private_data; @@ -2001,10 +2080,10 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + +static struct file_operations ext3_mb_seq_history_fops = { + .owner = THIS_MODULE, -+ .open = ext3_mb_seq_history_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = ext3_mb_seq_history_release, ++ .open = ext3_mb_seq_history_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = ext3_mb_seq_history_release, +}; + +static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) @@ -2053,7 +2132,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + sizeof(struct ext3_group_info); + ext3_lock_group(sb, group); -+ memcpy(&sg, sbi->s_group_info[group], i); ++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); + ext3_unlock_group(sb, group); + + if (EXT3_MB_GRP_NEED_INIT(&sg.info)) @@ -2096,10 +2175,10 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + +static struct file_operations ext3_mb_seq_groups_fops = { + .owner = THIS_MODULE, -+ .open = ext3_mb_seq_groups_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, ++ .open = ext3_mb_seq_groups_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, +}; + +static void ext3_mb_history_release(struct super_block *sb) @@ -2186,21 +2265,40 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +int ext3_mb_init_backend(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, len; -+ -+ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; -+ sbi->s_group_info = kmalloc(len, GFP_KERNEL); ++ int i, j, len, metalen; ++ int num_meta_group_infos = ++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ struct ext3_group_info **meta_group_info; ++ ++ /* An 8TB filesystem with 64-bit pointers 
requires a 4096 byte ++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. ++ * So a two level scheme suffices for now. */ ++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * ++ num_meta_group_infos, GFP_KERNEL); + if (sbi->s_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n"); ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n"); + return -ENOMEM; + } -+ memset(sbi->s_group_info, 0, len); -+ + sbi->s_buddy_cache = new_inode(sb); + if (sbi->s_buddy_cache == NULL) { + printk(KERN_ERR "EXT3-fs: can't get new inode\n"); -+ kfree(sbi->s_group_info); -+ return -ENOMEM; ++ goto err_freesgi; ++ } ++ ++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) { ++ if ((i + 1) == num_meta_group_infos) ++ metalen = sizeof(*meta_group_info) * ++ (sbi->s_groups_count - ++ (i << EXT3_DESC_PER_BLOCK_BITS(sb))); ++ meta_group_info = kmalloc(metalen, GFP_KERNEL); ++ if (meta_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a " ++ "buddy group\n"); ++ goto err_freemeta; ++ } ++ sbi->s_group_info[i] = meta_group_info; + } + + /* @@ -2212,30 +2310,42 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext3_group_desc * desc; + -+ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL); -+ if (sbi->s_group_info[i] == NULL) { ++ meta_group_info = ++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)]; ++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1); ++ ++ meta_group_info[j] = kmalloc(len, GFP_KERNEL); ++ if (meta_group_info[j] == NULL) { + printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); -+ goto err_out; ++ i--; ++ goto err_freebuddy; + } + desc = ext3_get_group_desc(sb, i, NULL); + if (desc == NULL) { + printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i); -+ goto err_out; ++ goto err_freebuddy; + } -+ memset(sbi->s_group_info[i], 0, len); ++ memset(meta_group_info[j], 0, len); + 
set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, -+ &sbi->s_group_info[i]->bb_state); -+ sbi->s_group_info[i]->bb_free = ++ &meta_group_info[j]->bb_state); ++ meta_group_info[j]->bb_free = + le16_to_cpu(desc->bg_free_blocks_count); + } + + return 0; + -+err_out: ++err_freebuddy: ++ while (i >= 0) { ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ i--; ++ } ++ i = num_meta_group_infos; ++err_freemeta: + while (--i >= 0) + kfree(sbi->s_group_info[i]); + iput(sbi->s_buddy_cache); -+ ++err_freesgi: ++ kfree(sbi->s_group_info); + return -ENOMEM; +} + @@ -2277,7 +2387,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + max = max >> 1; + i++; + } while (i <= sb->s_blocksize_bits + 1); -+ ++ + + /* init file for buddy data */ + if ((i = ext3_mb_init_backend(sb))) { @@ -2314,8 +2424,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +int ext3_mb_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i; -+ ++ int i, num_meta_group_infos; ++ + if (!test_opt(sb, MBALLOC)) + return 0; + @@ -2329,11 +2439,13 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + ext3_mb_free_committed_blocks(sb); + + if (sbi->s_group_info) { -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ if (sbi->s_group_info[i] == NULL) -+ continue; ++ for (i = 0; i < sbi->s_groups_count; i++) ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ num_meta_group_infos = (sbi->s_groups_count + ++ EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) + kfree(sbi->s_group_info[i]); -+ } + kfree(sbi->s_group_info); + } + if (sbi->s_mb_offsets) @@ -2627,7 +2739,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_mod(&sbi->s_freeblocks_counter, count); -+ ++ + ext3_mb_release_desc(&e3b); + + *freed = count; @@ -2710,11 +2822,11 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + return; +} + -+#define EXT3_ROOT "ext3" -+#define EXT3_MB_STATS_NAME "mb_stats" ++#define 
EXT3_ROOT "ext3" ++#define EXT3_MB_STATS_NAME "mb_stats" +#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" +#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" -+#define EXT3_MB_ORDER2_REQ "mb_order2_req" ++#define EXT3_MB_ORDER2_REQ "mb_order2_req" + +static int ext3_mb_stats_read(char *page, char **start, off_t off, + int count, int *eof, void *data) @@ -2955,90 +3067,16 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); +} -Index: linux-2.6.9-full/fs/ext3/balloc.c -=================================================================== ---- linux-2.6.9-full.orig/fs/ext3/balloc.c 2006-03-10 18:20:03.000000000 +0300 -+++ linux-2.6.9-full/fs/ext3/balloc.c 2006-05-22 21:44:37.000000000 +0400 -@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ - * - * Return buffer_head on success or NULL in case of failure. - */ --static struct buffer_head * -+struct buffer_head * - read_block_bitmap(struct super_block *sb, unsigned int block_group) - { - struct ext3_group_desc * desc; -@@ -451,24 +451,6 @@ error_return: - return; - } - --/* Free given blocks, update quota and i_blocks field */ --void ext3_free_blocks(handle_t *handle, struct inode *inode, -- unsigned long block, unsigned long count) --{ -- struct super_block * sb; -- int dquot_freed_blocks; -- -- sb = inode->i_sb; -- if (!sb) { -- printk ("ext3_free_blocks: nonexistent device"); -- return; -- } -- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); -- if (dquot_freed_blocks) -- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); -- return; --} -- - /* - * For ext3 allocations, we must not reuse any blocks which are - * allocated in the bitmap buffer's "last committed data" copy. This -@@ -1131,7 +1113,7 @@ int ext3_should_retry_alloc(struct super - * bitmap, and then for any free bit if that fails. - * This function also updates quota and i_blocks field. 
- */ --int ext3_new_block(handle_t *handle, struct inode *inode, -+int ext3_new_block_old(handle_t *handle, struct inode *inode, - unsigned long goal, int *errp) - { - struct buffer_head *bitmap_bh = NULL; -Index: linux-2.6.9-full/fs/ext3/inode.c +Index: linux-stage/fs/ext3/Makefile =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/inode.c 2006-05-18 23:57:04.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/inode.c 2006-05-22 21:44:37.000000000 +0400 -@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h - ext3_journal_forget(handle, branch[i].bh); - } - for (i = 0; i < keys; i++) -- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); -+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); - return err; - } - -@@ -673,7 +673,7 @@ err_out: - if (err == -EAGAIN) - for (i = 0; i < num; i++) - ext3_free_blocks(handle, inode, -- le32_to_cpu(where[i].key), 1); -+ le32_to_cpu(where[i].key), 1, 1); - return err; - } - -@@ -1831,7 +1831,7 @@ ext3_clear_blocks(handle_t *handle, stru - } - } +--- linux-stage.orig/fs/ext3/Makefile 2006-05-25 10:36:04.000000000 -0600 ++++ linux-stage/fs/ext3/Makefile 2006-05-25 10:36:04.000000000 -0600 +@@ -6,7 +6,7 @@ -- ext3_free_blocks(handle, inode, block_to_free, count); -+ ext3_free_blocks(handle, inode, block_to_free, count, 1); - } - - /** -@@ -2004,7 +2004,7 @@ static void ext3_free_branches(handle_t - ext3_journal_test_restart(handle, inode); - } - -- ext3_free_blocks(handle, inode, nr, 1); -+ ext3_free_blocks(handle, inode, nr, 1, 1); + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o \ +- extents.o ++ extents.o mballoc.o - if (parent_bh) { - /* + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff --git a/lustre/kernel_patches/patches/ext3-sector_t-overflow-2.4.patch 
b/lustre/kernel_patches/patches/ext3-sector_t-overflow-2.4.patch new file mode 100644 index 0000000..950ec9a --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-sector_t-overflow-2.4.patch @@ -0,0 +1,41 @@ +Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem +From: Mingming Cao + + +If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e. +CONFIG_LBD not defined in the kernel), the calculation of the disk sector +will overflow. Add check at ext3_fill_super() and ext3_group_extend() to +prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4 +bytes. + +Verified this patch on a 32 bit platform without CONFIG_LBD defined +(sector_t is 32 bits long), mount refuse to mount a 10TB ext3. + +Signed-off-by: Mingming Cao +Acked-by: Andreas Dilger +Signed-off-by: Andrew Morton +--- + + fs/ext3/resize.c | 10 ++++++++++ + fs/ext3/super.c | 10 ++++++++++ + 2 files changed, 20 insertions(+) + +diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c +--- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700 ++++ devel-akpm/fs/ext3/super.c 2006-05-22 14:11:10.000000000 -0700 +@@ -1565,6 +1565,14 @@ static int ext3_fill_super (struct super + goto failed_mount; + } + ++ if (le32_to_cpu(es->s_blocks_count) > ++ (unsigned long)(~0ULL) >> (sb->s_blocksize_bits - 9)) { ++ printk(KERN_ERR "EXT3-fs: filesystem on %s: " ++ "too large to mount safely - %u blocks\n", ++ bdevname(sb->s_dev), le32_to_cpu(es->s_blocks_count)); ++ goto failed_mount; ++ } ++ + sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) + + EXT3_BLOCKS_PER_GROUP(sb) - 1) / +_ diff --git a/lustre/kernel_patches/patches/ext3-sector_t-overflow-2.6.12.patch b/lustre/kernel_patches/patches/ext3-sector_t-overflow-2.6.12.patch new file mode 100644 index 0000000..ef0f4a4 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-sector_t-overflow-2.6.12.patch 
@@ -0,0 +1,64 @@ +Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem +From: Mingming Cao + + +If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e. +CONFIG_LBD not defined in the kernel), the calculation of the disk sector +will overflow. Add check at ext3_fill_super() and ext3_group_extend() to +prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4 +bytes. + +Verified this patch on a 32 bit platform without CONFIG_LBD defined +(sector_t is 32 bits long), mount refuse to mount a 10TB ext3. + +Signed-off-by: Mingming Cao +Acked-by: Andreas Dilger +Signed-off-by: Andrew Morton +--- + + fs/ext3/resize.c | 10 ++++++++++ + fs/ext3/super.c | 10 ++++++++++ + 2 files changed, 20 insertions(+) + +diff -puN fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/resize.c +--- devel/fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700 ++++ devel-akpm/fs/ext3/resize.c 2006-05-22 14:10:56.000000000 -0700 +@@ -926,6 +926,16 @@ int ext3_group_extend(struct super_block + if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) + return 0; + ++ if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { ++ printk(KERN_ERR "EXT3-fs: filesystem on %s: " ++ "too large to resize to %lu blocks safely\n", ++ sb->s_id, n_blocks_count); ++ if (sizeof(sector_t) < 8) ++ ext3_warning(sb, __FUNCTION__, ++ "CONFIG_LBD not enabled\n"); ++ return -EINVAL; ++ } ++ + if (n_blocks_count < o_blocks_count) { + ext3_warning(sb, __FUNCTION__, + "can't shrink FS - resize aborted"); +diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c +--- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700 ++++ devel-akpm/fs/ext3/super.c 2006-05-22 14:11:10.000000000 -0700 +@@ -1565,6 +1565,17 @@ static int ext3_fill_super (struct super + goto failed_mount; + } + ++ if 
(le32_to_cpu(es->s_blocks_count) > ++ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { ++ printk(KERN_ERR "EXT3-fs: filesystem on %s: " ++ "too large to mount safely - %u blocks\n", sb->s_id, ++ le32_to_cpu(es->s_blocks_count)); ++ if (sizeof(sector_t) < 8) ++ printk(KERN_WARNING ++ "EXT3-fs: CONFIG_LBD not enabled\n"); ++ goto failed_mount; ++ } ++ + if (EXT3_BLOCKS_PER_GROUP(sb) == 0) + goto cantfind_ext3; + sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - +_ diff --git a/lustre/kernel_patches/patches/ext3-sector_t-overflow-2.6.5-suse.patch b/lustre/kernel_patches/patches/ext3-sector_t-overflow-2.6.5-suse.patch new file mode 100644 index 0000000..fe655da --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-sector_t-overflow-2.6.5-suse.patch @@ -0,0 +1,44 @@ +Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem +From: Mingming Cao + + +If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e. +CONFIG_LBD not defined in the kernel), the calculation of the disk sector +will overflow. Add check at ext3_fill_super() and ext3_group_extend() to +prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4 +bytes. + +Verified this patch on a 32 bit platform without CONFIG_LBD defined +(sector_t is 32 bits long), mount refuse to mount a 10TB ext3. 
+ +Signed-off-by: Mingming Cao +Acked-by: Andreas Dilger +Signed-off-by: Andrew Morton +--- + + fs/ext3/resize.c | 10 ++++++++++ + fs/ext3/super.c | 10 ++++++++++ + 2 files changed, 20 insertions(+) + +diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c +--- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700 ++++ devel-akpm/fs/ext3/super.c 2006-05-22 14:11:10.000000000 -0700 +@@ -1565,6 +1565,17 @@ static int ext3_fill_super (struct super + goto failed_mount; + } + ++ if (le32_to_cpu(es->s_blocks_count) > ++ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { ++ printk(KERN_ERR "EXT3-fs: filesystem on %s: " ++ "too large to mount safely - %u blocks\n", sb->s_id, ++ le32_to_cpu(es->s_blocks_count)); ++ if (sizeof(sector_t) < 8) ++ printk(KERN_WARNING ++ "EXT3-fs: CONFIG_LBD not enabled\n"); ++ goto failed_mount; ++ } ++ + sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) + + EXT3_BLOCKS_PER_GROUP(sb) - 1) / +_ diff --git a/lustre/kernel_patches/patches/ext3-sector_t-overflow-2.6.9-rhel4.patch b/lustre/kernel_patches/patches/ext3-sector_t-overflow-2.6.9-rhel4.patch new file mode 100644 index 0000000..9bfdf80 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-sector_t-overflow-2.6.9-rhel4.patch @@ -0,0 +1,64 @@ +Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem +From: Mingming Cao + + +If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e. +CONFIG_LBD not defined in the kernel), the calculation of the disk sector +will overflow. Add check at ext3_fill_super() and ext3_group_extend() to +prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4 +bytes. + +Verified this patch on a 32 bit platform without CONFIG_LBD defined +(sector_t is 32 bits long), mount refuse to mount a 10TB ext3. 
+ +Signed-off-by: Mingming Cao +Acked-by: Andreas Dilger +Signed-off-by: Andrew Morton +--- + + fs/ext3/resize.c | 10 ++++++++++ + fs/ext3/super.c | 10 ++++++++++ + 2 files changed, 20 insertions(+) + +diff -puN fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/resize.c +--- devel/fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700 ++++ devel-akpm/fs/ext3/resize.c 2006-05-22 14:10:56.000000000 -0700 +@@ -926,6 +926,16 @@ int ext3_group_extend(struct super_block + if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) + return 0; + ++ if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { ++ printk(KERN_ERR "EXT3-fs: filesystem on %s: " ++ "too large to resize to %lu blocks safely\n", ++ sb->s_id, n_blocks_count); ++ if (sizeof(sector_t) < 8) ++ ext3_warning(sb, __FUNCTION__, ++ "CONFIG_LBD not enabled\n"); ++ return -EINVAL; ++ } ++ + if (n_blocks_count < o_blocks_count) { + ext3_warning(sb, __FUNCTION__, + "can't shrink FS - resize aborted"); +diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c +--- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700 ++++ devel-akpm/fs/ext3/super.c 2006-05-22 14:11:10.000000000 -0700 +@@ -1565,6 +1565,17 @@ static int ext3_fill_super (struct super + goto failed_mount; + } + ++ if (le32_to_cpu(es->s_blocks_count) > ++ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { ++ printk(KERN_ERR "EXT3-fs: filesystem on %s: " ++ "too large to mount safely - %u blocks\n", sb->s_id, ++ le32_to_cpu(es->s_blocks_count)); ++ if (sizeof(sector_t) < 8) ++ printk(KERN_WARNING ++ "EXT3-fs: CONFIG_LBD not enabled\n"); ++ goto failed_mount; ++ } ++ + sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) + + EXT3_BLOCKS_PER_GROUP(sb) - 1) / +_ diff --git a/lustre/kernel_patches/series/hp-pnnl-2.4.20 
b/lustre/kernel_patches/series/hp-pnnl-2.4.20 index d846b4d..2545ff9 100644 --- a/lustre/kernel_patches/series/hp-pnnl-2.4.20 +++ b/lustre/kernel_patches/series/hp-pnnl-2.4.20 @@ -47,3 +47,4 @@ ext3-extents-2.4.24.patch ext3-extents-asyncdel-2.4.24.patch ext3-nlinks-2.4.20-hp_pnnl.patch export-zap-page-range.patch +ext3-sector_t-overflow-2.4.patch diff --git a/lustre/kernel_patches/series/ldiskfs-2.6-rhel4.series b/lustre/kernel_patches/series/ldiskfs-2.6-rhel4.series index 3661023..ea1389d 100644 --- a/lustre/kernel_patches/series/ldiskfs-2.6-rhel4.series +++ b/lustre/kernel_patches/series/ldiskfs-2.6-rhel4.series @@ -11,3 +11,4 @@ ext3-mballoc2-2.6.9-rhel4.patch ext3-nlinks-2.6.9.patch ext3-ialloc-2.6.patch ext3-lookup-dotdot-2.6.9.patch +ext3-sector_t-overflow-2.6.9-rhel4.patch diff --git a/lustre/kernel_patches/series/ldiskfs-2.6-suse.series b/lustre/kernel_patches/series/ldiskfs-2.6-suse.series index efa7700..8fbb715 100644 --- a/lustre/kernel_patches/series/ldiskfs-2.6-suse.series +++ b/lustre/kernel_patches/series/ldiskfs-2.6-suse.series @@ -13,3 +13,4 @@ ext3-rename-reserve-2.6-suse.patch ext3-htree-dot-2.6.5-suse.patch ext3-ialloc-2.6.patch ext3-lookup-dotdot-2.6.9.patch +ext3-sector_t-overflow-2.6.5-suse.patch diff --git a/lustre/kernel_patches/series/ldiskfs-2.6.12-vanilla.series b/lustre/kernel_patches/series/ldiskfs-2.6.12-vanilla.series index b44e35e..53c060b 100644 --- a/lustre/kernel_patches/series/ldiskfs-2.6.12-vanilla.series +++ b/lustre/kernel_patches/series/ldiskfs-2.6.12-vanilla.series @@ -12,3 +12,4 @@ ext3-remove-cond_resched-calls-2.6.12.patch ext3-htree-dot-2.6.patch ext3-external-journal-2.6.12.patch ext3-lookup-dotdot-2.6.9.patch +ext3-sector_t-overflow-2.6.12.patch diff --git a/lustre/kernel_patches/series/rhel-2.4.21 b/lustre/kernel_patches/series/rhel-2.4.21 index dcaff40..5e6223e 100644 --- a/lustre/kernel_patches/series/rhel-2.4.21 +++ b/lustre/kernel_patches/series/rhel-2.4.21 @@ -52,3 +52,4 @@ fsprivate-2.4.patch 
nfsd_iallocsem.patch linux-2.4.24-jbd-handle-EIO-rhel3.patch ext3-lookup-dotdot-2.4.20.patch +ext3-sector_t-overflow-2.4.patch diff --git a/lustre/kernel_patches/series/suse-2.4.21-cray b/lustre/kernel_patches/series/suse-2.4.21-cray index e0b9c23..794642b 100644 --- a/lustre/kernel_patches/series/suse-2.4.21-cray +++ b/lustre/kernel_patches/series/suse-2.4.21-cray @@ -41,3 +41,4 @@ fsprivate-2.4-suse.patch nfsd_iallocsem.patch linux-2.4.24-jbd-handle-EIO.patch ext3-ialloc-2.4.21-suse2.patch +ext3-sector_t-overflow-2.4.patch diff --git a/lustre/kernel_patches/series/suse-2.4.21-jvn b/lustre/kernel_patches/series/suse-2.4.21-jvn deleted file mode 100644 index 74e9445..0000000 --- a/lustre/kernel_patches/series/suse-2.4.21-jvn +++ /dev/null @@ -1,31 +0,0 @@ -configurable-x86-stack-2.4.21-suse-171.patch -configurable-x86_64-2.4.21.patch -dev_read_only_2.4.20-rh.patch -exports_2.4.20-rh-hp.patch -lustre_version.patch -vfs_intent-2.4.21-suse-171.patch -invalidate_show.patch -export-truncate.patch -iod-stock-24-exports_hp.patch -ext3-htree-2.4.21-chaos.patch -linux-2.4.21-xattr-0.8.54-suse-171.patch -ext3-orphan_lock-2.4.22-rh.patch -ext3-noread-2.4.21-suse2.patch -ext3-delete_thread-2.4.21-suse-171.patch -extN-wantedi-2.4.21-suse2.patch -ext3-san-2.4.20.patch -ext3-map_inode_page-2.4.21-suse2.patch -ext3-error-export.patch -iopen-2.4.21-chaos.patch -jbd-dont-account-blocks-twice.patch -jbd-commit-tricks.patch -ext3-no-write-super-chaos.patch -add_page_private.patch -nfs_export_kernel-2.4.21-suse2.patch -ext3-raw-lookup.patch -ext3-ea-in-inode-2.4.21-sles.patch -listman-2.4.20.patch -ext3-truncate-buffer-head.patch -lookup-stack-symbols-2.4.21-suse-171.patch -fsprivate-2.4-suse.patch -nfsd_iallocsem.patch diff --git a/lustre/kernel_patches/series/vanilla-2.4.24 b/lustre/kernel_patches/series/vanilla-2.4.24 index a8ee3e0..5011ae1 100644 --- a/lustre/kernel_patches/series/vanilla-2.4.24 +++ b/lustre/kernel_patches/series/vanilla-2.4.24 @@ -47,3 +47,4 @@ 
uml-exprt-clearuser.patch fsprivate-2.4.patch nfsd_iallocsem.patch linux-2.4.24-jbd-handle-EIO.patch +ext3-sector_t-overflow-2.4.patch diff --git a/lustre/kernel_patches/series/vanilla-2.4.29 b/lustre/kernel_patches/series/vanilla-2.4.29 index bb22e80..7c2e3d9 100644 --- a/lustre/kernel_patches/series/vanilla-2.4.29 +++ b/lustre/kernel_patches/series/vanilla-2.4.29 @@ -43,3 +43,4 @@ kallsyms-2.4.29.patch fsprivate-2.4.patch nfsd_iallocsem.patch linux-2.4.24-jbd-handle-EIO.patch +ext3-sector_t-overflow-2.4.patch diff --git a/lustre/kernel_patches/series/vanilla-2.4.29-uml b/lustre/kernel_patches/series/vanilla-2.4.29-uml index 3bd5fd2..5ee53f9 100644 --- a/lustre/kernel_patches/series/vanilla-2.4.29-uml +++ b/lustre/kernel_patches/series/vanilla-2.4.29-uml @@ -45,3 +45,4 @@ remove-suid-2.4-rhel.patch kallsyms-2.4.29.patch fsprivate-2.4.patch nfsd_iallocsem.patch +ext3-sector_t-overflow-2.4.patch diff --git a/lustre/kernel_patches/which_patch b/lustre/kernel_patches/which_patch index bc48f94..29121c2 100644 --- a/lustre/kernel_patches/which_patch +++ b/lustre/kernel_patches/which_patch @@ -19,4 +19,3 @@ NB - The patches in the ldiskfs series should not be applied to the kernel. 
UNSUPPORTED KERNELS; BEING PHASED OUT; MAY BE MISSING CRITICAL BUG FIXES: hp-pnnl-2.4.20 linux-2.4.20-hp4_pnnl1 same as vanilla but no uml ia64 vanilla-2.4.24 linux-2.4.24 patch with uml-2.4.24-6 um -suse-2.4.21-jvn linux-2.4.21-241 sles8 2.4 kernel i386 diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index f2392e5..95c44dc 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -58,6 +58,12 @@ inline cfs_time_t round_timeout(cfs_time_t timeout) return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1); } +/* timeout for initial callback (AST) reply */ +static inline unsigned int ldlm_get_rq_timeout(unsigned int ldlm_timeout, unsigned int obd_timeout) +{ + return max(min(ldlm_timeout, obd_timeout / 3), 1U); +} + #ifdef __KERNEL__ /* w_l_spinlock protects both waiting_locks_list and expired_lock_thread */ static spinlock_t waiting_locks_spinlock; @@ -536,7 +542,7 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, l_unlock(&lock->l_resource->lr_namespace->ns_lock); req->rq_send_state = LUSTRE_IMP_FULL; - req->rq_timeout = ldlm_timeout; /* timeout for initial AST reply */ + req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout); /* timeout for initial AST reply */ if (unlikely(instant_cancel)) { rc = ptl_send_rpc(req, 1); } else { @@ -609,7 +615,7 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) ptlrpc_req_set_repsize(req, 1, NULL); req->rq_send_state = LUSTRE_IMP_FULL; - req->rq_timeout = ldlm_timeout; /* timeout for initial AST reply */ + req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout); /* timeout for initial AST reply */ /* We only send real blocking ASTs after the lock is granted */ l_lock(&lock->l_resource->lr_namespace->ns_lock); @@ -673,7 +679,7 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) ptlrpc_req_set_repsize(req, 2, size); req->rq_send_state = LUSTRE_IMP_FULL; - req->rq_timeout = ldlm_timeout; /* timeout for initial AST 
reply */ + req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout); /* timeout for initial AST reply */ rc = ptlrpc_queue_wait(req); if (rc == -ELDLM_NO_LOCK_DATA) @@ -1795,6 +1801,8 @@ EXPORT_SYMBOL(ldlm_glimpse_ast); EXPORT_SYMBOL(ldlm_expired_completion_wait); EXPORT_SYMBOL(ldlm_cli_convert); EXPORT_SYMBOL(ldlm_cli_enqueue); +EXPORT_SYMBOL(ldlm_cli_enqueue_fini); +EXPORT_SYMBOL(ldlm_cli_enqueue_local); EXPORT_SYMBOL(ldlm_cli_cancel); EXPORT_SYMBOL(ldlm_cli_cancel_unused); EXPORT_SYMBOL(ldlm_cli_join_lru); diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index d8945a9..ead2009 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -224,23 +224,20 @@ int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp) return -ELDLM_NO_LOCK_DATA; } -static int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, - struct ldlm_res_id res_id, - __u32 type, - ldlm_policy_data_t *policy, - ldlm_mode_t mode, - int *flags, - ldlm_blocking_callback blocking, - ldlm_completion_callback completion, - ldlm_glimpse_callback glimpse, - void *data, __u32 lvb_len, - void *lvb_swabber, - struct lustre_handle *lockh) +int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, struct ldlm_res_id res_id, + ldlm_type_t type, ldlm_policy_data_t *policy, + ldlm_mode_t mode, int *flags, + ldlm_blocking_callback blocking, + ldlm_completion_callback completion, + ldlm_glimpse_callback glimpse, + void *data, __u32 lvb_len, void *lvb_swabber, + struct lustre_handle *lockh) { struct ldlm_lock *lock; int err; ENTRY; + LASSERT(!(*flags & LDLM_FL_REPLAY)); if (ns->ns_client) { CERROR("Trying to enqueue local lock in a shadow namespace\n"); LBUG(); @@ -303,113 +300,21 @@ static void failed_lock_cleanup(struct ldlm_namespace *ns, } } -int ldlm_cli_enqueue(struct obd_export *exp, - struct ptlrpc_request *req, - struct ldlm_namespace *ns, - struct ldlm_res_id res_id, - __u32 type, - ldlm_policy_data_t *policy, - ldlm_mode_t mode, - int *flags, - ldlm_blocking_callback 
blocking, - ldlm_completion_callback completion, - ldlm_glimpse_callback glimpse, - void *data, - void *lvb, - __u32 lvb_len, - void *lvb_swabber, - struct lustre_handle *lockh) +int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, + ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode, + int *flags, void *lvb, __u32 lvb_len, + void *lvb_swabber, struct lustre_handle *lockh,int rc) { + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + int is_replay = *flags & LDLM_FL_REPLAY; struct ldlm_lock *lock; - struct ldlm_request *body; struct ldlm_reply *reply; - int size[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), - [DLM_LOCKREQ_OFF] = sizeof(*body), - [DLM_REPLY_REC_OFF] = lvb_len }; - int is_replay = *flags & LDLM_FL_REPLAY; - int req_passed_in = 1, cleanup_phase = 0, rc; + int cleanup_phase = 1; ENTRY; - if (exp == NULL) { - LASSERT(!is_replay); - rc = ldlm_cli_enqueue_local(ns, res_id, type, policy, mode, - flags, blocking, completion, - glimpse, data, lvb_len, lvb_swabber, - lockh); - RETURN(rc); - } - - /* If we're replaying this lock, just check some invariants. - * If we're creating a new lock, get everything all setup nice. */ - if (is_replay) { - lock = ldlm_handle2lock(lockh); - LDLM_DEBUG(lock, "client-side enqueue START"); - LASSERT(exp == lock->l_conn_export); - } else { - lock = ldlm_lock_create(ns, NULL, res_id, type, mode, blocking, - completion, glimpse, data, lvb_len); - if (lock == NULL) - RETURN(-ENOMEM); - /* for the local lock, add the reference */ - ldlm_lock_addref_internal(lock, mode); - ldlm_lock2handle(lock, lockh); - lock->l_lvb_swabber = lvb_swabber; - if (policy != NULL) { - /* INODEBITS_INTEROP: If the server does not support - * inodebits, we will request a plain lock in the - * descriptor (ldlm_lock2desc() below) but use an - * inodebits lock internally with both bits set. 
- */ - if (type == LDLM_IBITS && !(exp->exp_connect_flags & - OBD_CONNECT_IBITS)) - lock->l_policy_data.l_inodebits.bits = - MDS_INODELOCK_LOOKUP | - MDS_INODELOCK_UPDATE; - else - lock->l_policy_data = *policy; - } - - if (type == LDLM_EXTENT) - lock->l_req_extent = policy->l_extent; - LDLM_DEBUG(lock, "client-side enqueue START"); - } - - /* lock not sent to server yet */ - cleanup_phase = 2; - - if (req == NULL) { - req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_DLM_VERSION, - LDLM_ENQUEUE, 2, size, NULL); - if (req == NULL) - GOTO(cleanup, rc = -ENOMEM); - req_passed_in = 0; - } else { - LASSERTF(lustre_msg_buflen(req->rq_reqmsg, DLM_LOCKREQ_OFF) == - sizeof(*body), "buflen[%d] = %d, not %d\n", - DLM_LOCKREQ_OFF, - lustre_msg_buflen(req->rq_reqmsg, DLM_LOCKREQ_OFF), - sizeof(*body)); - } - - lock->l_conn_export = exp; - lock->l_export = NULL; - lock->l_blocking_ast = blocking; - - /* Dump lock data into the request buffer */ - body = lustre_msg_buf(req->rq_reqmsg, DLM_LOCKREQ_OFF, sizeof(*body)); - ldlm_lock2desc(lock, &body->lock_desc); - body->lock_flags = *flags; - body->lock_handle1 = *lockh; - - /* Continue as normal. */ - if (!req_passed_in) { - size[DLM_LOCKREPLY_OFF] = sizeof(*reply); - ptlrpc_req_set_repsize(req, 2 + (lvb_len > 0), size); - - } - LDLM_DEBUG(lock, "sending request"); - rc = ptlrpc_queue_wait(req); - + lock = ldlm_handle2lock(lockh); + /* ldlm_cli_enqueue is holding a reference on this lock. */ + LASSERT(lock != NULL); if (rc != ELDLM_OK) { LASSERT(!is_replay); LDLM_DEBUG(lock, "client-side enqueue END (%s)", @@ -438,14 +343,6 @@ int ldlm_cli_enqueue(struct obd_export *exp, GOTO(cleanup, rc); } - /* - * Liblustre client doesn't get extent locks, except for O_APPEND case - * where [0, OBD_OBJECT_EOF] lock is taken, or truncate, where - * [i_size, OBD_OBJECT_EOF] lock is taken. 
- */ - LASSERT(ergo(LIBLUSTRE_CLIENT, type != LDLM_EXTENT || - policy->l_extent.end == OBD_OBJECT_EOF)); - reply = lustre_swab_repbuf(req, DLM_LOCKREPLY_OFF, sizeof(*reply), lustre_swab_ldlm_reply); if (reply == NULL) { @@ -454,7 +351,7 @@ int ldlm_cli_enqueue(struct obd_export *exp, } /* lock enqueued on the server */ - cleanup_phase = 1; + cleanup_phase = 0; l_lock(&ns->ns_lock); lock->l_remote_handle = reply->lock_handle; @@ -495,7 +392,7 @@ int ldlm_cli_enqueue(struct obd_export *exp, } LDLM_DEBUG(lock, "client-side enqueue, new resource"); } - if (policy != NULL) + if (with_policy) if (!(type == LDLM_IBITS && !(exp->exp_connect_flags & OBD_CONNECT_IBITS))) lock->l_policy_data = @@ -533,7 +430,7 @@ int ldlm_cli_enqueue(struct obd_export *exp, if (!rc) rc = err; if (rc) - cleanup_phase = 2; + cleanup_phase = 1; } } @@ -546,17 +443,142 @@ int ldlm_cli_enqueue(struct obd_export *exp, LDLM_DEBUG(lock, "client-side enqueue END"); EXIT; cleanup: - switch (cleanup_phase) { - case 2: - if (rc) + if (cleanup_phase == 1 && rc) + failed_lock_cleanup(ns, lock, lockh, mode); + /* Put lock 2 times, the second reference is held by ldlm_cli_enqueue */ + LDLM_LOCK_PUT(lock); + LDLM_LOCK_PUT(lock); + return rc; +} + +/* If a request has some specific initialisation it is passed in @reqp, + * otherwise it is created in ldlm_cli_enqueue. + * + * Supports sync and async requests, pass @async flag accordingly. If a + * request was created in ldlm_cli_enqueue and it is the async request, + * pass it to the caller in @reqp. 
*/ +int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, + struct ldlm_res_id res_id, ldlm_type_t type, + ldlm_policy_data_t *policy, ldlm_mode_t mode, int *flags, + ldlm_blocking_callback blocking, + ldlm_completion_callback completion, + ldlm_glimpse_callback glimpse, + void *data, void *lvb, __u32 lvb_len, void *lvb_swabber, + struct lustre_handle *lockh, int async) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + struct ldlm_lock *lock; + struct ldlm_request *body; + struct ldlm_reply *reply; + int size[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), + [DLM_LOCKREQ_OFF] = sizeof(*body), + [DLM_REPLY_REC_OFF] = lvb_len }; + int is_replay = *flags & LDLM_FL_REPLAY; + int req_passed_in = 1, rc; + struct ptlrpc_request *req; + ENTRY; + + LASSERT(exp != NULL); + + /* If we're replaying this lock, just check some invariants. + * If we're creating a new lock, get everything all setup nice. */ + if (is_replay) { + lock = ldlm_handle2lock(lockh); + LDLM_DEBUG(lock, "client-side enqueue START"); + LASSERT(exp == lock->l_conn_export); + } else { + lock = ldlm_lock_create(ns, NULL, res_id, type, mode, blocking, + completion, glimpse, data, lvb_len); + if (lock == NULL) + RETURN(-ENOMEM); + /* for the local lock, add the reference */ + ldlm_lock_addref_internal(lock, mode); + ldlm_lock2handle(lock, lockh); + lock->l_lvb_swabber = lvb_swabber; + if (policy != NULL) { + /* INODEBITS_INTEROP: If the server does not support + * inodebits, we will request a plain lock in the + * descriptor (ldlm_lock2desc() below) but use an + * inodebits lock internally with both bits set. 
+ */ + if (type == LDLM_IBITS && !(exp->exp_connect_flags & + OBD_CONNECT_IBITS)) + lock->l_policy_data.l_inodebits.bits = + MDS_INODELOCK_LOOKUP | + MDS_INODELOCK_UPDATE; + else + lock->l_policy_data = *policy; + } + + if (type == LDLM_EXTENT) + lock->l_req_extent = policy->l_extent; + LDLM_DEBUG(lock, "client-side enqueue START"); + } + + /* lock not sent to server yet */ + + if (reqp == NULL || *reqp == NULL) { + req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_DLM_VERSION, + LDLM_ENQUEUE, 2, size, NULL); + if (req == NULL) { failed_lock_cleanup(ns, lock, lockh, mode); - case 1: - if (!req_passed_in && req != NULL) - ptlrpc_req_finished(req); + LDLM_LOCK_PUT(lock); + RETURN(-ENOMEM); + } + req_passed_in = 0; + if (reqp) + *reqp = req; + } else { + req = *reqp; + LASSERTF(lustre_msg_buflen(req->rq_reqmsg, DLM_LOCKREQ_OFF) == + sizeof(*body), "buflen[%d] = %d, not %d\n", + DLM_LOCKREQ_OFF, + lustre_msg_buflen(req->rq_reqmsg, DLM_LOCKREQ_OFF), + sizeof(*body)); } - LDLM_LOCK_PUT(lock); - return rc; + lock->l_conn_export = exp; + lock->l_export = NULL; + lock->l_blocking_ast = blocking; + + /* Dump lock data into the request buffer */ + body = lustre_msg_buf(req->rq_reqmsg, DLM_LOCKREQ_OFF, sizeof(*body)); + ldlm_lock2desc(lock, &body->lock_desc); + body->lock_flags = *flags; + body->lock_handle1 = *lockh; + + /* Continue as normal. */ + if (!req_passed_in) { + size[DLM_LOCKREPLY_OFF] = sizeof(*reply); + ptlrpc_req_set_repsize(req, 2 + (lvb_len > 0), size); + } + + /* + * Liblustre client doesn't get extent locks, except for O_APPEND case + * where [0, OBD_OBJECT_EOF] lock is taken, or truncate, where + * [i_size, OBD_OBJECT_EOF] lock is taken. + */ + LASSERT(ergo(LIBLUSTRE_CLIENT, type != LDLM_EXTENT || + policy->l_extent.end == OBD_OBJECT_EOF)); + + if (async) { + LASSERT(reqp != NULL); + RETURN(0); + } + + LDLM_DEBUG(lock, "sending request"); + rc = ptlrpc_queue_wait(req); + rc = ldlm_cli_enqueue_fini(exp, req, type, policy ? 
1 : 0, + mode, flags, lvb, lvb_len, lvb_swabber, + lockh, rc); + + if (!req_passed_in && req != NULL) { + ptlrpc_req_finished(req); + if (reqp) + *reqp = NULL; + } + + RETURN(rc); } static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode, diff --git a/lustre/liblustre/file.c b/lustre/liblustre/file.c index db886b7..1cf9841 100644 --- a/lustre/liblustre/file.c +++ b/lustre/liblustre/file.c @@ -422,37 +422,40 @@ static void llu_truncate(struct inode *inode, obd_flag flags) { struct llu_inode_info *lli = llu_i2info(inode); struct intnl_stat *st = llu_i2stat(inode); - struct lov_stripe_md *lsm = lli->lli_smd; - struct obdo oa = {0}; + struct obd_info oinfo = { { { 0 } } }; + struct obdo oa = { 0 }; int rc; ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:inode=%llu/%lu(%p) to %llu\n", (long long)st->st_ino, lli->lli_st_generation, inode, (long long)st->st_size); - if (!lsm) { + if (!lli->lli_smd) { CDEBUG(D_INODE, "truncate on inode %llu with no objects\n", (long long)st->st_ino); EXIT; return; } - oa.o_id = lsm->lsm_object_id; + oinfo.oi_md = lli->lli_smd; + oinfo.oi_policy.l_extent.start = st->st_size; + oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF; + oinfo.oi_oa = &oa; + oa.o_id = lli->lli_smd->lsm_object_id; oa.o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS; oa.o_flags = flags; /* We don't actually want to copy inode flags */ - + obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME); - obd_adjust_kms(llu_i2obdexp(inode), lsm, st->st_size, 1); + obd_adjust_kms(llu_i2obdexp(inode), lli->lli_smd, st->st_size, 1); CDEBUG(D_INFO, "calling punch for "LPX64" (all bytes after %Lu)\n", oa.o_id, (long long)st->st_size); /* truncate == punch from new size to absolute end of file */ - rc = obd_punch(llu_i2obdexp(inode), &oa, lsm, st->st_size, - OBD_OBJECT_EOF, NULL); + rc = obd_punch_rqset(llu_i2obdexp(inode), &oinfo, NULL); if (rc) CERROR("obd_truncate fails (%d) ino %llu\n", rc, (long long)st->st_ino); diff --git 
a/lustre/liblustre/rw.c b/lustre/liblustre/rw.c index 2c5f924..ee43f1f 100644 --- a/lustre/liblustre/rw.c +++ b/lustre/liblustre/rw.c @@ -225,18 +225,28 @@ int llu_glimpse_size(struct inode *inode) struct llu_inode_info *lli = llu_i2info(inode); struct intnl_stat *st = llu_i2stat(inode); struct llu_sb_info *sbi = llu_i2sbi(inode); - ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } }; struct lustre_handle lockh = { 0 }; + struct obd_enqueue_info einfo = { 0 }; + struct obd_info oinfo = { { { 0 } } }; struct ost_lvb lvb; - int rc, flags = LDLM_FL_HAS_INTENT; + int rc; ENTRY; CDEBUG(D_DLMTRACE, "Glimpsing inode %llu\n", (long long)st->st_ino); - rc = obd_enqueue(sbi->ll_osc_exp, lli->lli_smd, LDLM_EXTENT, &policy, - LCK_PR, &flags, llu_extent_lock_callback, - ldlm_completion_ast, llu_glimpse_callback, inode, - sizeof(struct ost_lvb), lustre_swab_ost_lvb, &lockh); + einfo.ei_type = LDLM_EXTENT; + einfo.ei_mode = LCK_PR; + einfo.ei_flags = LDLM_FL_HAS_INTENT; + einfo.ei_cb_bl = llu_extent_lock_callback; + einfo.ei_cb_cp = ldlm_completion_ast; + einfo.ei_cb_gl = llu_glimpse_callback; + einfo.ei_cbdata = inode; + + oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF; + oinfo.oi_lockh = &lockh; + oinfo.oi_md = lli->lli_smd; + + rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo); if (rc) { CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc); RETURN(rc > 0 ? 
-EIO : rc); @@ -253,8 +263,6 @@ int llu_glimpse_size(struct inode *inode) CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n", (long long)st->st_size, (long long)st->st_blocks); - obd_cancel(sbi->ll_osc_exp, lli->lli_smd, LCK_PR, &lockh); - RETURN(rc); } @@ -265,6 +273,8 @@ int llu_extent_lock(struct ll_file_data *fd, struct inode *inode, { struct llu_sb_info *sbi = llu_i2sbi(inode); struct intnl_stat *st = llu_i2stat(inode); + struct obd_enqueue_info einfo = { 0 }; + struct obd_info oinfo = { { { 0 } } }; struct ost_lvb lvb; int rc; ENTRY; @@ -281,10 +291,20 @@ int llu_extent_lock(struct ll_file_data *fd, struct inode *inode, (long long)st->st_ino, policy->l_extent.start, policy->l_extent.end); - rc = obd_enqueue(sbi->ll_osc_exp, lsm, LDLM_EXTENT, policy, mode, - &ast_flags, llu_extent_lock_callback, - ldlm_completion_ast, llu_glimpse_callback, inode, - sizeof(struct ost_lvb), lustre_swab_ost_lvb, lockh); + einfo.ei_type = LDLM_EXTENT; + einfo.ei_mode = mode; + einfo.ei_flags = ast_flags; + einfo.ei_cb_bl = llu_extent_lock_callback; + einfo.ei_cb_cp = ldlm_completion_ast; + einfo.ei_cb_gl = llu_glimpse_callback; + einfo.ei_cbdata = inode; + + oinfo.oi_policy = *policy; + oinfo.oi_lockh = lockh; + oinfo.oi_md = lsm; + + rc = obd_enqueue(sbi->ll_osc_exp, &oinfo, &einfo); + *policy = oinfo.oi_policy; if (rc > 0) rc = -EIO; diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c index 4ee0dc9..7b3d2d7 100644 --- a/lustre/liblustre/super.c +++ b/lustre/liblustre/super.c @@ -308,7 +308,8 @@ int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm) struct llu_inode_info *lli = llu_i2info(inode); struct obd_export *exp = llu_i2obdexp(inode); struct ptlrpc_request_set *set; - struct obdo oa; + struct obd_info oinfo = { { { 0 } } }; + struct obdo oa = { 0 }; obd_flag refresh_valid; int rc; ENTRY; @@ -316,7 +317,8 @@ int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm) LASSERT(lsm); LASSERT(lli); - memset(&oa, 0, sizeof oa); + 
oinfo.oi_md = lsm; + oinfo.oi_oa = &oa; oa.o_id = lsm->lsm_object_id; oa.o_mode = S_IFREG; oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE | @@ -328,7 +330,7 @@ int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm) CERROR ("ENOMEM allocing request set\n"); rc = -ENOMEM; } else { - rc = obd_getattr_async(exp, &oa, lsm, set); + rc = obd_getattr_async(exp, &oinfo, set); if (rc == 0) rc = ptlrpc_set_wait(set); ptlrpc_set_destroy(set); @@ -781,17 +783,23 @@ int llu_setattr_raw(struct inode *inode, struct iattr *attr) rc = err; } } else if (ia_valid & (ATTR_MTIME | ATTR_MTIME_SET)) { + struct obd_info oinfo = { { { 0 } } }; struct obdo oa; CDEBUG(D_INODE, "set mtime on OST inode %llu to %lu\n", (long long)st->st_ino, LTIME_S(attr->ia_mtime)); oa.o_id = lsm->lsm_object_id; oa.o_valid = OBD_MD_FLID; + obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME); - rc = obd_setattr(sbi->ll_osc_exp, &oa, lsm, NULL); + + oinfo.oi_oa = &oa; + oinfo.oi_md = lsm; + + rc = obd_setattr_rqset(sbi->ll_osc_exp, &oinfo, NULL); if (rc) - CERROR("obd_setattr fails: rc=%d\n", rc); + CERROR("obd_setattr_async fails: rc=%d\n", rc); } RETURN(rc); } @@ -1112,7 +1120,8 @@ static int llu_statfs_internal(struct llu_sb_info *sbi, CDEBUG(D_SUPER, "MDC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n", osfs->os_bavail, osfs->os_blocks, osfs->os_ffree,osfs->os_files); - rc = obd_statfs(class_exp2obd(sbi->ll_osc_exp), &obd_osfs, max_age); + rc = obd_statfs_rqset(class_exp2obd(sbi->ll_osc_exp), + &obd_statfs, max_age); if (rc) { CERROR("obd_statfs fails: rc = %d\n", rc); RETURN(rc); @@ -1271,7 +1280,6 @@ static int llu_file_flock(struct inode *ino, int cmd, struct file_lock *file_lock) { - struct obd_device *obddev; struct llu_inode_info *lli = llu_i2info(ino); struct intnl_stat *st = llu_i2stat(ino); struct ldlm_res_id res_id = @@ -1340,11 +1348,10 @@ static int llu_file_flock(struct inode *ino, "start="LPU64", end="LPU64"\n", st->st_ino, 
flock.l_flock.pid, flags, mode, flock.l_flock.start, flock.l_flock.end); - obddev = llu_i2mdcexp(ino)->exp_obd; - rc = ldlm_cli_enqueue(llu_i2mdcexp(ino), NULL, obddev->obd_namespace, - res_id, LDLM_FLOCK, &flock, mode, &flags, - NULL, ldlm_flock_completion_ast, NULL, file_lock, - NULL, 0, NULL, &lockh); + rc = ldlm_cli_enqueue(llu_i2mdcexp(ino), NULL, res_id, + LDLM_FLOCK, &flock, mode, &flags, NULL, + ldlm_flock_completion_ast, NULL, + file_lock, NULL, 0, NULL, &lockh, 0); RETURN(rc); } diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c index d2c2a4f..2a09cb0 100644 --- a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -185,19 +185,18 @@ Espan: // error = "inode out of bounds"; bad_entry: CERROR("%s: bad entry in directory %lu/%u: %s - " - "offset=%lu+%u, inode=%lu, rec_len=%d, name_len=%d", - ll_i2mdcexp(dir)->exp_obd->obd_name, dir->i_ino, - dir->i_generation, error, (page->index<inode), - rec_len, p->name_len); + "offset=%lu+%u, inode=%lu, rec_len=%d, name_len=%d\n", + ll_i2mdcexp(dir)->exp_obd->obd_name, dir->i_ino, + dir->i_generation, error, page->index << PAGE_CACHE_SHIFT, offs, + (unsigned long)le32_to_cpu(p->inode), + rec_len, p->name_len); goto fail; Eend: p = (ext2_dirent *)(kaddr + offs); - CERROR("ext2_check_page" - "entry in directory #%lu spans the page boundary" - "offset=%lu, inode=%lu", - dir->i_ino, (page->index<inode)); + CERROR("%s: entry in directory %lu/%u spans the page boundary " + "offset=%lu+%u, inode=%lu\n",ll_i2mdcexp(dir)->exp_obd->obd_name, + dir->i_ino, dir->i_generation, page->index << PAGE_CACHE_SHIFT, + offs, (unsigned long)le32_to_cpu(p->inode)); fail: SetPageChecked(page); SetPageError(page); @@ -602,7 +601,7 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, struct lov_stripe_md *lsm; struct lov_user_md_join *lmj; int lmj_size, i, aindex = 0, rc; - + rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize); if (rc < 0) GOTO(out_req, rc = -ENOMEM); @@ -652,7 +651,7 @@ static int ll_dir_ioctl(struct inode *inode, 
struct file *file, out_free_memmd: obd_free_memmd(sbi->ll_osc_exp, &lsm); if (rc) - GOTO(out_req, rc); + GOTO(out_lmm, rc); } if (cmd == IOC_MDC_GETFILEINFO) { struct lov_user_mds_data *lmdp; @@ -675,19 +674,20 @@ out_free_memmd: lmdp = (struct lov_user_mds_data *)arg; rc = copy_to_user(&lmdp->lmd_st, &st, sizeof(st)); if (rc) - GOTO(out_req, rc = -EFAULT); + GOTO(out_lmm, rc = -EFAULT); lump = &lmdp->lmd_lmm; } else { lump = (struct lov_user_md *)arg; } rc = copy_to_user(lump, lmm, lmmsize); - if (lmm->lmm_magic == LOV_MAGIC_JOIN) - OBD_FREE(lmm, lmmsize); if (rc) - GOTO(out_req, rc = -EFAULT); + GOTO(out_lmm, rc = -EFAULT); EXIT; + out_lmm: + if (lmm->lmm_magic == LOV_MAGIC_JOIN) + OBD_FREE(lmm, lmmsize); out_req: ptlrpc_req_finished(request); out_name: diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 9e2724c..f5f3f89 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -502,12 +502,15 @@ int ll_lsm_getattr(struct obd_export *exp, struct lov_stripe_md *lsm, struct obdo *oa) { struct ptlrpc_request_set *set; + struct obd_info oinfo = { { { 0 } } }; int rc; ENTRY; LASSERT(lsm != NULL); memset(oa, 0, sizeof *oa); + oinfo.oi_md = lsm; + oinfo.oi_oa = oa; oa->o_id = lsm->lsm_object_id; oa->o_mode = S_IFREG; oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE | @@ -518,7 +521,7 @@ int ll_lsm_getattr(struct obd_export *exp, struct lov_stripe_md *lsm, if (set == NULL) { rc = -ENOMEM; } else { - rc = obd_getattr_async(exp, oa, lsm, set); + rc = obd_getattr_async(exp, &oinfo, set); if (rc == 0) rc = ptlrpc_set_wait(set); ptlrpc_set_destroy(set); @@ -905,8 +908,9 @@ int ll_glimpse_size(struct inode *inode, int ast_flags) { struct ll_inode_info *lli = ll_i2info(inode); struct ll_sb_info *sbi = ll_i2sbi(inode); - ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } }; struct lustre_handle lockh = { 0 }; + struct obd_enqueue_info einfo = { 0 }; + struct obd_info oinfo = { { { 0 } } }; struct ost_lvb lvb; int rc; ENTRY; @@ -918,8 +922,6 @@ 
int ll_glimpse_size(struct inode *inode, int ast_flags) RETURN(0); } - ast_flags |= LDLM_FL_HAS_INTENT; - /* NOTE: this looks like DLM lock request, but it may not be one. Due * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that * won't revoke any conflicting DLM locks held. Instead, @@ -927,10 +929,19 @@ int ll_glimpse_size(struct inode *inode, int ast_flags) * holding a DLM lock against this file, and resulting size * will be returned for each stripe. DLM lock on [0, EOF] is * acquired only if there were no conflicting locks. */ - rc = obd_enqueue(sbi->ll_osc_exp, lli->lli_smd, LDLM_EXTENT, &policy, - LCK_PR, &ast_flags, ll_extent_lock_callback, - ldlm_completion_ast, ll_glimpse_callback, inode, - sizeof(struct ost_lvb), lustre_swab_ost_lvb, &lockh); + einfo.ei_type = LDLM_EXTENT; + einfo.ei_mode = LCK_PR; + einfo.ei_flags = ast_flags | LDLM_FL_HAS_INTENT; + einfo.ei_cb_bl = ll_extent_lock_callback; + einfo.ei_cb_cp = ldlm_completion_ast; + einfo.ei_cb_gl = ll_glimpse_callback; + einfo.ei_cbdata = inode; + + oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF; + oinfo.oi_lockh = &lockh; + oinfo.oi_md = lli->lli_smd; + + rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo); if (rc == -ENOENT) RETURN(rc); if (rc != 0) { @@ -951,8 +962,6 @@ int ll_glimpse_size(struct inode *inode, int ast_flags) CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %lu\n", inode->i_size, inode->i_blocks); - obd_cancel(sbi->ll_osc_exp, lli->lli_smd, LCK_PR, &lockh); - RETURN(rc); } @@ -963,6 +972,8 @@ int ll_extent_lock(struct ll_file_data *fd, struct inode *inode, { struct ll_sb_info *sbi = ll_i2sbi(inode); struct ost_lvb lvb; + struct obd_enqueue_info einfo = { 0 }; + struct obd_info oinfo = { { { 0 } } }; int rc; ENTRY; @@ -981,10 +992,20 @@ int ll_extent_lock(struct ll_file_data *fd, struct inode *inode, CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n", inode->i_ino, policy->l_extent.start, policy->l_extent.end); - rc = obd_enqueue(sbi->ll_osc_exp, lsm, 
LDLM_EXTENT, policy, mode, - &ast_flags, ll_extent_lock_callback, - ldlm_completion_ast, ll_glimpse_callback, inode, - sizeof(struct ost_lvb), lustre_swab_ost_lvb, lockh); + einfo.ei_type = LDLM_EXTENT; + einfo.ei_mode = mode; + einfo.ei_flags = ast_flags; + einfo.ei_cb_bl = ll_extent_lock_callback; + einfo.ei_cb_cp = ldlm_completion_ast; + einfo.ei_cb_gl = ll_glimpse_callback; + einfo.ei_cbdata = inode; + + oinfo.oi_policy = *policy; + oinfo.oi_lockh = lockh; + oinfo.oi_md = lsm; + + rc = obd_enqueue(sbi->ll_osc_exp, &oinfo, &einfo); + *policy = oinfo.oi_policy; if (rc > 0) rc = -EIO; @@ -1933,7 +1954,6 @@ int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) { struct inode *inode = file->f_dentry->d_inode; struct ll_sb_info *sbi = ll_i2sbi(inode); - struct obd_device *obddev; struct ldlm_res_id res_id = { .name = {inode->i_ino, inode->i_generation, LDLM_FLOCK} }; struct lustre_handle lockh = {0}; @@ -2004,11 +2024,10 @@ int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid, flags, mode, flock.l_flock.start, flock.l_flock.end); - obddev = sbi->ll_mdc_exp->exp_obd; - rc = ldlm_cli_enqueue(sbi->ll_mdc_exp, NULL, obddev->obd_namespace, - res_id, LDLM_FLOCK, &flock, mode, &flags, - NULL, ldlm_flock_completion_ast, NULL, file_lock, - NULL, 0, NULL, &lockh); + rc = ldlm_cli_enqueue(sbi->ll_mdc_exp, NULL, res_id, + LDLM_FLOCK, &flock, mode, &flags, NULL, + ldlm_flock_completion_ast, NULL, file_lock, + NULL, 0, NULL, &lockh, 0); RETURN(rc); } diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index c773745..b233bd1 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -1289,22 +1289,27 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr) } } else if (ia_valid & (ATTR_MTIME | ATTR_MTIME_SET)) { obd_flag flags; + struct obd_info oinfo = { { { 0 } } }; struct obdo oa; CDEBUG(D_INODE, "set mtime on OST inode %lu to %lu\n", 
inode->i_ino, LTIME_S(attr->ia_mtime)); - + oa.o_id = lsm->lsm_object_id; oa.o_valid = OBD_MD_FLID; flags = OBD_MD_FLTYPE | OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLFID | OBD_MD_FLGENER; - + obdo_from_inode(&oa, inode, flags); - rc = obd_setattr(sbi->ll_osc_exp, &oa, lsm, NULL); + + oinfo.oi_oa = &oa; + oinfo.oi_md = lsm; + + rc = obd_setattr_rqset(sbi->ll_osc_exp, &oinfo, NULL); if (rc) - CERROR("obd_setattr fails: rc=%d\n", rc); + CERROR("obd_setattr_async fails: rc=%d\n", rc); } RETURN(rc); } @@ -1333,7 +1338,8 @@ int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, CDEBUG(D_SUPER, "MDC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n", osfs->os_bavail, osfs->os_blocks, osfs->os_ffree,osfs->os_files); - rc = obd_statfs(class_exp2obd(sbi->ll_osc_exp), &obd_osfs, max_age); + rc = obd_statfs_rqset(class_exp2obd(sbi->ll_osc_exp), + &obd_osfs, max_age); if (rc) { CERROR("obd_statfs fails: rc = %d\n", rc); RETURN(rc); @@ -1645,42 +1651,42 @@ int ll_iocontrol(struct inode *inode, struct file *file, } case EXT3_IOC_SETFLAGS: { struct mdc_op_data op_data; - struct ll_iattr_struct attr; - struct obdo *oa; + struct ll_iattr_struct attr = { 0 }; + struct obd_info oinfo = { { { 0 } } }; struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; if (get_user(flags, (int *)arg)) RETURN(-EFAULT); - oa = obdo_alloc(); - if (!oa) + oinfo.oi_md = lsm; + oinfo.oi_oa = obdo_alloc(); + if (!oinfo.oi_oa) RETURN(-ENOMEM); ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0); - memset(&attr, 0x0, sizeof(attr)); attr.ia_attr_flags = flags; ((struct iattr *)&attr)->ia_valid |= ATTR_ATTR_FLAG; rc = mdc_setattr(sbi->ll_mdc_exp, &op_data, (struct iattr *)&attr, NULL, 0, NULL, 0, &req); + ptlrpc_req_finished(req); if (rc || lsm == NULL) { - ptlrpc_req_finished(req); - obdo_free(oa); + obdo_free(oinfo.oi_oa); RETURN(rc); } - ptlrpc_req_finished(req); - oa->o_id = lsm->lsm_object_id; - oa->o_flags = flags; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS; + 
oinfo.oi_oa->o_id = lsm->lsm_object_id; + oinfo.oi_oa->o_flags = flags; + oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS; - obdo_from_inode(oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER); - rc = obd_setattr(sbi->ll_osc_exp, oa, lsm, NULL); - obdo_free(oa); + obdo_from_inode(oinfo.oi_oa, inode, + OBD_MD_FLFID | OBD_MD_FLGENER); + rc = obd_setattr_rqset(sbi->ll_osc_exp, &oinfo, NULL); + obdo_free(oinfo.oi_oa); if (rc) { if (rc != -EPERM && rc != -EACCES) - CERROR("mdc_setattr fails: rc = %d\n", rc); + CERROR("mdc_setattr_async fails: rc = %d\n", rc); RETURN(rc); } diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c index 7ed9b14..025d6e8 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -180,13 +180,15 @@ static int ll_rd_max_readahead_mb(char *page, char **start, off_t off, { struct super_block *sb = data; struct ll_sb_info *sbi = ll_s2sbi(sb); - unsigned val; + long pages_number; + int mult; spin_lock(&sbi->ll_lock); - val = sbi->ll_ra_info.ra_max_pages >> (20 - PAGE_CACHE_SHIFT); + pages_number = sbi->ll_ra_info.ra_max_pages; spin_unlock(&sbi->ll_lock); - return snprintf(page, count, "%u\n", val); + mult = 1 << (20 - PAGE_CACHE_SHIFT); + return lprocfs_read_frac_helper(page, count, pages_number, mult); } static int ll_wr_max_readahead_mb(struct file *file, const char *buffer, @@ -194,20 +196,21 @@ static int ll_wr_max_readahead_mb(struct file *file, const char *buffer, { struct super_block *sb = data; struct ll_sb_info *sbi = ll_s2sbi(sb); - int val, rc; + int mult, rc, pages_number; - rc = lprocfs_write_helper(buffer, count, &val); + mult = 1 << (20 - PAGE_CACHE_SHIFT); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); if (rc) return rc; - if (val < 0 || val > (num_physpages >> (20 - PAGE_CACHE_SHIFT - 1))) { + if (pages_number < 0 || pages_number > num_physpages / 2) { CERROR("can't set file readahead more than %lu MB\n", - num_physpages >> (20 - PAGE_CACHE_SHIFT - 1)); + num_physpages >> (20 - 
PAGE_CACHE_SHIFT + 1)); /*1/2 of RAM*/ return -ERANGE; } spin_lock(&sbi->ll_lock); - sbi->ll_ra_info.ra_max_pages = val << (20 - PAGE_CACHE_SHIFT); + sbi->ll_ra_info.ra_max_pages = pages_number; spin_unlock(&sbi->ll_lock); return count; @@ -218,14 +221,15 @@ static int ll_rd_max_read_ahead_whole_mb(char *page, char **start, off_t off, { struct super_block *sb = data; struct ll_sb_info *sbi = ll_s2sbi(sb); - unsigned val; + long pages_number; + int mult; spin_lock(&sbi->ll_lock); - val = sbi->ll_ra_info.ra_max_read_ahead_whole_pages >> - (20 - PAGE_CACHE_SHIFT); + pages_number = sbi->ll_ra_info.ra_max_read_ahead_whole_pages; spin_unlock(&sbi->ll_lock); - return snprintf(page, count, "%u\n", val); + mult = 1 << (20 - PAGE_CACHE_SHIFT); + return lprocfs_read_frac_helper(page, count, pages_number, mult); } static int ll_wr_max_read_ahead_whole_mb(struct file *file, const char *buffer, @@ -233,16 +237,16 @@ static int ll_wr_max_read_ahead_whole_mb(struct file *file, const char *buffer, { struct super_block *sb = data; struct ll_sb_info *sbi = ll_s2sbi(sb); - int val, rc; + int mult, rc, pages_number; - rc = lprocfs_write_helper(buffer, count, &val); + mult = 1 << (20 - PAGE_CACHE_SHIFT); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); if (rc) return rc; /* Cap this at the current max readahead window size, the readahead * algorithm does this anyway so it's pointless to set it larger. 
*/ - if (val < 0 || - val > (sbi->ll_ra_info.ra_max_pages >> (20 - PAGE_CACHE_SHIFT))) { + if (pages_number < 0 || pages_number > sbi->ll_ra_info.ra_max_pages) { CERROR("can't set max_read_ahead_whole_mb more than " "max_read_ahead_mb: %lu\n", sbi->ll_ra_info.ra_max_pages >> (20 - PAGE_CACHE_SHIFT)); @@ -250,8 +254,7 @@ static int ll_wr_max_read_ahead_whole_mb(struct file *file, const char *buffer, } spin_lock(&sbi->ll_lock); - sbi->ll_ra_info.ra_max_read_ahead_whole_pages = - val << (20 - PAGE_CACHE_SHIFT); + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = pages_number; spin_unlock(&sbi->ll_lock); return count; @@ -262,13 +265,15 @@ static int ll_rd_max_cached_mb(char *page, char **start, off_t off, { struct super_block *sb = data; struct ll_sb_info *sbi = ll_s2sbi(sb); - unsigned val; + long pages_number; + int mult; spin_lock(&sbi->ll_lock); - val = sbi->ll_async_page_max >> (20 - PAGE_CACHE_SHIFT); + pages_number = sbi->ll_async_page_max; spin_unlock(&sbi->ll_lock); - return snprintf(page, count, "%u\n", val); + mult = 1 << (20 - PAGE_CACHE_SHIFT); + return lprocfs_read_frac_helper(page, count, pages_number, mult);; } static int ll_wr_max_cached_mb(struct file *file, const char *buffer, @@ -276,20 +281,21 @@ static int ll_wr_max_cached_mb(struct file *file, const char *buffer, { struct super_block *sb = data; struct ll_sb_info *sbi = ll_s2sbi(sb); - int val, rc; + int mult, rc, pages_number; - rc = lprocfs_write_helper(buffer, count, &val); + mult = 1 << (20 - PAGE_CACHE_SHIFT); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); if (rc) return rc; - if (val < 0 || val > (num_physpages >> (20 - PAGE_CACHE_SHIFT))) { + if (pages_number < 0 || pages_number > num_physpages) { CERROR("can't set max cache more than %lu MB\n", num_physpages >> (20 - PAGE_CACHE_SHIFT)); return -ERANGE; } spin_lock(&sbi->ll_lock); - sbi->ll_async_page_max = val << (20 - PAGE_CACHE_SHIFT); + sbi->ll_async_page_max = pages_number ; spin_unlock(&sbi->ll_lock); if 
(sbi->ll_async_page_count >= sbi->ll_async_page_max) diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 49083db..56a4694 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -62,6 +62,7 @@ static int ll_brw(int cmd, struct inode *inode, struct obdo *oa, { struct ll_inode_info *lli = ll_i2info(inode); struct lov_stripe_md *lsm = lli->lli_smd; + struct obd_info oinfo = { { { 0 } } }; struct brw_page pg; int rc; ENTRY; @@ -92,7 +93,9 @@ static int ll_brw(int cmd, struct inode *inode, struct obdo *oa, else lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_BRW_READ, pg.count); - rc = obd_brw(cmd, ll_i2obdexp(inode), oa, lsm, 1, &pg, NULL); + oinfo.oi_oa = oa; + oinfo.oi_md = lsm; + rc = obd_brw(cmd, ll_i2obdexp(inode), &oinfo, 1, &pg, NULL); if (rc == 0) obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS); else if (rc != -EIO) @@ -109,7 +112,7 @@ static int ll_brw(int cmd, struct inode *inode, struct obdo *oa, void ll_truncate(struct inode *inode) { struct ll_inode_info *lli = ll_i2info(inode); - struct lov_stripe_md *lsm = lli->lli_smd; + struct obd_info oinfo = { { { 0 } } }; struct ost_lvb lvb; struct obdo oa; int rc; @@ -122,7 +125,7 @@ void ll_truncate(struct inode *inode) return; } - if (!lsm) { + if (!lli->lli_smd) { CDEBUG(D_INODE, "truncate on inode %lu with no objects\n", inode->i_ino); GOTO(out_unlock, 0); @@ -132,18 +135,18 @@ void ll_truncate(struct inode *inode) /* XXX I'm pretty sure this is a hack to paper over a more fundamental * race condition. 
*/ - lov_stripe_lock(lsm); + lov_stripe_lock(lli->lli_smd); inode_init_lvb(inode, &lvb); - obd_merge_lvb(ll_i2obdexp(inode), lsm, &lvb, 0); + obd_merge_lvb(ll_i2obdexp(inode), lli->lli_smd, &lvb, 0); if (lvb.lvb_size == inode->i_size) { CDEBUG(D_VFSTRACE, "skipping punch for obj "LPX64", %Lu=%#Lx\n", - lsm->lsm_object_id, inode->i_size, inode->i_size); - lov_stripe_unlock(lsm); + lli->lli_smd->lsm_object_id, inode->i_size, inode->i_size); + lov_stripe_unlock(lli->lli_smd); GOTO(out_unlock, 0); } - obd_adjust_kms(ll_i2obdexp(inode), lsm, inode->i_size, 1); - lov_stripe_unlock(lsm); + obd_adjust_kms(ll_i2obdexp(inode), lli->lli_smd, inode->i_size, 1); + lov_stripe_unlock(lli->lli_smd); if (unlikely((ll_i2sbi(inode)->ll_flags & LL_SBI_CHECKSUM) && (inode->i_size & ~PAGE_MASK))) { @@ -163,9 +166,13 @@ void ll_truncate(struct inode *inode) } CDEBUG(D_INFO, "calling punch for "LPX64" (new size %Lu=%#Lx)\n", - lsm->lsm_object_id, inode->i_size, inode->i_size); + lli->lli_smd->lsm_object_id, inode->i_size, inode->i_size); - oa.o_id = lsm->lsm_object_id; + oinfo.oi_md = lli->lli_smd; + oinfo.oi_policy.l_extent.start = inode->i_size; + oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF; + oinfo.oi_oa = &oa; + oa.o_id = lli->lli_smd->lsm_object_id; oa.o_valid = OBD_MD_FLID; obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE | @@ -174,8 +181,7 @@ void ll_truncate(struct inode *inode) ll_inode_size_unlock(inode, 0); - rc = obd_punch(ll_i2obdexp(inode), &oa, lsm, inode->i_size, - OBD_OBJECT_EOF, NULL); + rc = obd_punch_rqset(ll_i2obdexp(inode), &oinfo, NULL); if (rc) CERROR("obd_truncate fails (%d) ino %lu\n", rc, inode->i_ino); else @@ -195,6 +201,7 @@ int ll_prepare_write(struct file *file, struct page *page, unsigned from, struct ll_inode_info *lli = ll_i2info(inode); struct lov_stripe_md *lsm = lli->lli_smd; obd_off offset = ((obd_off)page->index) << PAGE_SHIFT; + struct obd_info oinfo = { { { 0 } } }; struct brw_page pga; struct obdo oa; struct ost_lvb lvb; @@ -215,8 
+222,9 @@ int ll_prepare_write(struct file *file, struct page *page, unsigned from, oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE; obdo_from_inode(&oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER); - rc = obd_brw(OBD_BRW_CHECK, ll_i2obdexp(inode), &oa, lsm, - 1, &pga, NULL); + oinfo.oi_oa = &oa; + oinfo.oi_md = lsm; + rc = obd_brw(OBD_BRW_CHECK, ll_i2obdexp(inode), &oinfo, 1, &pga, NULL); if (rc) RETURN(rc); diff --git a/lustre/llite/rw24.c b/lustre/llite/rw24.c index f24baf1..3cded8a 100644 --- a/lustre/llite/rw24.c +++ b/lustre/llite/rw24.c @@ -63,7 +63,6 @@ static int ll_direct_IO_24(int rw, struct ll_inode_info *lli = ll_i2info(inode); struct lov_stripe_md *lsm = lli->lli_smd; struct brw_page *pga; - struct ptlrpc_request_set *set; struct obdo oa; int length, i, flags, rc = 0; loff_t offset; @@ -77,15 +76,9 @@ static int ll_direct_IO_24(int rw, (iobuf->length & (PAGE_SIZE - 1))) RETURN(-EINVAL); - set = ptlrpc_prep_set(); - if (set == NULL) - RETURN(-ENOMEM); - OBD_ALLOC(pga, sizeof(*pga) * iobuf->nr_pages); - if (!pga) { - ptlrpc_set_destroy(set); + if (!pga) RETURN(-ENOMEM); - } flags = 0 /* | OBD_BRW_DIRECTIO */; offset = ((obd_off)blocknr << inode->i_blkbits); @@ -112,17 +105,8 @@ static int ll_direct_IO_24(int rw, else lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_DIRECT_READ, iobuf->length); - rc = obd_brw_async(rw, ll_i2obdexp(inode), &oa, lsm, iobuf->nr_pages, - pga, set, NULL); - if (rc) { - CDEBUG(rc == -ENOSPC ? 
D_INODE : D_ERROR, - "error from obd_brw_async: rc = %d\n", rc); - } else { - rc = ptlrpc_set_wait(set); - if (rc) - CERROR("error from callback: rc = %d\n", rc); - } - ptlrpc_set_destroy(set); + rc = obd_brw_rqset(rw, ll_i2obdexp(inode), &oa, lsm, iobuf->nr_pages, + pga, NULL); if (rc == 0) { rc = iobuf->length; if (rw == OBD_BRW_WRITE) { diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c index df1c812..a16808d 100644 --- a/lustre/llite/rw26.c +++ b/lustre/llite/rw26.c @@ -74,10 +74,160 @@ static int ll_releasepage(struct page *page, gfp_t gfp_mask) return 1; } +#define MAX_DIRECTIO_SIZE 2*1024*1024*1024UL + +static inline int ll_get_user_pages(int rw, unsigned long user_addr, + size_t size, struct page ***pages) +{ + int result = -ENOMEM; + unsigned long page_count; + + /* set an arbitrary limit to prevent arithmetic overflow */ + if (size > MAX_DIRECTIO_SIZE) { + *pages = NULL; + return -EFBIG; + } + + page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT; + page_count -= user_addr >> PAGE_SHIFT; + + OBD_ALLOC_GFP(*pages, page_count * sizeof(**pages), GFP_KERNEL); + if (*pages) { + down_read(&current->mm->mmap_sem); + result = get_user_pages(current, current->mm, user_addr, + page_count, (rw == READ), 0, *pages, + NULL); + up_read(&current->mm->mmap_sem); + } + + return result; +} + +/* ll_free_user_pages - tear down page struct array + * @pages: array of page struct pointers underlying target buffer */ +static void ll_free_user_pages(struct page **pages, int npages, int do_dirty) +{ + int i; + + for (i = 0; i < npages; i++) { + if (do_dirty) + set_page_dirty_lock(pages[i]); + page_cache_release(pages[i]); + } + + OBD_FREE(pages, npages * sizeof(*pages)); +} + +static ssize_t ll_direct_IO_26_seg(int rw, struct file *file, + struct address_space *mapping, + struct inode *inode, + struct lov_stripe_md *lsm, + unsigned long user_addr, size_t size, + loff_t file_offset, struct page **pages, + int page_count) +{ + struct brw_page *pga; + struct obdo oa; + int i, rc =
0; + size_t length; + ENTRY; + + OBD_ALLOC(pga, sizeof(*pga) * page_count); + if (!pga) + RETURN(-ENOMEM); + + for (i = 0, length = size; length > 0; + length -=pga[i].count, file_offset +=pga[i].count,i++) {/*i last!*/ + pga[i].pg = pages[i]; + pga[i].off = file_offset; + /* To the end of the page, or the length, whatever is less */ + pga[i].count = min_t(int, PAGE_SIZE -(file_offset & ~PAGE_MASK), + length); + pga[i].flag = 0; + if (rw == READ) + POISON_PAGE(pages[i], 0x0d); + } + + ll_inode_fill_obdo(inode, rw, &oa); + + if (rw == WRITE) + lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, + LPROC_LL_DIRECT_WRITE, size); + else + lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, + LPROC_LL_DIRECT_READ, size); + rc = obd_brw_rqset(rw == WRITE ? OBD_BRW_WRITE : OBD_BRW_READ, + ll_i2obdexp(inode), &oa, lsm, page_count, pga, NULL); + if (rc == 0) { + rc = size; + if (rw == WRITE) { + lov_stripe_lock(lsm); + obd_adjust_kms(ll_i2obdexp(inode), lsm, file_offset, 0); + lov_stripe_unlock(lsm); + } + } + + OBD_FREE(pga, sizeof(*pga) * page_count); + RETURN(rc); +} + +static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t file_offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + ssize_t count = iov_length(iov, nr_segs), tot_bytes = 0; + struct ll_inode_info *lli = ll_i2info(file->f_mapping->host); + unsigned long seg = 0; + ENTRY; + + if (!lli->lli_smd || !lli->lli_smd->lsm_object_id) + RETURN(-EBADF); + + /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? 
*/ + if ((file_offset & (PAGE_SIZE - 1)) || (count & (PAGE_SIZE - 1))) + RETURN(-EINVAL); + + while ((seg < nr_segs) && (tot_bytes >= 0)) { + const struct iovec *vec = &iov[seg++]; + unsigned long user_addr = (unsigned long)vec->iov_base; + size_t size = vec->iov_len; + struct page **pages; + int page_count; + ssize_t result; + + page_count = ll_get_user_pages(rw, user_addr, size, &pages); + if (page_count < 0) { + ll_free_user_pages(pages, 0, 0); + if (tot_bytes > 0) + break; + return page_count; + } + + result = ll_direct_IO_26_seg(rw, file, file->f_mapping, + file->f_mapping->host, + lli->lli_smd, user_addr, size, + file_offset, pages, page_count); + ll_free_user_pages(pages, page_count, rw == READ); + + if (result <= 0) { + if (tot_bytes > 0) + break; + return result; + } + + tot_bytes += result; + file_offset += result; + if (result < size) + break; + } + return tot_bytes; +} + struct address_space_operations ll_aops = { .readpage = ll_readpage, // .readpages = ll_readpages, -// .direct_IO = ll_direct_IO_26, + .direct_IO = ll_direct_IO_26, .writepage = ll_writepage_26, .writepages = generic_writepages, .set_page_dirty = __set_page_dirty_nobuffers, diff --git a/lustre/lov/lov_internal.h b/lustre/lov/lov_internal.h index f020980..8b994ca 100644 --- a/lustre/lov/lov_internal.h +++ b/lustre/lov/lov_internal.h @@ -20,29 +20,34 @@ struct lov_lock_handles { }; struct lov_request { + struct obd_info rq_oi; + struct lov_request_set *rq_rqset; + struct list_head rq_link; - struct ldlm_extent rq_extent; + int rq_idx; /* index in lov->tgts array */ int rq_stripe; /* stripe number */ int rq_complete; int rq_rc; int rq_buflen; /* length of sub_md */ - struct obdo *rq_oa; - struct lov_stripe_md *rq_md; + obd_count rq_oabufs; obd_count rq_pgaidx; }; struct lov_request_set { + struct obd_enqueue_info *set_ei; + struct obd_info *set_oi; atomic_t set_refcount; struct obd_export *set_exp; + /* XXX: There is @set_exp already, however obd_statfs gets obd_device + only. 
*/ + struct obd_device *set_obd; int set_count; int set_completes; int set_success; struct llog_cookie *set_cookies; int set_cookie_sent; - struct lov_stripe_md *set_md; - struct obdo *set_oa; struct obd_trans_info *set_oti; obd_count set_oabufs; struct brw_page *set_pga; @@ -140,65 +145,71 @@ int qos_remedy_create(struct lov_request_set *set, struct lov_request *req); /* lov_request.c */ void lov_set_add_req(struct lov_request *req, struct lov_request_set *set); +void lov_update_set(struct lov_request_set *set, + struct lov_request *req, int rc); int lov_update_common_set(struct lov_request_set *set, struct lov_request *req, int rc); -int lov_prep_create_set(struct obd_export *exp, struct lov_stripe_md **ea, - struct obdo *src_oa, struct obd_trans_info *oti, +int lov_prep_create_set(struct obd_export *exp, struct obd_info *oifo, + struct lov_stripe_md **ea, struct obdo *src_oa, + struct obd_trans_info *oti, struct lov_request_set **reqset); int lov_update_create_set(struct lov_request_set *set, struct lov_request *req, int rc); int lov_fini_create_set(struct lov_request_set *set, struct lov_stripe_md **ea); -int lov_prep_brw_set(struct obd_export *exp, struct obdo *src_oa, - struct lov_stripe_md *lsm, obd_count oa_bufs, - struct brw_page *pga, struct obd_trans_info *oti, +int lov_prep_brw_set(struct obd_export *exp, struct obd_info *oinfo, + obd_count oa_bufs, struct brw_page *pga, + struct obd_trans_info *oti, struct lov_request_set **reqset); int lov_fini_brw_set(struct lov_request_set *set); -int lov_prep_getattr_set(struct obd_export *exp, struct obdo *src_oa, - struct lov_stripe_md *lsm, +int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo, struct lov_request_set **reqset); int lov_fini_getattr_set(struct lov_request_set *set); -int lov_prep_destroy_set(struct obd_export *exp, struct obdo *src_oa, - struct lov_stripe_md *lsm, +int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo, + struct obdo *src_oa, struct 
lov_stripe_md *lsm, struct obd_trans_info *oti, struct lov_request_set **reqset); int lov_update_destroy_set(struct lov_request_set *set, struct lov_request *req, int rc); int lov_fini_destroy_set(struct lov_request_set *set); -int lov_prep_setattr_set(struct obd_export *exp, struct obdo *src_oa, - struct lov_stripe_md *lsm, struct obd_trans_info *oti, +int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, struct lov_request_set **reqset); int lov_update_setattr_set(struct lov_request_set *set, struct lov_request *req, int rc); int lov_fini_setattr_set(struct lov_request_set *set); -int lov_prep_punch_set(struct obd_export *exp, struct obdo *src_oa, - struct lov_stripe_md *lsm, obd_off start, - obd_off end, struct obd_trans_info *oti, +int lov_prep_punch_set(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, struct lov_request_set **reqset); -int lov_update_punch_set(struct lov_request_set *set, struct lov_request *req, - int rc); int lov_fini_punch_set(struct lov_request_set *set); -int lov_prep_sync_set(struct obd_export *exp, struct obdo *src_oa, +int lov_prep_sync_set(struct obd_export *exp, struct obd_info *obd_info, + struct obdo *src_oa, struct lov_stripe_md *lsm, obd_off start, obd_off end, struct lov_request_set **reqset); int lov_fini_sync_set(struct lov_request_set *set); -int lov_prep_enqueue_set(struct obd_export *exp, struct lov_stripe_md *lsm, - ldlm_policy_data_t *policy, __u32 mode, - struct lustre_handle *lockh, +int lov_prep_enqueue_set(struct obd_export *exp, struct obd_info *oinfo, + struct obd_enqueue_info *einfo, struct lov_request_set **reqset); -int lov_update_enqueue_set(struct lov_request_set *set, - struct lov_request *req, int rc, int flags); int lov_fini_enqueue_set(struct lov_request_set *set, __u32 mode); -int lov_prep_match_set(struct obd_export *exp, struct lov_stripe_md *lsm, +int lov_prep_match_set(struct obd_export *exp, struct obd_info *oinfo, + struct 
lov_stripe_md *lsm, ldlm_policy_data_t *policy, __u32 mode, struct lustre_handle *lockh, struct lov_request_set **reqset); int lov_update_match_set(struct lov_request_set *set, struct lov_request *req, int rc); int lov_fini_match_set(struct lov_request_set *set, __u32 mode, int flags); -int lov_prep_cancel_set(struct obd_export *exp, struct lov_stripe_md *lsm, +int lov_prep_cancel_set(struct obd_export *exp, struct obd_info *oinfo, + struct lov_stripe_md *lsm, __u32 mode, struct lustre_handle *lockh, struct lov_request_set **reqset); int lov_fini_cancel_set(struct lov_request_set *set); +int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, + struct lov_request_set **reqset); +void lov_update_statfs(struct obd_device *obd, struct obd_statfs *osfs, + struct obd_statfs *lov_sfs, int success); +int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs, + int success); +int lov_fini_statfs_set(struct lov_request_set *set); /* lov_obd.c */ int lov_get_stripecnt(struct lov_obd *lov, int stripe_count); diff --git a/lustre/lov/lov_merge.c b/lustre/lov/lov_merge.c index ff20962..d73e231 100644 --- a/lustre/lov/lov_merge.c +++ b/lustre/lov/lov_merge.c @@ -40,7 +40,7 @@ /* Merge the lock value block(&lvb) attributes from each of the stripes in a * file into a single lvb. It is expected that the caller initializes the - * current atime, mtime, ctime to avoid regressing a more uptodate time on + * current atime, mtime, ctime to avoid regressing a more uptodate time on * the local client. 
* * If @kms_only is set then we do not consider the recently seen size (rss) @@ -74,7 +74,7 @@ int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm, lov_size = lov_stripe_size(lsm, tmpsize, i); if (lov_size > size) size = lov_size; - /* merge blocks, mtime, atime */ + /* merge blocks, mtime, atime */ blocks += loi->loi_lvb.lvb_blocks; if (loi->loi_lvb.lvb_mtime > current_mtime) current_mtime = loi->loi_lvb.lvb_mtime; @@ -86,9 +86,9 @@ int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm, lvb->lvb_size = size; lvb->lvb_blocks = blocks; - lvb->lvb_mtime = current_mtime; - lvb->lvb_atime = current_atime; - lvb->lvb_ctime = current_ctime; + lvb->lvb_mtime = current_mtime; + lvb->lvb_atime = current_atime; + lvb->lvb_ctime = current_ctime; RETURN(0); } diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 6bb05e5..76fccfb 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -962,6 +962,7 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa, struct lov_stripe_md **ea, struct obd_trans_info *oti) { struct lov_obd *lov; + struct obd_info oinfo; struct lov_request_set *set = NULL; struct obd_statfs osfs; unsigned long maxage; @@ -991,16 +992,16 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa, } maxage = cfs_time_shift(-lov->desc.ld_qos_maxage); - obd_statfs(exp->exp_obd, &osfs, maxage); + obd_statfs_rqset(exp->exp_obd, &osfs, maxage); - rc = lov_prep_create_set(exp, ea, src_oa, oti, &set); + rc = lov_prep_create_set(exp, &oinfo, ea, src_oa, oti, &set); if (rc) RETURN(rc); list_for_each_entry(req, &set->set_list, rq_link) { /* XXX: LOV STACKING: use real "obj_mdp" sub-data */ rc = obd_create(lov->tgts[req->rq_idx].ltd_exp, - req->rq_oa, &req->rq_md, oti); + req->rq_oi.oi_oa, &req->rq_oi.oi_md, oti); lov_update_create_set(set, req, rc); } rc = lov_fini_create_set(set, ea); @@ -1020,10 +1021,11 @@ static int lov_destroy(struct obd_export *exp, struct obdo *oa, struct obd_export *md_exp) { 
struct lov_request_set *set; + struct obd_info oinfo; struct lov_request *req; struct list_head *pos; struct lov_obd *lov; - int rc = 0; + int rc = 0, err; ENTRY; ASSERT_LSM_MAGIC(lsm); @@ -1031,8 +1033,13 @@ static int lov_destroy(struct obd_export *exp, struct obdo *oa, if (!exp || !exp->exp_obd) RETURN(-ENODEV); + if (oa->o_valid & OBD_MD_FLCOOKIE) { + LASSERT(oti); + LASSERT(oti->oti_logcookies); + } + lov = &exp->exp_obd->u.lov; - rc = lov_prep_destroy_set(exp, oa, lsm, oti, &set); + rc = lov_prep_destroy_set(exp, &oinfo, oa, lsm, oti, &set); if (rc) RETURN(rc); @@ -1040,30 +1047,31 @@ static int lov_destroy(struct obd_export *exp, struct obdo *oa, int err; req = list_entry(pos, struct lov_request, rq_link); - /* XXX update the cookie position */ - oti->oti_logcookies = set->set_cookies + req->rq_stripe; - rc = obd_destroy(lov->tgts[req->rq_idx].ltd_exp, req->rq_oa, - NULL, oti, NULL); - err = lov_update_common_set(set, req, rc); + if (oa->o_valid & OBD_MD_FLCOOKIE) + oti->oti_logcookies = set->set_cookies + req->rq_stripe; + + err = obd_destroy(lov->tgts[req->rq_idx].ltd_exp, + req->rq_oi.oi_oa, NULL, oti, NULL); + err = lov_update_common_set(set, req, err); if (err) { CERROR("error: destroying objid "LPX64" subobj " LPX64" on OST idx %d: rc = %d\n", - set->set_oa->o_id, req->rq_oa->o_id, - req->rq_idx, rc); + oa->o_id, req->rq_oi.oi_oa->o_id, + req->rq_idx, err); if (!rc) rc = err; } } - rc = lov_fini_destroy_set(set); + if (rc == 0) { LASSERT(lsm_op_find(lsm->lsm_magic) != NULL); rc = lsm_op_find(lsm->lsm_magic)->lsm_destroy(lsm, oa, md_exp); } - RETURN(rc); + err = lov_fini_destroy_set(set); + RETURN(rc ? 
rc : err); } -static int lov_getattr(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *lsm) +static int lov_getattr(struct obd_export *exp, struct obd_info *oinfo) { struct lov_request_set *set; struct lov_request *req; @@ -1072,14 +1080,15 @@ static int lov_getattr(struct obd_export *exp, struct obdo *oa, int err = 0, rc = 0; ENTRY; - ASSERT_LSM_MAGIC(lsm); + LASSERT(oinfo); + ASSERT_LSM_MAGIC(oinfo->oi_md); if (!exp || !exp->exp_obd) RETURN(-ENODEV); lov = &exp->exp_obd->u.lov; - rc = lov_prep_getattr_set(exp, oa, lsm, &set); + rc = lov_prep_getattr_set(exp, oinfo, &set); if (rc) RETURN(rc); @@ -1087,16 +1096,15 @@ static int lov_getattr(struct obd_export *exp, struct obdo *oa, req = list_entry(pos, struct lov_request, rq_link); CDEBUG(D_INFO, "objid "LPX64"[%d] has subobj "LPX64" at idx " - "%u\n", oa->o_id, req->rq_stripe, req->rq_oa->o_id, - req->rq_idx); + "%u\n", oinfo->oi_oa->o_id, req->rq_stripe, + req->rq_oi.oi_oa->o_id, req->rq_idx); - rc = obd_getattr(lov->tgts[req->rq_idx].ltd_exp, - req->rq_oa, NULL); + rc = obd_getattr(lov->tgts[req->rq_idx].ltd_exp, &req->rq_oi); err = lov_update_common_set(set, req, rc); if (err) { CERROR("error: getattr objid "LPX64" subobj " LPX64" on OST idx %d: rc = %d\n", - set->set_oa->o_id, req->rq_oa->o_id, + oinfo->oi_oa->o_id, req->rq_oi.oi_oa->o_id, req->rq_idx, err); break; } @@ -1108,78 +1116,79 @@ static int lov_getattr(struct obd_export *exp, struct obdo *oa, RETURN(rc); } -static int lov_getattr_interpret(struct ptlrpc_request_set *rqset, void *data, - int rc) +static int lov_getattr_interpret(struct ptlrpc_request_set *rqset, + void *data, int rc) { struct lov_request_set *lovset = (struct lov_request_set *)data; + int err; ENTRY; /* don't do attribute merge if this aysnc op failed */ - if (rc) { + if (rc) lovset->set_completes = 0; - lov_fini_getattr_set(lovset); - } else { - rc = lov_fini_getattr_set(lovset); - } - RETURN (rc); + err = lov_fini_getattr_set(lovset); + RETURN(rc ? 
rc : err); } -static int lov_getattr_async(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *lsm, +static int lov_getattr_async(struct obd_export *exp, struct obd_info *oinfo, struct ptlrpc_request_set *rqset) { struct lov_request_set *lovset; struct lov_obd *lov; struct list_head *pos; struct lov_request *req; - int rc = 0; + int rc = 0, err; ENTRY; - ASSERT_LSM_MAGIC(lsm); + LASSERT(oinfo); + ASSERT_LSM_MAGIC(oinfo->oi_md); if (!exp || !exp->exp_obd) RETURN(-ENODEV); lov = &exp->exp_obd->u.lov; - rc = lov_prep_getattr_set(exp, oa, lsm, &lovset); + rc = lov_prep_getattr_set(exp, oinfo, &lovset); if (rc) RETURN(rc); CDEBUG(D_INFO, "objid "LPX64": %ux%u byte stripes\n", - lsm->lsm_object_id, lsm->lsm_stripe_count, lsm->lsm_stripe_size); + oinfo->oi_md->lsm_object_id, oinfo->oi_md->lsm_stripe_count, + oinfo->oi_md->lsm_stripe_size); list_for_each (pos, &lovset->set_list) { req = list_entry(pos, struct lov_request, rq_link); CDEBUG(D_INFO, "objid "LPX64"[%d] has subobj "LPX64" at idx " - "%u\n", oa->o_id, req->rq_stripe, req->rq_oa->o_id, - req->rq_idx); + "%u\n", oinfo->oi_oa->o_id, req->rq_stripe, + req->rq_oi.oi_oa->o_id, req->rq_idx); rc = obd_getattr_async(lov->tgts[req->rq_idx].ltd_exp, - req->rq_oa, NULL, rqset); + &req->rq_oi, rqset); if (rc) { CERROR("error: getattr objid "LPX64" subobj " LPX64" on OST idx %d: rc = %d\n", - lovset->set_oa->o_id, req->rq_oa->o_id, + oinfo->oi_oa->o_id, req->rq_oi.oi_oa->o_id, req->rq_idx, rc); GOTO(out, rc); } - lov_update_common_set(lovset, req, rc); } - LASSERT(rc == 0); - LASSERT (rqset->set_interpret == NULL); - rqset->set_interpret = lov_getattr_interpret; - rqset->set_arg = (void *)lovset; - RETURN(rc); + if (!list_empty(&rqset->set_requests)) { + LASSERT(rc == 0); + LASSERT (rqset->set_interpret == NULL); + rqset->set_interpret = lov_getattr_interpret; + rqset->set_arg = (void *)lovset; + RETURN(rc); + } out: - LASSERT(rc); - lov_fini_getattr_set(lovset); - RETURN(rc); + if (rc) + lovset->set_completes = 
0; + err = lov_fini_getattr_set(lovset); + RETURN(rc ? rc : err); } -static int lov_setattr(struct obd_export *exp, struct obdo *src_oa, - struct lov_stripe_md *lsm, struct obd_trans_info *oti) +static int lov_setattr(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti) { struct lov_request_set *set; struct lov_obd *lov; @@ -1188,33 +1197,36 @@ static int lov_setattr(struct obd_export *exp, struct obdo *src_oa, int err = 0, rc = 0; ENTRY; - ASSERT_LSM_MAGIC(lsm); + LASSERT(oinfo); + ASSERT_LSM_MAGIC(oinfo->oi_md); if (!exp || !exp->exp_obd) RETURN(-ENODEV); /* for now, we only expect the following updates here */ - LASSERT(!(src_oa->o_valid & ~(OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE | - OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME | - OBD_MD_FLFLAGS | OBD_MD_FLSIZE | OBD_MD_FLGROUP | - OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLINLINE | - OBD_MD_FLFID | OBD_MD_FLGENER))); + LASSERT(!(oinfo->oi_oa->o_valid & ~(OBD_MD_FLID | OBD_MD_FLTYPE | + OBD_MD_FLMODE | OBD_MD_FLATIME | + OBD_MD_FLMTIME | OBD_MD_FLCTIME | + OBD_MD_FLFLAGS | OBD_MD_FLSIZE | + OBD_MD_FLGROUP | OBD_MD_FLUID | + OBD_MD_FLGID | OBD_MD_FLINLINE | + OBD_MD_FLFID | OBD_MD_FLGENER))); lov = &exp->exp_obd->u.lov; - rc = lov_prep_setattr_set(exp, src_oa, lsm, oti, &set); + rc = lov_prep_setattr_set(exp, oinfo, oti, &set); if (rc) RETURN(rc); list_for_each (pos, &set->set_list) { req = list_entry(pos, struct lov_request, rq_link); - rc = obd_setattr(lov->tgts[req->rq_idx].ltd_exp, req->rq_oa, - NULL, NULL); + rc = obd_setattr(lov->tgts[req->rq_idx].ltd_exp, + &req->rq_oi, NULL); err = lov_update_setattr_set(set, req, rc); if (err) { CERROR("error: setattr objid "LPX64" subobj " LPX64" on OST idx %d: rc = %d\n", - set->set_oa->o_id, req->rq_oa->o_id, - req->rq_idx, err); + set->set_oi->oi_oa->o_id, + req->rq_oi.oi_oa->o_id, req->rq_idx, err); if (!rc) rc = err; } @@ -1225,106 +1237,159 @@ static int lov_setattr(struct obd_export *exp, struct obdo *src_oa, RETURN(rc); } -static 
int lov_setattr_async(struct obd_export *exp, struct obdo *src_oa, - struct lov_stripe_md *lsm, struct obd_trans_info *oti) +static int lov_setattr_interpret(struct ptlrpc_request_set *rqset, + void *data, int rc) +{ + struct lov_request_set *lovset = (struct lov_request_set *)data; + int err; + ENTRY; + + if (rc) + lovset->set_completes = 0; + err = lov_fini_setattr_set(lovset); + RETURN(rc ? rc : err); +} + +/* If @oti is given, the request goes from MDS and responses from OSTs are not + needed. Otherwise, a client is waiting for responses. */ +static int lov_setattr_async(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset) { + struct lov_request_set *set; + struct lov_request *req; + struct list_head *pos; struct lov_obd *lov; - struct lov_oinfo *loi = NULL; - int rc = 0, err; - obd_id objid = src_oa->o_id; - int i; + int rc = 0; ENTRY; - ASSERT_LSM_MAGIC(lsm); - LASSERT(oti); - if (src_oa->o_valid & OBD_MD_FLCOOKIE) + LASSERT(oinfo); + ASSERT_LSM_MAGIC(oinfo->oi_md); + if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) { + LASSERT(oti); LASSERT(oti->oti_logcookies); + } if (!exp || !exp->exp_obd) RETURN(-ENODEV); - LASSERT(!(src_oa->o_valid & ~(OBD_MD_FLID | OBD_MD_FLUID | - OBD_MD_FLGID| OBD_MD_FLCOOKIE | - OBD_MD_FLFID | OBD_MD_FLGENER))); lov = &exp->exp_obd->u.lov; + rc = lov_prep_setattr_set(exp, oinfo, oti, &set); + if (rc) + RETURN(rc); - loi = lsm->lsm_oinfo; - for (i = 0; i < lsm->lsm_stripe_count; i++, loi++) { - if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) { - CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); - goto next; - } + CDEBUG(D_INFO, "objid "LPX64": %ux%u byte stripes\n", + oinfo->oi_md->lsm_object_id, oinfo->oi_md->lsm_stripe_count, + oinfo->oi_md->lsm_stripe_size); + + list_for_each (pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); - src_oa->o_id = loi->loi_id; - src_oa->o_stripe_idx = i; + if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) + 
oti->oti_logcookies = set->set_cookies + req->rq_stripe; - /* do chown/chgrp on OST asynchronously */ - err = obd_setattr_async(lov->tgts[loi->loi_ost_idx].ltd_exp, - src_oa, NULL, oti); - if (err) { + CDEBUG(D_INFO, "objid "LPX64"[%d] has subobj "LPX64" at idx " + "%u\n", oinfo->oi_oa->o_id, req->rq_stripe, + req->rq_oi.oi_oa->o_id, req->rq_idx); + + rc = obd_setattr_async(lov->tgts[req->rq_idx].ltd_exp, + &req->rq_oi, oti, rqset); + if (rc) { CERROR("error: setattr objid "LPX64" subobj " LPX64" on OST idx %d: rc = %d\n", - objid, src_oa->o_id, i, err); - if (!rc) - rc = err; + set->set_oi->oi_oa->o_id, + req->rq_oi.oi_oa->o_id, + req->rq_idx, rc); + break; } - next: - if (src_oa->o_valid & OBD_MD_FLCOOKIE) - oti->oti_logcookies++; } - RETURN(rc); + /* If we are not waiting for responses on async requests, return. */ + if (rc || !rqset || list_empty(&rqset->set_requests)) { + int err; + if (rc) + set->set_completes = 0; + err = lov_fini_setattr_set(set); + RETURN(rc ? rc : err); + } + + LASSERT(rqset->set_interpret == NULL); + rqset->set_interpret = lov_setattr_interpret; + rqset->set_arg = (void *)set; + + RETURN(0); +} + +static int lov_punch_interpret(struct ptlrpc_request_set *rqset, + void *data, int rc) +{ + struct lov_request_set *lovset = (struct lov_request_set *)data; + int err; + ENTRY; + + if (rc) + lovset->set_completes = 0; + err = lov_fini_punch_set(lovset); + RETURN(rc ? rc : err); } /* FIXME: maybe we'll just make one node the authoritative attribute node, then * we can send this 'punch' to just the authoritative node and the nodes * that the punch will affect. 
*/ -static int lov_punch(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *lsm, - obd_off start, obd_off end, struct obd_trans_info *oti) +static int lov_punch(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset) { struct lov_request_set *set; struct lov_obd *lov; struct list_head *pos; struct lov_request *req; - int err = 0, rc = 0; + int rc = 0; ENTRY; - ASSERT_LSM_MAGIC(lsm); + LASSERT(oinfo); + ASSERT_LSM_MAGIC(oinfo->oi_md); if (!exp || !exp->exp_obd) RETURN(-ENODEV); lov = &exp->exp_obd->u.lov; - rc = lov_prep_punch_set(exp, oa, lsm, start, end, oti, &set); + rc = lov_prep_punch_set(exp, oinfo, oti, &set); if (rc) RETURN(rc); list_for_each (pos, &set->set_list) { req = list_entry(pos, struct lov_request, rq_link); - rc = obd_punch(lov->tgts[req->rq_idx].ltd_exp, req->rq_oa, - NULL, req->rq_extent.start, - req->rq_extent.end, NULL); - err = lov_update_punch_set(set, req, rc); - if (err) { + rc = obd_punch(lov->tgts[req->rq_idx].ltd_exp, + &req->rq_oi, NULL, rqset); + if (rc) { CERROR("error: punch objid "LPX64" subobj "LPX64 - " on OST idx %d: rc = %d\n", set->set_oa->o_id, - req->rq_oa->o_id, req->rq_idx, rc); - if (!rc) - rc = err; + " on OST idx %d: rc = %d\n", + set->set_oi->oi_oa->o_id, + req->rq_oi.oi_oa->o_id, req->rq_idx, rc); + break; } } - err = lov_fini_punch_set(set); - if (!rc) - rc = err; - RETURN(rc); + + if (rc || list_empty(&rqset->set_requests)) { + int err; + err = lov_fini_punch_set(set); + RETURN(rc ? 
rc : err); + } + + LASSERT(rqset->set_interpret == NULL); + rqset->set_interpret = lov_punch_interpret; + rqset->set_arg = (void *)set; + + RETURN(0); } static int lov_sync(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md *lsm, obd_off start, obd_off end) { struct lov_request_set *set; + struct obd_info oinfo; struct lov_obd *lov; struct list_head *pos; struct lov_request *req; @@ -1337,20 +1402,22 @@ static int lov_sync(struct obd_export *exp, struct obdo *oa, RETURN(-ENODEV); lov = &exp->exp_obd->u.lov; - rc = lov_prep_sync_set(exp, oa, lsm, start, end, &set); + rc = lov_prep_sync_set(exp, &oinfo, oa, lsm, start, end, &set); if (rc) RETURN(rc); list_for_each (pos, &set->set_list) { req = list_entry(pos, struct lov_request, rq_link); - rc = obd_sync(lov->tgts[req->rq_idx].ltd_exp, req->rq_oa, - NULL, req->rq_extent.start, req->rq_extent.end); + rc = obd_sync(lov->tgts[req->rq_idx].ltd_exp, req->rq_oi.oi_oa, + NULL, req->rq_oi.oi_policy.l_extent.start, + req->rq_oi.oi_policy.l_extent.end); err = lov_update_common_set(set, req, rc); if (err) { CERROR("error: fsync objid "LPX64" subobj "LPX64 - " on OST idx %d: rc = %d\n", set->set_oa->o_id, - req->rq_oa->o_id, req->rq_idx, rc); + " on OST idx %d: rc = %d\n", + set->set_oi->oi_oa->o_id, + req->rq_oi.oi_oa->o_id, req->rq_idx, rc); if (!rc) rc = err; } @@ -1361,20 +1428,22 @@ static int lov_sync(struct obd_export *exp, struct obdo *oa, RETURN(rc); } -static int lov_brw_check(struct lov_obd *lov, struct obdo *oa, - struct lov_stripe_md *lsm, +static int lov_brw_check(struct lov_obd *lov, struct obd_info *lov_oinfo, obd_count oa_bufs, struct brw_page *pga) { + struct obd_info oinfo = { { { 0 } } }; int i, rc = 0; + oinfo.oi_oa = lov_oinfo->oi_oa; + /* The caller just wants to know if there's a chance that this * I/O can succeed */ for (i = 0; i < oa_bufs; i++) { - int stripe = lov_stripe_number(lsm, pga[i].off); - int ost = lsm->lsm_oinfo[stripe].loi_ost_idx; + int stripe = 
lov_stripe_number(lov_oinfo->oi_md, pga[i].off); + int ost = lov_oinfo->oi_md->lsm_oinfo[stripe].loi_ost_idx; obd_off start, end; - if (!lov_stripe_intersects(lsm, i, pga[i].off, + if (!lov_stripe_intersects(lov_oinfo->oi_md, i, pga[i].off, pga[i].off + pga[i].count, &start, &end)) continue; @@ -1383,17 +1452,18 @@ static int lov_brw_check(struct lov_obd *lov, struct obdo *oa, CDEBUG(D_HA, "lov idx %d inactive\n", ost); return -EIO; } - rc = obd_brw(OBD_BRW_CHECK, lov->tgts[ost].ltd_exp, oa, - NULL, 1, &pga[i], NULL); + + rc = obd_brw(OBD_BRW_CHECK, lov->tgts[ost].ltd_exp, &oinfo, + 1, &pga[i], NULL); if (rc) break; } return rc; } -static int lov_brw(int cmd, struct obd_export *exp, struct obdo *src_oa, - struct lov_stripe_md *lsm, obd_count oa_bufs, - struct brw_page *pga, struct obd_trans_info *oti) +static int lov_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo, + obd_count oa_bufs, struct brw_page *pga, + struct obd_trans_info *oti) { struct lov_request_set *set; struct lov_request *req; @@ -1402,14 +1472,14 @@ static int lov_brw(int cmd, struct obd_export *exp, struct obdo *src_oa, int err, rc = 0; ENTRY; - ASSERT_LSM_MAGIC(lsm); + ASSERT_LSM_MAGIC(oinfo->oi_md); if (cmd == OBD_BRW_CHECK) { - rc = lov_brw_check(lov, src_oa, lsm, oa_bufs, pga); + rc = lov_brw_check(lov, oinfo, oa_bufs, pga); RETURN(rc); } - rc = lov_prep_brw_set(exp, src_oa, lsm, oa_bufs, pga, oti, &set); + rc = lov_prep_brw_set(exp, oinfo, oa_bufs, pga, oti, &set); if (rc) RETURN(rc); @@ -1420,8 +1490,8 @@ static int lov_brw(int cmd, struct obd_export *exp, struct obdo *src_oa, sub_exp = lov->tgts[req->rq_idx].ltd_exp; sub_pga = set->set_pga + req->rq_pgaidx; - rc = obd_brw(cmd, sub_exp, req->rq_oa, req->rq_md, - req->rq_oabufs, sub_pga, oti); + rc = obd_brw(cmd, sub_exp, &req->rq_oi, req->rq_oabufs, + sub_pga, oti); if (rc) break; lov_update_common_set(set, req, rc); @@ -1449,10 +1519,10 @@ static int lov_brw_interpret(struct ptlrpc_request_set *reqset, void *data, RETURN(rc); } 
-static int lov_brw_async(int cmd, struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *lsm, obd_count oa_bufs, - struct brw_page *pga, struct ptlrpc_request_set *set, - struct obd_trans_info *oti) +static int lov_brw_async(int cmd, struct obd_export *exp, + struct obd_info *oinfo, obd_count oa_bufs, + struct brw_page *pga, struct obd_trans_info *oti, + struct ptlrpc_request_set *set) { struct lov_request_set *lovset; struct lov_request *req; @@ -1461,14 +1531,15 @@ static int lov_brw_async(int cmd, struct obd_export *exp, struct obdo *oa, int rc = 0; ENTRY; - ASSERT_LSM_MAGIC(lsm); + LASSERT(oinfo); + ASSERT_LSM_MAGIC(oinfo->oi_md); if (cmd == OBD_BRW_CHECK) { - rc = lov_brw_check(lov, oa, lsm, oa_bufs, pga); + rc = lov_brw_check(lov, oinfo, oa_bufs, pga); RETURN(rc); } - rc = lov_prep_brw_set(exp, oa, lsm, oa_bufs, pga, oti, &lovset); + rc = lov_prep_brw_set(exp, oinfo, oa_bufs, pga, oti, &lovset); if (rc) RETURN(rc); @@ -1479,8 +1550,8 @@ static int lov_brw_async(int cmd, struct obd_export *exp, struct obdo *oa, sub_exp = lov->tgts[req->rq_idx].ltd_exp; sub_pga = lovset->set_pga + req->rq_pgaidx; - rc = obd_brw_async(cmd, sub_exp, req->rq_oa, req->rq_md, - req->rq_oabufs, sub_pga, set, oti); + rc = obd_brw_async(cmd, sub_exp, &req->rq_oi, req->rq_oabufs, + sub_pga, oti, set); if (rc) GOTO(out, rc); lov_update_common_set(lovset, req, rc); @@ -1720,53 +1791,63 @@ static int lov_teardown_async_page(struct obd_export *exp, RETURN(rc); } -static int lov_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm, - __u32 type, ldlm_policy_data_t *policy, __u32 mode, - int *flags, void *bl_cb, void *cp_cb, void *gl_cb, - void *data,__u32 lvb_len, void *lvb_swabber, - struct lustre_handle *lockh) +static int lov_enqueue_interpret(struct ptlrpc_request_set *rqset, + void *data, int rc) +{ + struct lov_request_set *lovset = (struct lov_request_set *)data; + int err; + ENTRY; + + if (rc) + lovset->set_completes = 0; + err = lov_fini_enqueue_set(lovset, 
lovset->set_ei->ei_mode); + RETURN(rc ? rc : err); +} + +static int lov_enqueue(struct obd_export *exp, struct obd_info *oinfo, + struct obd_enqueue_info *einfo) { struct lov_request_set *set; struct lov_request *req; struct list_head *pos; - struct lustre_handle *lov_lockhp; struct lov_obd *lov; ldlm_error_t rc; - int save_flags = *flags; ENTRY; - ASSERT_LSM_MAGIC(lsm); + LASSERT(oinfo); + ASSERT_LSM_MAGIC(oinfo->oi_md); /* we should never be asked to replay a lock this way. */ - LASSERT((*flags & LDLM_FL_REPLAY) == 0); + LASSERT((einfo->ei_flags & LDLM_FL_REPLAY) == 0); if (!exp || !exp->exp_obd) RETURN(-ENODEV); lov = &exp->exp_obd->u.lov; - rc = lov_prep_enqueue_set(exp, lsm, policy, mode, lockh, &set); + rc = lov_prep_enqueue_set(exp, oinfo, einfo, &set); if (rc) RETURN(rc); list_for_each (pos, &set->set_list) { - ldlm_policy_data_t sub_policy; req = list_entry(pos, struct lov_request, rq_link); - lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe; - LASSERT(lov_lockhp); - - *flags = save_flags; - sub_policy.l_extent = req->rq_extent; - rc = obd_enqueue(lov->tgts[req->rq_idx].ltd_exp, req->rq_md, - type, &sub_policy, mode, flags, bl_cb, - cp_cb, gl_cb, data, lvb_len, lvb_swabber, - lov_lockhp); - rc = lov_update_enqueue_set(set, req, rc, save_flags); + rc = obd_enqueue(lov->tgts[req->rq_idx].ltd_exp, + &req->rq_oi, einfo); if (rc != ELDLM_OK) - break; + GOTO(out, rc); } - lov_fini_enqueue_set(set, mode); + if (einfo->ei_rqset && !list_empty(&einfo->ei_rqset->set_requests)) { + LASSERT(rc == 0); + LASSERT(einfo->ei_rqset->set_interpret == NULL); + einfo->ei_rqset->set_interpret = lov_enqueue_interpret; + einfo->ei_rqset->set_arg = (void *)set; + RETURN(rc); + } +out: + if (rc) + set->set_completes = 0; + lov_fini_enqueue_set(set, einfo->ei_mode); RETURN(rc); } @@ -1775,6 +1856,7 @@ static int lov_match(struct obd_export *exp, struct lov_stripe_md *lsm, int *flags, void *data, struct lustre_handle *lockh) { struct lov_request_set *set; + struct obd_info 
oinfo; struct lov_request *req; struct list_head *pos; struct lov_obd *lov = &exp->exp_obd->u.lov; @@ -1788,7 +1870,7 @@ static int lov_match(struct obd_export *exp, struct lov_stripe_md *lsm, RETURN(-ENODEV); lov = &exp->exp_obd->u.lov; - rc = lov_prep_match_set(exp, lsm, policy, mode, lockh, &set); + rc = lov_prep_match_set(exp, &oinfo, lsm, policy, mode, lockh, &set); if (rc) RETURN(rc); @@ -1799,11 +1881,11 @@ static int lov_match(struct obd_export *exp, struct lov_stripe_md *lsm, LASSERT(lov_lockhp); lov_flags = *flags; - sub_policy.l_extent = req->rq_extent; + sub_policy.l_extent = req->rq_oi.oi_policy.l_extent; - rc = obd_match(lov->tgts[req->rq_idx].ltd_exp, req->rq_md, - type, &sub_policy, mode, &lov_flags, data, - lov_lockhp); + rc = obd_match(lov->tgts[req->rq_idx].ltd_exp, + req->rq_oi.oi_md, type, &sub_policy, + mode, &lov_flags, data, lov_lockhp); rc = lov_update_match_set(set, req, rc); if (rc != 1) break; @@ -1842,6 +1924,7 @@ static int lov_cancel(struct obd_export *exp, struct lov_stripe_md *lsm, __u32 mode, struct lustre_handle *lockh) { struct lov_request_set *set; + struct obd_info oinfo; struct lov_request *req; struct list_head *pos; struct lov_obd *lov = &exp->exp_obd->u.lov; @@ -1856,7 +1939,7 @@ static int lov_cancel(struct obd_export *exp, struct lov_stripe_md *lsm, LASSERT(lockh); lov = &exp->exp_obd->u.lov; - rc = lov_prep_cancel_set(exp, lsm, mode, lockh, &set); + rc = lov_prep_cancel_set(exp, &oinfo, lsm, mode, lockh, &set); if (rc) RETURN(rc); @@ -1864,14 +1947,15 @@ static int lov_cancel(struct obd_export *exp, struct lov_stripe_md *lsm, req = list_entry(pos, struct lov_request, rq_link); lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe; - rc = obd_cancel(lov->tgts[req->rq_idx].ltd_exp, req->rq_md, - mode, lov_lockhp); + rc = obd_cancel(lov->tgts[req->rq_idx].ltd_exp, + req->rq_oi.oi_md, mode, lov_lockhp); rc = lov_update_common_set(set, req, rc); if (rc) { CERROR("error: cancel objid "LPX64" subobj " LPX64" on OST idx %d: 
rc = %d\n", lsm->lsm_object_id, - req->rq_md->lsm_object_id, req->rq_idx, rc); + req->rq_oi.oi_md->lsm_object_id, + req->rq_idx, rc); err = rc; } @@ -1966,14 +2050,62 @@ static int lov_join_lru(struct obd_export *exp, RETURN(count); } -#define LOV_U64_MAX ((__u64)~0ULL) -#define LOV_SUM_MAX(tot, add) \ - do { \ - if ((tot) + (add) < (tot)) \ - (tot) = LOV_U64_MAX; \ - else \ - (tot) += (add); \ - } while(0) +static int lov_statfs_interpret(struct ptlrpc_request_set *rqset, + void *data, int rc) +{ + struct lov_request_set *lovset = (struct lov_request_set *)data; + int err; + ENTRY; + + if (rc) + lovset->set_completes = 0; + + err = lov_fini_statfs_set(lovset); + RETURN(rc ? rc : err); +} + +static int lov_statfs_async(struct obd_device *obd, struct obd_info *oinfo, + unsigned long max_age, + struct ptlrpc_request_set *rqset) +{ + struct lov_request_set *set; + struct lov_request *req; + struct list_head *pos; + struct lov_obd *lov; + int rc = 0; + + LASSERT(oinfo != NULL); + LASSERT(oinfo->oi_osfs != NULL); + + lov = &obd->u.lov; + rc = lov_prep_statfs_set(obd, oinfo, &set); + if (rc) + RETURN(rc); + + list_for_each (pos, &set->set_list) { + struct obd_device *osc_obd; + + req = list_entry(pos, struct lov_request, rq_link); + + osc_obd = class_exp2obd(lov->tgts[req->rq_idx].ltd_exp); + rc = obd_statfs_async(osc_obd, &req->rq_oi, max_age, rqset); + if (rc) + break; + } + + if (rc || list_empty(&rqset->set_requests)) { + int err; + if (rc) + set->set_completes = 0; + err = lov_fini_statfs_set(set); + RETURN(rc ? 
rc : err); + } + + LASSERT(rqset->set_interpret == NULL); + rqset->set_interpret = lov_statfs_interpret; + rqset->set_arg = (void *)set; + RETURN(0); +} static int lov_statfs(struct obd_device *obd, struct obd_statfs *osfs, cfs_time_t max_age) @@ -1981,14 +2113,12 @@ static int lov_statfs(struct obd_device *obd, struct obd_statfs *osfs, struct lov_obd *lov = &obd->u.lov; struct obd_statfs lov_sfs; int set = 0; - int rc = 0; + int rc = 0, err; int i; ENTRY; - /* We only get block data from the OBD */ for (i = 0; i < lov->desc.ld_tgt_count; i++) { - int err; if (!lov->tgts[i].ltd_active) { CDEBUG(D_HA, "lov idx %d inactive\n", i); continue; @@ -2003,53 +2133,13 @@ static int lov_statfs(struct obd_device *obd, struct obd_statfs *osfs, } qos_update(lov, i, &lov_sfs); - if (!set) { - memcpy(osfs, &lov_sfs, sizeof(lov_sfs)); - set = 1; - } else { -#ifdef MIN_DF - /* Sandia requested that df (and so, statfs) only - returned minimal available space on - a single OST, so people would be able to - write this much data guaranteed. */ - if (osfs->os_bavail > lov_sfs.os_bavail) { - /* Presumably if new bavail is smaller, - new bfree is bigger as well */ - osfs->os_bfree = lov_sfs.os_bfree; - osfs->os_bavail = lov_sfs.os_bavail; - } -#else - osfs->os_bfree += lov_sfs.os_bfree; - osfs->os_bavail += lov_sfs.os_bavail; -#endif - osfs->os_blocks += lov_sfs.os_blocks; - /* XXX not sure about this one - depends on policy. - * - could be minimum if we always stripe on all OBDs - * (but that would be wrong for any other policy, - * if one of the OBDs has no more objects left) - * - could be sum if we stripe whole objects - * - could be average, just to give a nice number - * - * To give a "reasonable" (if not wholly accurate) - * number, we divide the total number of free objects - * by expected stripe count (watch out for overflow). 
- */ - LOV_SUM_MAX(osfs->os_files, lov_sfs.os_files); - LOV_SUM_MAX(osfs->os_ffree, lov_sfs.os_ffree); - } + lov_update_statfs(class_exp2obd(lov->tgts[i].ltd_exp), + osfs, &lov_sfs, set); + set++; } - if (set) { - __u32 expected_stripes = lov_get_stripecnt(lov, 0); - - if (osfs->os_files != LOV_U64_MAX) - do_div(osfs->os_files, expected_stripes); - if (osfs->os_ffree != LOV_U64_MAX) - do_div(osfs->os_ffree, expected_stripes); - } else if (!rc) - rc = -EIO; - - RETURN(rc); + err = lov_fini_statfs(obd, osfs, set); + RETURN(rc ? rc : err); } static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, @@ -2063,13 +2153,12 @@ static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, switch (cmd) { case OBD_IOC_LOV_GET_CONFIG: { - struct obd_ioctl_data *data = karg; + struct obd_ioctl_data *data; struct lov_tgt_desc *tgtdesc; struct lov_desc *desc; char *buf = NULL; __u32 *genp; - buf = NULL; len = 0; if (obd_ioctl_getdata(&buf, &len, (void *)uarg)) RETURN(-EINVAL); @@ -2329,7 +2418,7 @@ static int lov_checkmd(struct obd_export *exp, struct obd_export *md_exp, LASSERT(md_exp); LASSERT(lsm_op_find(lsm->lsm_magic) != NULL); rc = lsm_op_find(lsm->lsm_magic)->lsm_revalidate(lsm, md_exp->exp_obd); - + RETURN(rc); } @@ -2464,6 +2553,7 @@ struct obd_ops lov_obd_ops = { .o_connect = lov_connect, .o_disconnect = lov_disconnect, .o_statfs = lov_statfs, + .o_statfs_async = lov_statfs_async, .o_packmd = lov_packmd, .o_unpackmd = lov_unpackmd, .o_checkmd = lov_checkmd, @@ -2516,7 +2606,7 @@ int __init lov_init(void) rc = class_register_type(&lov_obd_ops, lvars.module_vars, LUSTRE_LOV_NAME); if (rc && quota_interface) - PORTAL_SYMBOL_PUT(osc_quota_interface); + PORTAL_SYMBOL_PUT(lov_quota_interface); RETURN(rc); } diff --git a/lustre/lov/lov_pack.c b/lustre/lov/lov_pack.c index c17240c..ec26535 100644 --- a/lustre/lov/lov_pack.c +++ b/lustre/lov/lov_pack.c @@ -42,7 +42,7 @@ void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm) struct 
lov_ost_data_v1 *lod; int i; - CDEBUG(level, "objid "LPX64", magic 0x%08X, pattern %#X\n", + CDEBUG(level, "objid "LPX64", magic 0x%08x, pattern %#x\n", le64_to_cpu(lmm->lmm_object_id), le32_to_cpu(lmm->lmm_magic), le32_to_cpu(lmm->lmm_pattern)); CDEBUG(level,"stripe_size %u, stripe_count %u\n", diff --git a/lustre/lov/lov_qos.c b/lustre/lov/lov_qos.c index eaf9f94..8d7ceba 100644 --- a/lustre/lov/lov_qos.c +++ b/lustre/lov/lov_qos.c @@ -379,7 +379,7 @@ static int qos_calc_rr(struct lov_obd *lov) void qos_shrink_lsm(struct lov_request_set *set) { - struct lov_stripe_md *lsm = set->set_md, *lsm_new; + struct lov_stripe_md *lsm = set->set_oi->oi_md, *lsm_new; /* XXX LOV STACKING call into osc for sizes */ unsigned oldsize, newsize; @@ -409,7 +409,7 @@ void qos_shrink_lsm(struct lov_request_set *set) memcpy(lsm_new, lsm, newsize); lsm_new->lsm_stripe_count = set->set_count; OBD_FREE(lsm, oldsize); - set->set_md = lsm_new; + set->set_oi->oi_md = lsm_new; } else { CWARN("'leaking' %d bytes\n", oldsize - newsize); } @@ -417,7 +417,7 @@ void qos_shrink_lsm(struct lov_request_set *set) int qos_remedy_create(struct lov_request_set *set, struct lov_request *req) { - struct lov_stripe_md *lsm = set->set_md; + struct lov_stripe_md *lsm = set->set_oi->oi_md; struct lov_obd *lov = &set->set_exp->exp_obd->u.lov; unsigned ost_idx, ost_count = lov->desc.ld_tgt_count; int stripe, i, rc = -EIO; @@ -437,8 +437,9 @@ int qos_remedy_create(struct lov_request_set *set, struct lov_request *req) if (stripe >= lsm->lsm_stripe_count) { req->rq_idx = ost_idx; - rc = obd_create(lov->tgts[ost_idx].ltd_exp, req->rq_oa, - &req->rq_md, set->set_oti); + rc = obd_create(lov->tgts[ost_idx].ltd_exp, + req->rq_oi.oi_oa, &req->rq_oi.oi_md, + set->set_oti); if (!rc) break; } @@ -691,7 +692,7 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set) { struct lov_obd *lov = &exp->exp_obd->u.lov; struct lov_stripe_md *lsm; - struct obdo *src_oa = set->set_oa; + struct obdo *src_oa = 
set->set_oi->oi_oa; struct obd_trans_info *oti = set->set_oti; int i, stripes, rc = 0, newea = 0; int *idx_arr, idx_cnt = 0; @@ -699,7 +700,7 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set) LASSERT(src_oa->o_valid & OBD_MD_FLID); - if (set->set_md == NULL) { + if (set->set_oi->oi_md == NULL) { int stripe_cnt = lov_get_stripecnt(lov, 0); /* If the MDS file was truncated up to some size, stripe over @@ -730,7 +731,7 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set) stripes = stripe_cnt; } - rc = lov_alloc_memmd(&set->set_md, stripes, + rc = lov_alloc_memmd(&set->set_oi->oi_md, stripes, lov->desc.ld_pattern ? lov->desc.ld_pattern : LOV_PATTERN_RAID0, LOV_MAGIC); @@ -739,8 +740,8 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set) rc = 0; newea = 1; } - lsm = set->set_md; - + + lsm = set->set_oi->oi_md; lsm->lsm_object_id = src_oa->o_id; if (!lsm->lsm_stripe_size) lsm->lsm_stripe_size = lov->desc.ld_default_stripe_size; @@ -764,30 +765,30 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set) GOTO(out_err, rc = -ENOMEM); lov_set_add_req(req, set); - req->rq_buflen = sizeof(*req->rq_md); - OBD_ALLOC(req->rq_md, req->rq_buflen); - if (req->rq_md == NULL) + req->rq_buflen = sizeof(*req->rq_oi.oi_md); + OBD_ALLOC(req->rq_oi.oi_md, req->rq_buflen); + if (req->rq_oi.oi_md == NULL) GOTO(out_err, rc = -ENOMEM); - - req->rq_oa = obdo_alloc(); - if (req->rq_oa == NULL) + + req->rq_oi.oi_oa = obdo_alloc(); + if (req->rq_oi.oi_oa == NULL) GOTO(out_err, rc = -ENOMEM); - + req->rq_idx = ost_idx; req->rq_stripe = i; /* create data objects with "parent" OA */ - memcpy(req->rq_oa, src_oa, sizeof(*req->rq_oa)); + memcpy(req->rq_oi.oi_oa, src_oa, sizeof(*req->rq_oi.oi_oa)); /* XXX When we start creating objects on demand, we need to * make sure that we always create the object on the * stripe which holds the existing file size. 
*/ if (src_oa->o_valid & OBD_MD_FLSIZE) { - req->rq_oa->o_size = + req->rq_oi.oi_oa->o_size = lov_size_to_stripe(lsm, src_oa->o_size, i); CDEBUG(D_INODE, "stripe %d has size "LPU64"/"LPU64"\n", - i, req->rq_oa->o_size, src_oa->o_size); + i, req->rq_oi.oi_oa->o_size, src_oa->o_size); } } @@ -804,7 +805,7 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set) } out_err: if (newea && rc) - obd_free_memmd(exp, &set->set_md); + obd_free_memmd(exp, &set->set_oi->oi_md); free_idx_array(idx_arr, idx_cnt); EXIT; return rc; diff --git a/lustre/lov/lov_request.c b/lustre/lov/lov_request.c index d76c4cb..28ec334 100644 --- a/lustre/lov/lov_request.c +++ b/lustre/lov/lov_request.c @@ -60,10 +60,13 @@ static void lov_finish_set(struct lov_request_set *set) rq_link); list_del_init(&req->rq_link); - if (req->rq_oa) - obdo_free(req->rq_oa); - if (req->rq_md) - OBD_FREE(req->rq_md, req->rq_buflen); + if (req->rq_oi.oi_oa) + obdo_free(req->rq_oi.oi_oa); + if (req->rq_oi.oi_md) + OBD_FREE(req->rq_oi.oi_md, req->rq_buflen); + if (req->rq_oi.oi_osfs) + OBD_FREE(req->rq_oi.oi_osfs, + sizeof(*req->rq_oi.oi_osfs)); OBD_FREE(req, sizeof(*req)); } @@ -78,8 +81,8 @@ static void lov_finish_set(struct lov_request_set *set) EXIT; } -static void lov_update_set(struct lov_request_set *set, - struct lov_request *req, int rc) +void lov_update_set(struct lov_request_set *set, + struct lov_request *req, int rc) { req->rq_complete = 1; req->rq_rc = rc; @@ -111,30 +114,32 @@ void lov_set_add_req(struct lov_request *req, struct lov_request_set *set) set->set_count++; } -int lov_update_enqueue_set(struct lov_request_set *set, - struct lov_request *req, int rc, int flags) +int lov_update_enqueue_set(struct lov_request *req, __u32 mode, int rc) { + struct lov_request_set *set = req->rq_rqset; struct lustre_handle *lov_lockhp; struct lov_oinfo *loi; ENTRY; + LASSERT(set != NULL); + LASSERT(set->set_oi != NULL); + lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe; - loi = 
&set->set_md->lsm_oinfo[req->rq_stripe]; + loi = &set->set_oi->oi_md->lsm_oinfo[req->rq_stripe]; - /* XXX FIXME: This unpleasantness doesn't belong here at *all*. - * It belongs in the OSC, except that the OSC doesn't have - * access to the real LOI -- it gets a copy, that we created - * above, and that copy can be arbitrarily out of date. + /* XXX LOV STACKING: OSC gets a copy, created in lov_prep_enqueue_set + * and that copy can be arbitrarily out of date. * * The LOV API is due for a serious rewriting anyways, and this * can be addressed then. */ + if (rc == ELDLM_OK) { struct ldlm_lock *lock = ldlm_handle2lock(lov_lockhp); __u64 tmp; LASSERT(lock != NULL); - lov_stripe_lock(set->set_md); - loi->loi_lvb = req->rq_md->lsm_oinfo->loi_lvb; + lov_stripe_lock(set->set_oi->oi_md); + loi->loi_lvb = req->rq_oi.oi_md->lsm_oinfo->loi_lvb; tmp = loi->loi_lvb.lvb_size; /* Extend KMS up to the end of this lock and no further * A lock on [x,y] means a KMS of up to y + 1 bytes! */ @@ -151,14 +156,15 @@ int lov_update_enqueue_set(struct lov_request_set *set, loi->loi_lvb.lvb_size, loi->loi_kms, lock->l_policy_data.l_extent.end); } - lov_stripe_unlock(set->set_md); + lov_stripe_unlock(set->set_oi->oi_md); ldlm_lock_allow_match(lock); LDLM_LOCK_PUT(lock); - } else if (rc == ELDLM_LOCK_ABORTED && flags & LDLM_FL_HAS_INTENT) { + } else if ((rc == ELDLM_LOCK_ABORTED) && + (set->set_ei->ei_flags & LDLM_FL_HAS_INTENT)) { memset(lov_lockhp, 0, sizeof(*lov_lockhp)); - lov_stripe_lock(set->set_md); - loi->loi_lvb = req->rq_md->lsm_oinfo->loi_lvb; - lov_stripe_unlock(set->set_md); + lov_stripe_lock(set->set_oi->oi_md); + loi->loi_lvb = req->rq_oi.oi_md->lsm_oinfo->loi_lvb; + lov_stripe_unlock(set->set_oi->oi_md); CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving" " kms="LPU64"\n", loi->loi_lvb.lvb_size, loi->loi_kms); rc = ELDLM_OK; @@ -170,8 +176,8 @@ int lov_update_enqueue_set(struct lov_request_set *set, if (lov->tgts[req->rq_idx].ltd_active) { CERROR("error: enqueue objid 
"LPX64" subobj " LPX64" on OST idx %d: rc = %d\n", - set->set_md->lsm_object_id, loi->loi_id, - loi->loi_ost_idx, rc); + set->set_oi->oi_md->lsm_object_id, + loi->loi_id, loi->loi_ost_idx, rc); } else { rc = ELDLM_OK; } @@ -180,10 +186,20 @@ int lov_update_enqueue_set(struct lov_request_set *set, RETURN(rc); } +/* The callback for osc_enqueue that updates lov info for every OSC request. */ +static int cb_update_enqueue(struct obd_info *oinfo, int rc) +{ + struct obd_enqueue_info *einfo; + struct lov_request *lovreq; + + lovreq = container_of(oinfo, struct lov_request, rq_oi); + einfo = lovreq->rq_rqset->set_ei; + return lov_update_enqueue_set(lovreq, einfo->ei_mode, rc); +} + static int enqueue_done(struct lov_request_set *set, __u32 mode) { struct lov_request *req; - struct lustre_handle *lov_lockhp = NULL; struct lov_obd *lov = &set->set_exp->exp_obd->u.lov; int rc = 0; ENTRY; @@ -195,6 +211,8 @@ static int enqueue_done(struct lov_request_set *set, __u32 mode) /* cancel enqueued/matched locks */ list_for_each_entry(req, &set->set_list, rq_link) { + struct lustre_handle *lov_lockhp; + if (!req->rq_complete || req->rq_rc) continue; @@ -203,12 +221,13 @@ static int enqueue_done(struct lov_request_set *set, __u32 mode) if (!lustre_handle_is_used(lov_lockhp)) continue; - rc = obd_cancel(lov->tgts[req->rq_idx].ltd_exp, req->rq_md, - mode, lov_lockhp); + rc = obd_cancel(lov->tgts[req->rq_idx].ltd_exp, + req->rq_oi.oi_md, mode, lov_lockhp); if (rc && lov->tgts[req->rq_idx].ltd_active) CERROR("cancelling obdjid "LPX64" on OST " "idx %d error: rc = %d\n", - req->rq_md->lsm_object_id, req->rq_idx, rc); + req->rq_oi.oi_md->lsm_object_id, + req->rq_idx, rc); } lov_llh_put(set->set_lockh); RETURN(rc); @@ -222,7 +241,9 @@ int lov_fini_enqueue_set(struct lov_request_set *set, __u32 mode) if (set == NULL) RETURN(0); LASSERT(set->set_exp); - if (set->set_completes) + /* Do enqueue_done only for sync requests and if any request + succeeded. 
*/ + if (!set->set_ei->ei_rqset && set->set_completes) rc = enqueue_done(set, mode); else lov_llh_put(set->set_lockh); @@ -233,9 +254,8 @@ int lov_fini_enqueue_set(struct lov_request_set *set, __u32 mode) RETURN(rc); } -int lov_prep_enqueue_set(struct obd_export *exp, struct lov_stripe_md *lsm, - ldlm_policy_data_t *policy, __u32 mode, - struct lustre_handle *lockh, +int lov_prep_enqueue_set(struct obd_export *exp, struct obd_info *oinfo, + struct obd_enqueue_info *einfo, struct lov_request_set **reqset) { struct lov_obd *lov = &exp->exp_obd->u.lov; @@ -250,19 +270,22 @@ int lov_prep_enqueue_set(struct obd_export *exp, struct lov_stripe_md *lsm, lov_init_set(set); set->set_exp = exp; - set->set_md = lsm; - set->set_lockh = lov_llh_new(lsm); + set->set_oi = oinfo; + set->set_ei = einfo; + set->set_lockh = lov_llh_new(oinfo->oi_md); if (set->set_lockh == NULL) GOTO(out_set, rc = -ENOMEM); - lockh->cookie = set->set_lockh->llh_handle.h_cookie; + oinfo->oi_lockh->cookie = set->set_lockh->llh_handle.h_cookie; - loi = lsm->lsm_oinfo; - for (i = 0; i < lsm->lsm_stripe_count; i++, loi++) { + loi = oinfo->oi_md->lsm_oinfo; + for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++, loi++) { struct lov_request *req; obd_off start, end; - if (!lov_stripe_intersects(lsm, i, policy->l_extent.start, - policy->l_extent.end, &start, &end)) + if (!lov_stripe_intersects(oinfo->oi_md, i, + oinfo->oi_policy.l_extent.start, + oinfo->oi_policy.l_extent.end, + &start, &end)) continue; if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) { @@ -274,27 +297,36 @@ int lov_prep_enqueue_set(struct obd_export *exp, struct lov_stripe_md *lsm, if (req == NULL) GOTO(out_set, rc = -ENOMEM); - req->rq_buflen = sizeof(*req->rq_md) + + req->rq_buflen = sizeof(*req->rq_oi.oi_md) + sizeof(struct lov_oinfo); - OBD_ALLOC(req->rq_md, req->rq_buflen); - if (req->rq_md == NULL) { + OBD_ALLOC(req->rq_oi.oi_md, req->rq_buflen); + if (req->rq_oi.oi_md == NULL) { OBD_FREE(req, sizeof(*req)); GOTO(out_set, rc = -ENOMEM); 
} - req->rq_extent.start = start; - req->rq_extent.end = end; - req->rq_extent.gid = policy->l_extent.gid; + req->rq_rqset = set; + /* Set lov request specific parameters. */ + req->rq_oi.oi_lockh = set->set_lockh->llh_handles + i; + req->rq_oi.oi_cb_up = cb_update_enqueue; + + LASSERT(req->rq_oi.oi_lockh); + + req->rq_oi.oi_policy.l_extent.gid = + oinfo->oi_policy.l_extent.gid; + req->rq_oi.oi_policy.l_extent.start = start; + req->rq_oi.oi_policy.l_extent.end = end; req->rq_idx = loi->loi_ost_idx; req->rq_stripe = i; /* XXX LOV STACKING: submd should be from the subobj */ - req->rq_md->lsm_object_id = loi->loi_id; - req->rq_md->lsm_stripe_count = 0; - req->rq_md->lsm_oinfo->loi_kms_valid = loi->loi_kms_valid; - req->rq_md->lsm_oinfo->loi_kms = loi->loi_kms; - req->rq_md->lsm_oinfo->loi_lvb = loi->loi_lvb; + req->rq_oi.oi_md->lsm_object_id = loi->loi_id; + req->rq_oi.oi_md->lsm_stripe_count = 0; + req->rq_oi.oi_md->lsm_oinfo->loi_kms_valid = + loi->loi_kms_valid; + req->rq_oi.oi_md->lsm_oinfo->loi_kms = loi->loi_kms; + req->rq_oi.oi_md->lsm_oinfo->loi_lvb = loi->loi_lvb; lov_set_add_req(req, set); } @@ -303,7 +335,7 @@ int lov_prep_enqueue_set(struct obd_export *exp, struct lov_stripe_md *lsm, *reqset = set; RETURN(0); out_set: - lov_fini_enqueue_set(set, mode); + lov_fini_enqueue_set(set, einfo->ei_mode); RETURN(rc); } @@ -342,9 +374,9 @@ int lov_fini_match_set(struct lov_request_set *set, __u32 mode, int flags) RETURN(rc); } -int lov_prep_match_set(struct obd_export *exp, struct lov_stripe_md *lsm, - ldlm_policy_data_t *policy, __u32 mode, - struct lustre_handle *lockh, +int lov_prep_match_set(struct obd_export *exp, struct obd_info *oinfo, + struct lov_stripe_md *lsm, ldlm_policy_data_t *policy, + __u32 mode, struct lustre_handle *lockh, struct lov_request_set **reqset) { struct lov_obd *lov = &exp->exp_obd->u.lov; @@ -359,7 +391,8 @@ int lov_prep_match_set(struct obd_export *exp, struct lov_stripe_md *lsm, lov_init_set(set); set->set_exp = exp; - set->set_md = 
lsm; + set->set_oi = oinfo; + set->set_oi->oi_md = lsm; set->set_lockh = lov_llh_new(lsm); if (set->set_lockh == NULL) GOTO(out_set, rc = -ENOMEM); @@ -383,23 +416,23 @@ int lov_prep_match_set(struct obd_export *exp, struct lov_stripe_md *lsm, if (req == NULL) GOTO(out_set, rc = -ENOMEM); - req->rq_buflen = sizeof(*req->rq_md); - OBD_ALLOC(req->rq_md, req->rq_buflen); - if (req->rq_md == NULL) { + req->rq_buflen = sizeof(*req->rq_oi.oi_md); + OBD_ALLOC(req->rq_oi.oi_md, req->rq_buflen); + if (req->rq_oi.oi_md == NULL) { OBD_FREE(req, sizeof(*req)); GOTO(out_set, rc = -ENOMEM); } - req->rq_extent.start = start; - req->rq_extent.end = end; - req->rq_extent.gid = policy->l_extent.gid; + req->rq_oi.oi_policy.l_extent.start = start; + req->rq_oi.oi_policy.l_extent.end = end; + req->rq_oi.oi_policy.l_extent.gid = policy->l_extent.gid; req->rq_idx = loi->loi_ost_idx; req->rq_stripe = i; /* XXX LOV STACKING: submd should be from the subobj */ - req->rq_md->lsm_object_id = loi->loi_id; - req->rq_md->lsm_stripe_count = 0; + req->rq_oi.oi_md->lsm_object_id = loi->loi_id; + req->rq_oi.oi_md->lsm_stripe_count = 0; lov_set_add_req(req, set); } @@ -430,8 +463,9 @@ int lov_fini_cancel_set(struct lov_request_set *set) RETURN(rc); } -int lov_prep_cancel_set(struct obd_export *exp, struct lov_stripe_md *lsm, - __u32 mode, struct lustre_handle *lockh, +int lov_prep_cancel_set(struct obd_export *exp, struct obd_info *oinfo, + struct lov_stripe_md *lsm, __u32 mode, + struct lustre_handle *lockh, struct lov_request_set **reqset) { struct lov_request_set *set; @@ -445,7 +479,8 @@ int lov_prep_cancel_set(struct obd_export *exp, struct lov_stripe_md *lsm, lov_init_set(set); set->set_exp = exp; - set->set_md = lsm; + set->set_oi = oinfo; + set->set_oi->oi_md = lsm; set->set_lockh = lov_handle2llh(lockh); if (set->set_lockh == NULL) { CERROR("LOV: invalid lov lock handle %p\n", lockh); @@ -468,9 +503,9 @@ int lov_prep_cancel_set(struct obd_export *exp, struct lov_stripe_md *lsm, if (req == 
NULL) GOTO(out_set, rc = -ENOMEM); - req->rq_buflen = sizeof(*req->rq_md); - OBD_ALLOC(req->rq_md, req->rq_buflen); - if (req->rq_md == NULL) { + req->rq_buflen = sizeof(*req->rq_oi.oi_md); + OBD_ALLOC(req->rq_oi.oi_md, req->rq_buflen); + if (req->rq_oi.oi_md == NULL) { OBD_FREE(req, sizeof(*req)); GOTO(out_set, rc = -ENOMEM); } @@ -479,8 +514,8 @@ int lov_prep_cancel_set(struct obd_export *exp, struct lov_stripe_md *lsm, req->rq_stripe = i; /* XXX LOV STACKING: submd should be from the subobj */ - req->rq_md->lsm_object_id = loi->loi_id; - req->rq_md->lsm_stripe_count = 0; + req->rq_oi.oi_md->lsm_object_id = loi->loi_id; + req->rq_oi.oi_md->lsm_stripe_count = 0; lov_set_add_req(req, set); } @@ -498,7 +533,7 @@ static int create_done(struct obd_export *exp, struct lov_request_set *set, { struct lov_obd *lov = &exp->exp_obd->u.lov; struct obd_trans_info *oti = set->set_oti; - struct obdo *src_oa = set->set_oa; + struct obdo *src_oa = set->set_oi->oi_oa; struct lov_request *req; struct obdo *ret_oa = NULL; int attrset = 0, rc = 0; @@ -512,10 +547,10 @@ static int create_done(struct obd_export *exp, struct lov_request_set *set, list_for_each_entry (req, &set->set_list, rq_link) { if (req->rq_rc == 0) continue; - + set->set_completes--; req->rq_complete = 0; - + rc = qos_remedy_create(set, req); lov_update_create_set(set, req, rc); @@ -527,7 +562,7 @@ static int create_done(struct obd_export *exp, struct lov_request_set *set, /* no successful creates */ if (set->set_success == 0) GOTO(cleanup, rc); - + /* If there was an explicit stripe set, fail. Otherwise, we * got some objects and that's not bad. 
*/ if (set->set_count != set->set_success) { @@ -544,8 +579,9 @@ static int create_done(struct obd_export *exp, struct lov_request_set *set, list_for_each_entry(req, &set->set_list, rq_link) { if (!req->rq_complete || req->rq_rc) continue; - lov_merge_attrs(ret_oa, req->rq_oa, req->rq_oa->o_valid, - set->set_md, req->rq_stripe, &attrset); + lov_merge_attrs(ret_oa, req->rq_oi.oi_oa, + req->rq_oi.oi_oa->o_valid, set->set_oi->oi_md, + req->rq_stripe, &attrset); } if (src_oa->o_valid & OBD_MD_FLSIZE && ret_oa->o_size != src_oa->o_size) { @@ -557,7 +593,7 @@ static int create_done(struct obd_export *exp, struct lov_request_set *set, memcpy(src_oa, ret_oa, sizeof(*src_oa)); obdo_free(ret_oa); - *lsmp = set->set_md; + *lsmp = set->set_oi->oi_md; GOTO(done, rc = 0); cleanup: @@ -569,15 +605,15 @@ cleanup: continue; sub_exp = lov->tgts[req->rq_idx].ltd_exp; - err = obd_destroy(sub_exp, req->rq_oa, NULL, oti, NULL); + err = obd_destroy(sub_exp, req->rq_oi.oi_oa, NULL, oti, NULL); if (err) CERROR("Failed to uncreate objid "LPX64" subobj " LPX64" on OST idx %d: rc = %d\n", - set->set_oa->o_id, req->rq_oa->o_id, + src_oa->o_id, req->rq_oi.oi_oa->o_id, req->rq_idx, rc); } if (*lsmp == NULL) - obd_free_memmd(exp, &set->set_md); + obd_free_memmd(exp, &set->set_oi->oi_md); done: if (oti && set->set_cookies) { oti->oti_logcookies = set->set_cookies; @@ -612,7 +648,7 @@ int lov_update_create_set(struct lov_request_set *set, struct lov_request *req, int rc) { struct obd_trans_info *oti = set->set_oti; - struct lov_stripe_md *lsm = set->set_md; + struct lov_stripe_md *lsm = set->set_oi->oi_md; struct lov_oinfo *loi; struct lov_obd *lov = &set->set_exp->exp_obd->u.lov; ENTRY; @@ -623,7 +659,7 @@ int lov_update_create_set(struct lov_request_set *set, if (rc && lov->tgts[req->rq_idx].ltd_active) { CERROR("error creating fid "LPX64" sub-object" " on OST idx %d/%d: rc = %d\n", - set->set_oa->o_id, req->rq_idx, + set->set_oi->oi_oa->o_id, req->rq_idx, lsm->lsm_stripe_count, rc); if (rc > 0) 
{ CERROR("obd_create returned invalid err %d\n", rc); @@ -635,9 +671,9 @@ int lov_update_create_set(struct lov_request_set *set, RETURN(rc); if (oti && oti->oti_objid) - oti->oti_objid[req->rq_idx] = req->rq_oa->o_id; + oti->oti_objid[req->rq_idx] = req->rq_oi.oi_oa->o_id; - loi->loi_id = req->rq_oa->o_id; + loi->loi_id = req->rq_oi.oi_oa->o_id; loi->loi_ost_idx = req->rq_idx; CDEBUG(D_INODE, "objid "LPX64" has subobj "LPX64"/"LPX64" at idx %d\n", lsm->lsm_object_id, loi->loi_id, loi->loi_id, req->rq_idx); @@ -645,14 +681,15 @@ int lov_update_create_set(struct lov_request_set *set, if (oti && set->set_cookies) ++oti->oti_logcookies; - if (req->rq_oa->o_valid & OBD_MD_FLCOOKIE) + if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLCOOKIE) set->set_cookie_sent++; RETURN(0); } -int lov_prep_create_set(struct obd_export *exp, struct lov_stripe_md **lsmp, - struct obdo *src_oa, struct obd_trans_info *oti, +int lov_prep_create_set(struct obd_export *exp, struct obd_info *oinfo, + struct lov_stripe_md **lsmp, struct obdo *src_oa, + struct obd_trans_info *oti, struct lov_request_set **reqset) { struct lov_request_set *set; @@ -665,8 +702,9 @@ int lov_prep_create_set(struct obd_export *exp, struct lov_stripe_md **lsmp, lov_init_set(set); set->set_exp = exp; - set->set_md = *lsmp; - set->set_oa = src_oa; + set->set_oi = oinfo; + set->set_oi->oi_md = *lsmp; + set->set_oi->oi_oa = src_oa; set->set_oti = oti; rc = qos_prep_create(exp, set); @@ -685,7 +723,9 @@ static int common_attr_done(struct lov_request_set *set) int rc = 0, attrset = 0; ENTRY; - if (set->set_oa == NULL) + LASSERT(set->set_oi != NULL); + + if (set->set_oi->oi_oa == NULL) RETURN(0); if (!set->set_success) @@ -700,17 +740,18 @@ static int common_attr_done(struct lov_request_set *set) if (!req->rq_complete || req->rq_rc) continue; - if (req->rq_oa->o_valid == 0) /* inactive stripe */ + if (req->rq_oi.oi_oa->o_valid == 0) /* inactive stripe */ continue; - lov_merge_attrs(tmp_oa, req->rq_oa, req->rq_oa->o_valid, - 
set->set_md, req->rq_stripe, &attrset); + lov_merge_attrs(tmp_oa, req->rq_oi.oi_oa, + req->rq_oi.oi_oa->o_valid, + set->set_oi->oi_md, req->rq_stripe, &attrset); } if (!attrset) { CERROR("No stripes had valid attrs\n"); rc = -EIO; } - tmp_oa->o_id = set->set_oa->o_id; - memcpy(set->set_oa, tmp_oa, sizeof(*set->set_oa)); + tmp_oa->o_id = set->set_oi->oi_oa->o_id; + memcpy(set->set_oi->oi_oa, tmp_oa, sizeof(*set->set_oi->oi_oa)); out: if (tmp_oa) obdo_free(tmp_oa); @@ -720,7 +761,7 @@ out: static int brw_done(struct lov_request_set *set) { - struct lov_stripe_md *lsm = set->set_md; + struct lov_stripe_md *lsm = set->set_oi->oi_md; struct lov_oinfo *loi = NULL; struct list_head *pos; struct lov_request *req; @@ -734,8 +775,8 @@ static int brw_done(struct lov_request_set *set) loi = &lsm->lsm_oinfo[req->rq_stripe]; - if (req->rq_oa->o_valid & OBD_MD_FLBLOCKS) - loi->loi_lvb.lvb_blocks = req->rq_oa->o_blocks; + if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLBLOCKS) + loi->loi_lvb.lvb_blocks = req->rq_oi.oi_oa->o_blocks; } RETURN(0); @@ -759,9 +800,9 @@ int lov_fini_brw_set(struct lov_request_set *set) RETURN(rc); } -int lov_prep_brw_set(struct obd_export *exp, struct obdo *src_oa, - struct lov_stripe_md *lsm, obd_count oa_bufs, - struct brw_page *pga, struct obd_trans_info *oti, +int lov_prep_brw_set(struct obd_export *exp, struct obd_info *oinfo, + obd_count oa_bufs, struct brw_page *pga, + struct obd_trans_info *oti, struct lov_request_set **reqset) { struct { @@ -781,27 +822,27 @@ int lov_prep_brw_set(struct obd_export *exp, struct obdo *src_oa, lov_init_set(set); set->set_exp = exp; - set->set_md = lsm; - set->set_oa = src_oa; set->set_oti = oti; + set->set_oi = oinfo; set->set_oabufs = oa_bufs; OBD_ALLOC(set->set_pga, oa_bufs * sizeof(*set->set_pga)); if (!set->set_pga) GOTO(out, rc = -ENOMEM); - OBD_ALLOC(info, sizeof(*info) * lsm->lsm_stripe_count); + OBD_ALLOC(info, sizeof(*info) * oinfo->oi_md->lsm_stripe_count); if (!info) GOTO(out, rc = -ENOMEM); /* calculate the 
page count for each stripe */ for (i = 0; i < oa_bufs; i++) { - int stripe = lov_stripe_number(lsm, pga[i].off); + int stripe = lov_stripe_number(oinfo->oi_md, pga[i].off); info[stripe].count++; } /* alloc and initialize lov request */ shift = 0; - for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++, loi++){ + for (i = 0, loi = oinfo->oi_md->lsm_oinfo; + i < oinfo->oi_md->lsm_stripe_count; i++, loi++){ struct lov_request *req; if (info[i].count == 0) @@ -816,21 +857,23 @@ int lov_prep_brw_set(struct obd_export *exp, struct obdo *src_oa, if (req == NULL) GOTO(out, rc = -ENOMEM); - req->rq_oa = obdo_alloc(); - if (req->rq_oa == NULL) { + req->rq_oi.oi_oa = obdo_alloc(); + if (req->rq_oi.oi_oa == NULL) { OBD_FREE(req, sizeof(*req)); GOTO(out, rc = -ENOMEM); } - if (src_oa) - memcpy(req->rq_oa, src_oa, sizeof(*req->rq_oa)); - req->rq_oa->o_id = loi->loi_id; - req->rq_oa->o_stripe_idx = i; + if (oinfo->oi_oa) { + memcpy(req->rq_oi.oi_oa, oinfo->oi_oa, + sizeof(*req->rq_oi.oi_oa)); + } + req->rq_oi.oi_oa->o_id = loi->loi_id; + req->rq_oi.oi_oa->o_stripe_idx = i; - req->rq_buflen = sizeof(*req->rq_md); - OBD_ALLOC(req->rq_md, req->rq_buflen); - if (req->rq_md == NULL) { - obdo_free(req->rq_oa); + req->rq_buflen = sizeof(*req->rq_oi.oi_md); + OBD_ALLOC(req->rq_oi.oi_md, req->rq_buflen); + if (req->rq_oi.oi_md == NULL) { + obdo_free(req->rq_oi.oi_oa); OBD_FREE(req, sizeof(*req)); GOTO(out, rc = -ENOMEM); } @@ -839,8 +882,8 @@ int lov_prep_brw_set(struct obd_export *exp, struct obdo *src_oa, req->rq_stripe = i; /* XXX LOV STACKING */ - req->rq_md->lsm_object_id = loi->loi_id; - req->rq_md->lsm_object_gr = lsm->lsm_object_gr; + req->rq_oi.oi_md->lsm_object_id = loi->loi_id; + req->rq_oi.oi_md->lsm_object_gr = oinfo->oi_md->lsm_object_gr; req->rq_oabufs = info[i].count; req->rq_pgaidx = shift; shift += req->rq_oabufs; @@ -855,18 +898,18 @@ int lov_prep_brw_set(struct obd_export *exp, struct obdo *src_oa, /* rotate & sort the brw_page array */ for (i = 0; i < 
oa_bufs; i++) { - int stripe = lov_stripe_number(lsm, pga[i].off); + int stripe = lov_stripe_number(oinfo->oi_md, pga[i].off); shift = info[stripe].index + info[stripe].off; LASSERT(shift < oa_bufs); set->set_pga[shift] = pga[i]; - lov_stripe_offset(lsm, pga[i].off, stripe, + lov_stripe_offset(oinfo->oi_md, pga[i].off, stripe, &set->set_pga[shift].off); info[stripe].off++; } out: if (info) - OBD_FREE(info, sizeof(*info) * lsm->lsm_stripe_count); + OBD_FREE(info, sizeof(*info) * oinfo->oi_md->lsm_stripe_count); if (rc == 0) *reqset = set; @@ -893,8 +936,16 @@ int lov_fini_getattr_set(struct lov_request_set *set) RETURN(rc); } -int lov_prep_getattr_set(struct obd_export *exp, struct obdo *src_oa, - struct lov_stripe_md *lsm, +/* The callback for osc_getattr_async that finilizes a request info when a + * response is recieved. */ +static int cb_getattr_update(struct obd_info *oinfo, int rc) +{ + struct lov_request *lovreq; + lovreq = container_of(oinfo, struct lov_request, rq_oi); + return lov_update_common_set(lovreq->rq_rqset, lovreq, rc); +} + +int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo, struct lov_request_set **reqset) { struct lov_request_set *set; @@ -909,11 +960,10 @@ int lov_prep_getattr_set(struct obd_export *exp, struct obdo *src_oa, lov_init_set(set); set->set_exp = exp; - set->set_md = lsm; - set->set_oa = src_oa; + set->set_oi = oinfo; - loi = lsm->lsm_oinfo; - for (i = 0; i < lsm->lsm_stripe_count; i++, loi++) { + loi = oinfo->oi_md->lsm_oinfo; + for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++, loi++) { struct lov_request *req; if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) { @@ -928,13 +978,15 @@ int lov_prep_getattr_set(struct obd_export *exp, struct obdo *src_oa, req->rq_stripe = i; req->rq_idx = loi->loi_ost_idx; - req->rq_oa = obdo_alloc(); - if (req->rq_oa == NULL) { + req->rq_oi.oi_oa = obdo_alloc(); + if (req->rq_oi.oi_oa == NULL) { OBD_FREE(req, sizeof(*req)); GOTO(out_set, rc = -ENOMEM); } - memcpy(req->rq_oa, 
src_oa, sizeof(*req->rq_oa)); - req->rq_oa->o_id = loi->loi_id; + memcpy(req->rq_oi.oi_oa, oinfo->oi_oa, + sizeof(*req->rq_oi.oi_oa)); + req->rq_oi.oi_oa->o_id = loi->loi_id; + req->rq_oi.oi_cb_up = cb_getattr_update; lov_set_add_req(req, set); } @@ -964,15 +1016,15 @@ int lov_fini_destroy_set(struct lov_request_set *set) RETURN(0); } -int lov_prep_destroy_set(struct obd_export *exp, struct obdo *src_oa, - struct lov_stripe_md *lsm, +int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo, + struct obdo *src_oa, struct lov_stripe_md *lsm, struct obd_trans_info *oti, struct lov_request_set **reqset) { struct lov_request_set *set; struct lov_oinfo *loi = NULL; struct lov_obd *lov = &exp->exp_obd->u.lov; - int rc = 0, cookie_set = 0, i; + int rc = 0, i; ENTRY; OBD_ALLOC(set, sizeof(*set)); @@ -981,8 +1033,9 @@ int lov_prep_destroy_set(struct obd_export *exp, struct obdo *src_oa, lov_init_set(set); set->set_exp = exp; - set->set_md = lsm; - set->set_oa = src_oa; + set->set_oi = oinfo; + set->set_oi->oi_md = lsm; + set->set_oi->oi_oa = src_oa; set->set_oti = oti; if (oti != NULL && src_oa->o_valid & OBD_MD_FLCOOKIE) set->set_cookies = oti->oti_logcookies; @@ -1003,19 +1056,13 @@ int lov_prep_destroy_set(struct obd_export *exp, struct obdo *src_oa, req->rq_stripe = i; req->rq_idx = loi->loi_ost_idx; - req->rq_oa = obdo_alloc(); - if (req->rq_oa == NULL) { + req->rq_oi.oi_oa = obdo_alloc(); + if (req->rq_oi.oi_oa == NULL) { OBD_FREE(req, sizeof(*req)); GOTO(out_set, rc = -ENOMEM); } - memcpy(req->rq_oa, src_oa, sizeof(*req->rq_oa)); - req->rq_oa->o_id = loi->loi_id; - - /* Setup the first request's cookie position */ - if (oti && !cookie_set && set->set_cookies) { - oti->oti_logcookies = set->set_cookies + i; - cookie_set = 1; - } + memcpy(req->rq_oi.oi_oa, src_oa, sizeof(*req->rq_oi.oi_oa)); + req->rq_oi.oi_oa->o_id = loi->loi_id; lov_set_add_req(req, set); } if (!set->set_count) @@ -1045,8 +1092,45 @@ int lov_fini_setattr_set(struct lov_request_set *set) 
RETURN(rc); } -int lov_prep_setattr_set(struct obd_export *exp, struct obdo *src_oa, - struct lov_stripe_md *lsm, struct obd_trans_info *oti, +int lov_update_setattr_set(struct lov_request_set *set, + struct lov_request *req, int rc) +{ + struct lov_obd *lov = &req->rq_rqset->set_exp->exp_obd->u.lov; + struct lov_stripe_md *lsm = req->rq_rqset->set_oi->oi_md; + ENTRY; + + lov_update_set(set, req, rc); + + /* grace error on inactive ost */ + if (rc && !lov->tgts[req->rq_idx].ltd_active) + rc = 0; + + if (rc == 0) { + if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLCTIME) + lsm->lsm_oinfo[req->rq_stripe].loi_lvb.lvb_ctime = + req->rq_oi.oi_oa->o_ctime; + if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLMTIME) + lsm->lsm_oinfo[req->rq_stripe].loi_lvb.lvb_mtime = + req->rq_oi.oi_oa->o_mtime; + if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLATIME) + lsm->lsm_oinfo[req->rq_stripe].loi_lvb.lvb_atime = + req->rq_oi.oi_oa->o_atime; + } + + RETURN(rc); +} + +/* The callback for osc_setattr_async that finilizes a request info when a + * response is recieved. 
*/ +static int cb_setattr_update(struct obd_info *oinfo, int rc) +{ + struct lov_request *lovreq; + lovreq = container_of(oinfo, struct lov_request, rq_oi); + return lov_update_setattr_set(lovreq->rq_rqset, lovreq, rc); +} + +int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, struct lov_request_set **reqset) { struct lov_request_set *set; @@ -1061,11 +1145,13 @@ int lov_prep_setattr_set(struct obd_export *exp, struct obdo *src_oa, lov_init_set(set); set->set_exp = exp; - set->set_md = lsm; - set->set_oa = src_oa; + set->set_oti = oti; + set->set_oi = oinfo; + if (oti != NULL && oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) + set->set_cookies = oti->oti_logcookies; - loi = lsm->lsm_oinfo; - for (i = 0; i < lsm->lsm_stripe_count; i++, loi++) { + loi = oinfo->oi_md->lsm_oinfo; + for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++, loi++) { struct lov_request *req; if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) { @@ -1079,22 +1165,29 @@ int lov_prep_setattr_set(struct obd_export *exp, struct obdo *src_oa, req->rq_stripe = i; req->rq_idx = loi->loi_ost_idx; - req->rq_oa = obdo_alloc(); - if (req->rq_oa == NULL) { + req->rq_oi.oi_oa = obdo_alloc(); + if (req->rq_oi.oi_oa == NULL) { OBD_FREE(req, sizeof(*req)); GOTO(out_set, rc = -ENOMEM); } - memcpy(req->rq_oa, src_oa, sizeof(*req->rq_oa)); - req->rq_oa->o_id = loi->loi_id; - req->rq_oa->o_stripe_idx = i; - - if (src_oa->o_valid & OBD_MD_FLSIZE) { - if (lov_stripe_offset(lsm, src_oa->o_size, i, - &req->rq_oa->o_size) < 0 && - req->rq_oa->o_size) - req->rq_oa->o_size--; + memcpy(req->rq_oi.oi_oa, oinfo->oi_oa, + sizeof(*req->rq_oi.oi_oa)); + req->rq_oi.oi_oa->o_id = loi->loi_id; + req->rq_oi.oi_oa->o_stripe_idx = i; + req->rq_oi.oi_cb_up = cb_setattr_update; + req->rq_rqset = set; + + if (oinfo->oi_oa->o_valid & OBD_MD_FLSIZE) { + int off = lov_stripe_offset(oinfo->oi_md, + oinfo->oi_oa->o_size, i, + &req->rq_oi.oi_oa->o_size); + + if (off < 0 && req->rq_oi.oi_oa->o_size) + 
req->rq_oi.oi_oa->o_size--; + CDEBUG(D_INODE, "stripe %d has size "LPU64"/"LPU64"\n", - i, req->rq_oa->o_size, src_oa->o_size); + i, req->rq_oi.oi_oa->o_size, + oinfo->oi_oa->o_size); } lov_set_add_req(req, set); } @@ -1107,49 +1200,6 @@ out_set: RETURN(rc); } -int lov_update_setattr_set(struct lov_request_set *set, - struct lov_request *req, int rc) -{ - struct lov_obd *lov = &set->set_exp->exp_obd->u.lov; - struct lov_stripe_md *lsm = set->set_md; - ENTRY; - - lov_update_set(set, req, rc); - - /* grace error on inactive ost */ - if (rc && !lov->tgts[req->rq_idx].ltd_active) - rc = 0; - - /* FIXME: LOV STACKING update loi data should be done by OSC * - * when this is gone we can go back to using lov_update_common_set() */ - if (rc == 0) { - if (req->rq_oa->o_valid & OBD_MD_FLMTIME) - lsm->lsm_oinfo[req->rq_stripe].loi_lvb.lvb_ctime = - req->rq_oa->o_ctime; - if (req->rq_oa->o_valid & OBD_MD_FLMTIME) - lsm->lsm_oinfo[req->rq_stripe].loi_lvb.lvb_mtime = - req->rq_oa->o_mtime; - if (req->rq_oa->o_valid & OBD_MD_FLATIME) - lsm->lsm_oinfo[req->rq_stripe].loi_lvb.lvb_atime = - req->rq_oa->o_atime; - } - - RETURN(rc); -} - -int lov_update_punch_set(struct lov_request_set *set, struct lov_request *req, - int rc) -{ - struct lov_obd *lov = &set->set_exp->exp_obd->u.lov; - ENTRY; - - lov_update_set(set, req, rc); - if (rc && !lov->tgts[req->rq_idx].ltd_active) - rc = 0; - /* FIXME in raid1 regime, should return 0 */ - RETURN(rc); -} - int lov_fini_punch_set(struct lov_request_set *set) { int rc = 0; @@ -1170,9 +1220,17 @@ int lov_fini_punch_set(struct lov_request_set *set) RETURN(rc); } -int lov_prep_punch_set(struct obd_export *exp, struct obdo *src_oa, - struct lov_stripe_md *lsm, obd_off start, - obd_off end, struct obd_trans_info *oti, +/* The callback for osc_punch that finilizes a request info when a response + * is recieved. 
*/ +static int cb_update_punch(struct obd_info *oinfo, int rc) +{ + struct lov_request *lovreq; + lovreq = container_of(oinfo, struct lov_request, rq_oi); + return lov_update_common_set(lovreq->rq_rqset, lovreq, rc); +} + +int lov_prep_punch_set(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, struct lov_request_set **reqset) { struct lov_request_set *set; @@ -1186,12 +1244,11 @@ int lov_prep_punch_set(struct obd_export *exp, struct obdo *src_oa, RETURN(-ENOMEM); lov_init_set(set); + set->set_oi = oinfo; set->set_exp = exp; - set->set_md = lsm; - set->set_oa = src_oa; - loi = lsm->lsm_oinfo; - for (i = 0; i < lsm->lsm_stripe_count; i++, loi++) { + loi = oinfo->oi_md->lsm_oinfo; + for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++, loi++) { struct lov_request *req; obd_off rs, re; @@ -1200,7 +1257,10 @@ int lov_prep_punch_set(struct obd_export *exp, struct obdo *src_oa, continue; } - if (!lov_stripe_intersects(lsm, i, start, end, &rs, &re)) + if (!lov_stripe_intersects(oinfo->oi_md, i, + oinfo->oi_policy.l_extent.start, + oinfo->oi_policy.l_extent.end, + &rs, &re)) continue; OBD_ALLOC(req, sizeof(*req)); @@ -1209,18 +1269,21 @@ int lov_prep_punch_set(struct obd_export *exp, struct obdo *src_oa, req->rq_stripe = i; req->rq_idx = loi->loi_ost_idx; - req->rq_oa = obdo_alloc(); - if (req->rq_oa == NULL) { + req->rq_oi.oi_oa = obdo_alloc(); + if (req->rq_oi.oi_oa == NULL) { OBD_FREE(req, sizeof(*req)); GOTO(out_set, rc = -ENOMEM); } - memcpy(req->rq_oa, src_oa, sizeof(*req->rq_oa)); - req->rq_oa->o_id = loi->loi_id; - req->rq_oa->o_stripe_idx = i; + memcpy(req->rq_oi.oi_oa, oinfo->oi_oa, + sizeof(*req->rq_oi.oi_oa)); + req->rq_oi.oi_oa->o_id = loi->loi_id; + req->rq_oi.oi_oa->o_stripe_idx = i; + req->rq_oi.oi_cb_up = cb_update_punch; + req->rq_rqset = set; - req->rq_extent.start = rs; - req->rq_extent.end = re; - req->rq_extent.gid = -1; + req->rq_oi.oi_policy.l_extent.start = rs; + req->rq_oi.oi_policy.l_extent.end = re; + 
req->rq_oi.oi_policy.l_extent.gid = -1; lov_set_add_req(req, set); } @@ -1253,9 +1316,10 @@ int lov_fini_sync_set(struct lov_request_set *set) RETURN(rc); } -int lov_prep_sync_set(struct obd_export *exp, struct obdo *src_oa, - struct lov_stripe_md *lsm, obd_off start, - obd_off end, struct lov_request_set **reqset) +int lov_prep_sync_set(struct obd_export *exp, struct obd_info *oinfo, + struct obdo *src_oa, struct lov_stripe_md *lsm, + obd_off start, obd_off end, + struct lov_request_set **reqset) { struct lov_request_set *set; struct lov_oinfo *loi = NULL; @@ -1269,8 +1333,9 @@ int lov_prep_sync_set(struct obd_export *exp, struct obdo *src_oa, lov_init_set(set); set->set_exp = exp; - set->set_md = lsm; - set->set_oa = src_oa; + set->set_oi = oinfo; + set->set_oi->oi_md = lsm; + set->set_oi->oi_oa = src_oa; loi = lsm->lsm_oinfo; for (i = 0; i < lsm->lsm_stripe_count; i++, loi++) { @@ -1291,18 +1356,18 @@ int lov_prep_sync_set(struct obd_export *exp, struct obdo *src_oa, req->rq_stripe = i; req->rq_idx = loi->loi_ost_idx; - req->rq_oa = obdo_alloc(); - if (req->rq_oa == NULL) { + req->rq_oi.oi_oa = obdo_alloc(); + if (req->rq_oi.oi_oa == NULL) { OBD_FREE(req, sizeof(*req)); GOTO(out_set, rc = -ENOMEM); } - memcpy(req->rq_oa, src_oa, sizeof(*req->rq_oa)); - req->rq_oa->o_id = loi->loi_id; - req->rq_oa->o_stripe_idx = i; + memcpy(req->rq_oi.oi_oa, src_oa, sizeof(*req->rq_oi.oi_oa)); + req->rq_oi.oi_oa->o_id = loi->loi_id; + req->rq_oi.oi_oa->o_stripe_idx = i; - req->rq_extent.start = rs; - req->rq_extent.end = re; - req->rq_extent.gid = -1; + req->rq_oi.oi_policy.l_extent.start = rs; + req->rq_oi.oi_policy.l_extent.end = re; + req->rq_oi.oi_policy.l_extent.gid = -1; lov_set_add_req(req, set); } @@ -1314,3 +1379,179 @@ out_set: lov_fini_sync_set(set); RETURN(rc); } + +#define LOV_U64_MAX ((__u64)~0ULL) +#define LOV_SUM_MAX(tot, add) \ + do { \ + if ((tot) + (add) < (tot)) \ + (tot) = LOV_U64_MAX; \ + else \ + (tot) += (add); \ + } while(0) + +int lov_fini_statfs(struct 
obd_device *obd, struct obd_statfs *osfs,int success) +{ + ENTRY; + + if (success) { + __u32 expected_stripes = lov_get_stripecnt(&obd->u.lov, 0); + + if (osfs->os_files != LOV_U64_MAX) + do_div(osfs->os_files, expected_stripes); + if (osfs->os_ffree != LOV_U64_MAX) + do_div(osfs->os_ffree, expected_stripes); + + spin_lock(&obd->obd_osfs_lock); + memcpy(&obd->obd_osfs, osfs, sizeof(*osfs)); /* cache the whole struct, not pointer-size bytes */ + obd->obd_osfs_age = jiffies; + spin_unlock(&obd->obd_osfs_lock); + RETURN(0); + } + + RETURN(-EIO); +} + +int lov_fini_statfs_set(struct lov_request_set *set) +{ + int rc = 0; + ENTRY; + + if (set == NULL) + RETURN(0); + + if (set->set_completes) { + rc = lov_fini_statfs(set->set_obd, set->set_oi->oi_osfs, + set->set_success); + } + + if (atomic_dec_and_test(&set->set_refcount)) + lov_finish_set(set); + + RETURN(rc); +} + +void lov_update_statfs(struct obd_device *obd, struct obd_statfs *osfs, + struct obd_statfs *lov_sfs, int success) +{ + spin_lock(&obd->obd_osfs_lock); + memcpy(&obd->obd_osfs, lov_sfs, sizeof(*lov_sfs)); /* cache the whole struct, not pointer-size bytes */ + obd->obd_osfs_age = jiffies; + spin_unlock(&obd->obd_osfs_lock); + + if (success == 0) { + memcpy(osfs, lov_sfs, sizeof(*lov_sfs)); + } else { +#ifdef MIN_DF + /* Sandia requested that df (and so, statfs) only + returned minimal available space on + a single OST, so people would be able to + write this much data guaranteed. */ + if (osfs->os_bavail > lov_sfs->os_bavail) { + /* Presumably if new bavail is smaller, + new bfree is bigger as well */ + osfs->os_bfree = lov_sfs->os_bfree; + osfs->os_bavail = lov_sfs->os_bavail; + } +#else + osfs->os_bfree += lov_sfs->os_bfree; + osfs->os_bavail += lov_sfs->os_bavail; +#endif + osfs->os_blocks += lov_sfs->os_blocks; + /* XXX not sure about this one - depends on policy. 
+ * - could be minimum if we always stripe on all OBDs + * (but that would be wrong for any other policy, + * if one of the OBDs has no more objects left) + * - could be sum if we stripe whole objects + * - could be average, just to give a nice number + * + * To give a "reasonable" (if not wholly accurate) + * number, we divide the total number of free objects + * by expected stripe count (watch out for overflow). + */ + LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files); + LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree); + } +} + +/* The callback for osc_statfs_async that finalizes a request info when a + * response is received. */ +static int cb_statfs_update(struct obd_info *oinfo, int rc) +{ + struct lov_request *lovreq; + struct obd_statfs *osfs, *lov_sfs; + struct obd_device *obd; + struct lov_obd *lov; + int success; + ENTRY; + + lovreq = container_of(oinfo, struct lov_request, rq_oi); + lov = &lovreq->rq_rqset->set_obd->u.lov; + obd = class_exp2obd(lov->tgts[lovreq->rq_idx].ltd_exp); + + osfs = lovreq->rq_rqset->set_oi->oi_osfs; + lov_sfs = oinfo->oi_osfs; + + success = lovreq->rq_rqset->set_success; + + /* XXX: the same is done in lov_update_common_set, however + lovset->set_exp is not initialized. 
*/ + lov_update_set(lovreq->rq_rqset, lovreq, rc); + if (rc) { + if (rc && !lov->tgts[lovreq->rq_idx].ltd_active) + rc = 0; + RETURN(rc); + } + + lov_update_statfs(obd, osfs, lov_sfs, success); + RETURN(0); +} + +int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, + struct lov_request_set **reqset) +{ + struct lov_request_set *set; + struct lov_obd *lov = &obd->u.lov; + int rc = 0, i; + ENTRY; + + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + RETURN(-ENOMEM); + lov_init_set(set); + + set->set_obd = obd; + set->set_oi = oinfo; + + /* We only get block data from the OBD */ + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + struct lov_request *req; + + if (lov->tgts[i].ltd_active == 0) { + CDEBUG(D_HA, "lov idx %d inactive\n", i); + continue; + } + + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) + GOTO(out_set, rc = -ENOMEM); + + OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs)); + if (req->rq_oi.oi_osfs == NULL) { + OBD_FREE(req, sizeof(*req)); + GOTO(out_set, rc = -ENOMEM); + } + + req->rq_idx = i; + req->rq_oi.oi_cb_up = cb_statfs_update; + req->rq_rqset = set; + + lov_set_add_req(req, set); + } + if (!set->set_count) + GOTO(out_set, rc = -EIO); + *reqset = set; + RETURN(rc); +out_set: + lov_fini_statfs_set(set); + RETURN(rc); +} diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c index 5bf7a39..acf65da 100644 --- a/lustre/lvfs/fsfilt_ext3.c +++ b/lustre/lvfs/fsfilt_ext3.c @@ -54,6 +54,7 @@ #include #endif #include +#include #ifdef EXT3_MULTIBLOCK_ALLOCATOR #include @@ -462,7 +463,7 @@ static int fsfilt_ext3_setattr(struct dentry *dentry, void *handle, struct iattr *iattr, int do_trunc) { struct inode *inode = dentry->d_inode; - int rc; + int rc = 0; lock_kernel(); @@ -476,12 +477,26 @@ static int fsfilt_ext3_setattr(struct dentry *dentry, void *handle, iattr->ia_valid &= ~ATTR_SIZE; EXT3_I(inode)->i_disksize = inode->i_size = iattr->ia_size; - /* make sure _something_ gets set - so new inode - * goes to disk 
(probably won't work over XFS */ - if (!(iattr->ia_valid & (ATTR_MODE | ATTR_MTIME | ATTR_CTIME))){ - iattr->ia_valid |= ATTR_MTIME; - iattr->ia_mtime = inode->i_mtime; + if (iattr->ia_valid & ATTR_UID) + inode->i_uid = iattr->ia_uid; + if (iattr->ia_valid & ATTR_GID) + inode->i_gid = iattr->ia_gid; + if (iattr->ia_valid & ATTR_ATIME) + inode->i_atime = iattr->ia_atime; + if (iattr->ia_valid & ATTR_MTIME) + inode->i_mtime = iattr->ia_mtime; + if (iattr->ia_valid & ATTR_CTIME) + inode->i_ctime = iattr->ia_ctime; + if (iattr->ia_valid & ATTR_MODE) { + inode->i_mode = iattr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + inode->i_mode &= ~S_ISGID; } + + inode->i_sb->s_op->dirty_inode(inode); + + goto out; } /* Don't allow setattr to change file type */ @@ -501,8 +516,9 @@ static int fsfilt_ext3_setattr(struct dentry *dentry, void *handle, rc = inode_setattr(inode, iattr); } + out: unlock_kernel(); - return rc; + RETURN(rc); } static int fsfilt_ext3_iocontrol(struct inode * inode, struct file *file, diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c index 2b010a9..73d7747 100644 --- a/lustre/mdc/mdc_locks.c +++ b/lustre/mdc/mdc_locks.c @@ -389,10 +389,9 @@ int mdc_enqueue(struct obd_export *exp, ptlrpc_req_set_repsize(req, repbufcnt, repsize); mdc_get_rpc_lock(obddev->u.cli.cl_rpc_lock, it); - rc = ldlm_cli_enqueue(exp, req, obddev->obd_namespace, res_id, - lock_type, &policy,lock_mode, &flags, cb_blocking, - cb_completion, NULL, cb_data, NULL, 0, NULL, - lockh); + rc = ldlm_cli_enqueue(exp, &req, res_id, lock_type, &policy, + lock_mode, &flags, cb_blocking, cb_completion, + NULL, cb_data, NULL, 0, NULL, lockh, 0); mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it); /* Similarly, if we're going to replay this request, we don't want to diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index dcf5e82..9eb7dc7 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -183,10 +183,10 @@ struct dentry *mds_fid2locked_dentry(struct 
obd_device *obd, struct ll_fid *fid, res_id.name[0] = de->d_inode->i_ino; res_id.name[1] = de->d_inode->i_generation; - rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, res_id, - LDLM_IBITS, &policy, lock_mode, &flags, - ldlm_blocking_ast, ldlm_completion_ast, - NULL, NULL, NULL, 0, NULL, lockh); + rc = ldlm_cli_enqueue_local(obd->obd_namespace, res_id, + LDLM_IBITS, &policy, lock_mode, &flags, + ldlm_blocking_ast, ldlm_completion_ast, + NULL, NULL, 0, NULL, lockh); if (rc != ELDLM_OK) { l_dput(de); retval = ERR_PTR(-EIO); /* XXX translate ldlm code */ @@ -1767,8 +1767,7 @@ int mds_update_server_data(struct obd_device *obd, int force_sync) RETURN(rc); } -static -void fsoptions_to_mds_flags(struct mds_obd *mds, char *options) +static void fsoptions_to_mds_flags(struct mds_obd *mds, char *options) { char *p = options; @@ -1783,22 +1782,24 @@ void fsoptions_to_mds_flags(struct mds_obd *mds, char *options) memcmp(options, "user_xattr", len) == 0) { mds->mds_fl_user_xattr = 1; } else if (len == sizeof("nouser_xattr") - 1 && - memcmp(options, "nouser_xattr", len) == 0) { + memcmp(options, "nouser_xattr", len) == 0) { mds->mds_fl_user_xattr = 0; } else if (len == sizeof("acl") - 1 && - memcmp(options, "acl", len) == 0) { + memcmp(options, "acl", len) == 0) { #ifdef CONFIG_FS_POSIX_ACL mds->mds_fl_acl = 1; #else CWARN("ignoring unsupported acl mount option\n"); memmove(options, p, strlen(p) + 1); + p = options; #endif } else if (len == sizeof("noacl") - 1 && - memcmp(options, "noacl", len) == 0) { + memcmp(options, "noacl", len) == 0) { #ifdef CONFIG_FS_POSIX_ACL mds->mds_fl_acl = 0; #else memmove(options, p, strlen(p) + 1); + p = options; #endif } diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c index f0bd52e..0942b54 100644 --- a/lustre/mds/mds_open.c +++ b/lustre/mds/mds_open.c @@ -312,11 +312,10 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, { struct inode *inode = dchild->d_inode; struct obd_trans_info oti = { 0 }; - struct 
lov_stripe_md *lsm = NULL; struct lov_mds_md *lmm = NULL; int rc, lmm_size; struct mds_body *body; - struct obdo *oa; + struct obd_info oinfo = { { { 0 } } }; void *lmm_buf; ENTRY; @@ -372,18 +371,18 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_MDS_ALLOC_OBDO)) GOTO(out_ids, rc = -ENOMEM); - oa = obdo_alloc(); - if (oa == NULL) + oinfo.oi_oa = obdo_alloc(); + if (oinfo.oi_oa == NULL) GOTO(out_ids, rc = -ENOMEM); - oa->o_uid = 0; /* must have 0 uid / gid on OST */ - oa->o_gid = 0; - oa->o_mode = S_IFREG | 0600; - oa->o_id = inode->i_ino; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLFLAGS | + oinfo.oi_oa->o_uid = 0; /* must have 0 uid / gid on OST */ + oinfo.oi_oa->o_gid = 0; + oinfo.oi_oa->o_mode = S_IFREG | 0600; + oinfo.oi_oa->o_id = inode->i_ino; + oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLFLAGS | OBD_MD_FLMODE | OBD_MD_FLUID | OBD_MD_FLGID; - oa->o_size = 0; + oinfo.oi_oa->o_size = 0; - obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME | + obdo_from_inode(oinfo.oi_oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME); if (!(rec->ur_flags & MDS_OPEN_HAS_OBJS)) { @@ -391,7 +390,7 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, if (rec->ur_flags & MDS_OPEN_HAS_EA) { rc = obd_iocontrol(OBD_IOC_LOV_SETSTRIPE, mds->mds_osc_exp, - 0, &lsm, rec->ur_eadata); + 0, &oinfo.oi_md, rec->ur_eadata); if (rc) GOTO(out_oa, rc); } else { @@ -405,12 +404,13 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, if (rc > 0) rc = obd_iocontrol(OBD_IOC_LOV_SETSTRIPE, mds->mds_osc_exp, - 0, &lsm, lmm); + 0, &oinfo.oi_md, lmm); OBD_FREE(lmm, mds->mds_max_mdsize); if (rc) GOTO(out_oa, rc); } - rc = obd_create(mds->mds_osc_exp, oa, &lsm, &oti); + rc = obd_create(mds->mds_osc_exp, oinfo.oi_oa, + &oinfo.oi_md, &oti); if (rc) { int level = D_ERROR; if (rc == -ENOSPC) @@ -427,28 +427,30 @@ static int mds_create_objects(struct 
ptlrpc_request *req, int offset, } } else { rc = obd_iocontrol(OBD_IOC_LOV_SETEA, mds->mds_osc_exp, - 0, &lsm, rec->ur_eadata); + 0, &oinfo.oi_md, rec->ur_eadata); if (rc) { GOTO(out_oa, rc); } - lsm->lsm_object_id = oa->o_id; + oinfo.oi_md->lsm_object_id = oinfo.oi_oa->o_id; } if (inode->i_size) { - oa->o_size = inode->i_size; - obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME | - OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLSIZE); + oinfo.oi_oa->o_size = inode->i_size; + obdo_from_inode(oinfo.oi_oa, inode, OBD_MD_FLTYPE | + OBD_MD_FLATIME | OBD_MD_FLMTIME | + OBD_MD_FLCTIME | OBD_MD_FLSIZE); /* pack lustre id to OST */ - oa->o_fid = body->fid1.id; - oa->o_generation = body->fid1.generation; - oa->o_valid |= OBD_MD_FLFID | OBD_MD_FLGENER; + oinfo.oi_oa->o_fid = body->fid1.id; + oinfo.oi_oa->o_generation = body->fid1.generation; + oinfo.oi_oa->o_valid |= OBD_MD_FLFID | OBD_MD_FLGENER; - rc = obd_setattr(mds->mds_osc_exp, oa, lsm, &oti); + rc = obd_setattr_rqset(mds->mds_osc_exp, &oinfo, &oti); if (rc) { CERROR("error setting attrs for inode %lu: rc %d\n", inode->i_ino, rc); if (rc > 0) { - CERROR("obd_setattr returned bad rc %d\n", rc); + CERROR("obd_setattr_async returned bad rc %d\n", + rc); rc = -EIO; } GOTO(out_oa, rc); @@ -456,11 +458,11 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, } body->valid |= OBD_MD_FLBLKSZ | OBD_MD_FLEASIZE; - obdo_refresh_inode(inode, oa, OBD_MD_FLBLKSZ); + obdo_refresh_inode(inode, oinfo.oi_oa, OBD_MD_FLBLKSZ); - LASSERT(lsm && lsm->lsm_object_id); + LASSERT(oinfo.oi_md && oinfo.oi_md->lsm_object_id); lmm = NULL; - rc = obd_packmd(mds->mds_osc_exp, &lmm, lsm); + rc = obd_packmd(mds->mds_osc_exp, &lmm, oinfo.oi_md); if (rc < 0) { CERROR("cannot pack lsm, err = %d\n", rc); GOTO(out_oa, rc); @@ -483,14 +485,14 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, obd_free_diskmd(mds->mds_osc_exp, &lmm); out_oa: oti_free_cookies(&oti); - obdo_free(oa); + obdo_free(oinfo.oi_oa); 
out_ids: if (rc) { OBD_FREE(*ids, mds->mds_lov_desc.ld_tgt_count * sizeof(**ids)); *ids = NULL; } - if (lsm) - obd_free_memmd(mds->mds_osc_exp, &lsm); + if (oinfo.oi_md) + obd_free_memmd(mds->mds_osc_exp, &oinfo.oi_md); RETURN(rc); } @@ -611,11 +613,11 @@ static void reconstruct_open(struct mds_update_record *rec, int offset, * Now that exp_outstanding_reply is a list, it's just using mfd != NULL * to detect a re-open */ if (mfd == NULL) { - if (rec->ur_flags & MDS_OPEN_JOIN_FILE) { - rc = mds_join_file(rec, req, dchild, NULL); - if (rc) - GOTO(out_dput, rc); - } + if (rec->ur_flags & MDS_OPEN_JOIN_FILE) { + rc = mds_join_file(rec, req, dchild, NULL); + if (rc) + GOTO(out_dput, rc); + } mntget(mds->mds_vfsmnt); CERROR("Re-opened file \n"); mfd = mds_dentry_open(dchild, mds->mds_vfsmnt, @@ -841,12 +843,12 @@ int mds_lock_new_child(struct obd_device *obd, struct inode *inode, if (child_lockh == NULL) child_lockh = &lockh; - rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, child_res_id, - LDLM_PLAIN, NULL, LCK_EX, &lock_flags, - ldlm_blocking_ast, ldlm_completion_ast, - NULL, NULL, NULL, 0, NULL, child_lockh); + rc = ldlm_cli_enqueue_local(obd->obd_namespace, child_res_id, + LDLM_PLAIN, NULL, LCK_EX, &lock_flags, + ldlm_blocking_ast, ldlm_completion_ast, + NULL, NULL, 0, NULL, child_lockh); if (rc != ELDLM_OK) - CERROR("ldlm_cli_enqueue: %d\n", rc); + CERROR("ldlm_cli_enqueue_local: %d\n", rc); else if (child_lockh == &lockh) ldlm_lock_decref(child_lockh, LCK_EX); @@ -1136,11 +1138,11 @@ found_child: child_res_id.name[0] = dchild->d_inode->i_ino; child_res_id.name[1] = dchild->d_inode->i_generation; - rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, - child_res_id, LDLM_IBITS, &policy, - child_mode, &lock_flags, - ldlm_blocking_ast, ldlm_completion_ast, - NULL, NULL, NULL, 0, NULL, child_lockh); + rc = ldlm_cli_enqueue_local(obd->obd_namespace, child_res_id, + LDLM_IBITS, &policy, child_mode, + &lock_flags, ldlm_blocking_ast, + ldlm_completion_ast, NULL, 
NULL, + 0, NULL, child_lockh); if (rc != ELDLM_OK) GOTO(cleanup, rc); diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c index 8b4b3c9..4347ba7 100644 --- a/lustre/mds/mds_reint.c +++ b/lustre/mds/mds_reint.c @@ -416,9 +416,8 @@ int mds_osc_setattr_async(struct obd_device *obd, struct inode *inode, struct llog_cookie *logcookies, struct ll_fid *fid) { struct mds_obd *mds = &obd->u.mds; - struct lov_stripe_md *lsm = NULL; struct obd_trans_info oti = { 0 }; - struct obdo *oa = NULL; + struct obd_info oinfo = { { { 0 } } }; int rc; ENTRY; @@ -426,48 +425,48 @@ int mds_osc_setattr_async(struct obd_device *obd, struct inode *inode, RETURN(0); /* first get memory EA */ - oa = obdo_alloc(); - if (!oa) + oinfo.oi_oa = obdo_alloc(); + if (!oinfo.oi_oa) RETURN(-ENOMEM); LASSERT(lmm); - rc = obd_unpackmd(mds->mds_osc_exp, &lsm, lmm, lmm_size); + rc = obd_unpackmd(mds->mds_osc_exp, &oinfo.oi_md, lmm, lmm_size); if (rc < 0) { CERROR("Error unpack md %p for inode %lu\n", lmm, inode->i_ino); GOTO(out, rc); } - rc = obd_checkmd(mds->mds_osc_exp, obd->obd_self_export, lsm); + rc = obd_checkmd(mds->mds_osc_exp, obd->obd_self_export, oinfo.oi_md); if (rc) { - CERROR("Error revalidate lsm %p \n", lsm); + CERROR("Error revalidate lsm %p \n", oinfo.oi_md); GOTO(out, rc); } /* then fill oa */ - oa->o_id = lsm->lsm_object_id; - oa->o_uid = inode->i_uid; - oa->o_gid = inode->i_gid; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLUID | OBD_MD_FLGID; + oinfo.oi_oa->o_id = oinfo.oi_md->lsm_object_id; + oinfo.oi_oa->o_uid = inode->i_uid; + oinfo.oi_oa->o_gid = inode->i_gid; + oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLUID | OBD_MD_FLGID; if (logcookies) { - oa->o_valid |= OBD_MD_FLCOOKIE; + oinfo.oi_oa->o_valid |= OBD_MD_FLCOOKIE; oti.oti_logcookies = logcookies; } LASSERT(fid != NULL); - oa->o_fid = fid->id; - oa->o_generation = fid->generation; - oa->o_valid |= OBD_MD_FLFID | OBD_MD_FLGENER; + oinfo.oi_oa->o_fid = fid->id; + oinfo.oi_oa->o_generation = fid->generation; + 
oinfo.oi_oa->o_valid |= OBD_MD_FLFID | OBD_MD_FLGENER; - /* do setattr from mds to ost asynchronously */ - rc = obd_setattr_async(mds->mds_osc_exp, oa, lsm, &oti); + /* do async setattr from mds to ost not waiting for responses. */ + rc = obd_setattr_async(mds->mds_osc_exp, &oinfo, &oti, NULL); if (rc) CDEBUG(D_INODE, "mds to ost setattr objid 0x"LPX64 - " on ost error %d\n", lsm->lsm_object_id, rc); + " on ost error %d\n", oinfo.oi_md->lsm_object_id, rc); out: - if (lsm) - obd_free_memmd(mds->mds_osc_exp, &lsm); - obdo_free(oa); + if (oinfo.oi_md) + obd_free_memmd(mds->mds_osc_exp, &oinfo.oi_md); + obdo_free(oinfo.oi_oa); RETURN(rc); } @@ -1041,10 +1040,11 @@ int enqueue_ordered_locks(struct obd_device *obd, struct ldlm_res_id *p1_res_id, res_id[0]->name[0], res_id[1]->name[0]); flags = LDLM_FL_LOCAL_ONLY; - rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, *res_id[0], - LDLM_IBITS, policies[0], lock_modes[0], &flags, - ldlm_blocking_ast, ldlm_completion_ast, - NULL, NULL, NULL, 0, NULL, handles[0]); + rc = ldlm_cli_enqueue_local(obd->obd_namespace, *res_id[0], + LDLM_IBITS, policies[0], lock_modes[0], + &flags, ldlm_blocking_ast, + ldlm_completion_ast, NULL, NULL, 0, + NULL, handles[0]); if (rc != ELDLM_OK) RETURN(-EIO); ldlm_lock_dump_handle(D_OTHER, handles[0]); @@ -1055,11 +1055,12 @@ int enqueue_ordered_locks(struct obd_device *obd, struct ldlm_res_id *p1_res_id, ldlm_lock_addref(handles[1], lock_modes[1]); } else if (res_id[1]->name[0] != 0) { flags = LDLM_FL_LOCAL_ONLY; - rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, - *res_id[1], LDLM_IBITS, policies[1], - lock_modes[1], &flags, - ldlm_blocking_ast, ldlm_completion_ast, - NULL, NULL, NULL, 0, NULL, handles[1]); + rc = ldlm_cli_enqueue_local(obd->obd_namespace, *res_id[1], + LDLM_IBITS, policies[1], + lock_modes[1], &flags, + ldlm_blocking_ast, + ldlm_completion_ast, NULL, NULL, + 0, NULL, handles[1]); if (rc != ELDLM_OK) { ldlm_lock_decref(handles[0], lock_modes[0]); RETURN(-EIO); @@ -1164,13 
+1165,13 @@ int enqueue_4ordered_locks(struct obd_device *obd,struct ldlm_res_id *p1_res_id, if (i < 3) try_to_aggregate_locks(res_id[i], policies[i], res_id[i+1], policies[i+1]); - rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, - *res_id[i], LDLM_IBITS, - policies[i], - lock_modes[i], &flags, - ldlm_blocking_ast, - ldlm_completion_ast, NULL, NULL, - NULL, 0, NULL, dlm_handles[i]); + rc = ldlm_cli_enqueue_local(obd->obd_namespace, + *res_id[i], LDLM_IBITS, + policies[i], lock_modes[i], + &flags, ldlm_blocking_ast, + ldlm_completion_ast, NULL, + NULL, 0, NULL, + dlm_handles[i]); if (rc != ELDLM_OK) GOTO(out_err, rc = -EIO); ldlm_lock_dump_handle(D_OTHER, dlm_handles[i]); @@ -1255,11 +1256,12 @@ static int mds_verify_child(struct obd_device *obd, GOTO(cleanup, rc = 1); } - rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, - *child_res_id, LDLM_IBITS, child_policy, - child_mode, &flags, ldlm_blocking_ast, - ldlm_completion_ast, NULL, NULL, NULL, 0, - NULL, child_lockh); + rc = ldlm_cli_enqueue_local(obd->obd_namespace, *child_res_id, + LDLM_IBITS, child_policy, + child_mode, &flags, + ldlm_blocking_ast, + ldlm_completion_ast, NULL, + NULL, 0, NULL, child_lockh); if (rc != ELDLM_OK) GOTO(cleanup, rc = -EIO); } else { diff --git a/lustre/mgc/mgc_request.c b/lustre/mgc/mgc_request.c index d3d922c..1dbd030 100644 --- a/lustre/mgc/mgc_request.c +++ b/lustre/mgc/mgc_request.c @@ -542,7 +542,6 @@ static int mgc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm, struct lustre_handle *lockh) { struct config_llog_data *cld = (struct config_llog_data *)data; - struct obd_device *obd = class_exp2obd(exp); int rc; ENTRY; @@ -556,10 +555,10 @@ static int mgc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm, /* We need a callback for every lockholder, so don't try to ldlm_lock_match (see rev 1.1.2.11.2.47) */ - rc = ldlm_cli_enqueue(exp, NULL, obd->obd_namespace, cld->cld_resid, + rc = ldlm_cli_enqueue(exp, NULL, cld->cld_resid, type, NULL, mode, flags, 
mgc_blocking_ast, ldlm_completion_ast, NULL, - data, NULL, 0, NULL, lockh); + data, NULL, 0, NULL, lockh, 0); RETURN(rc); } diff --git a/lustre/mgs/mgs_handler.c b/lustre/mgs/mgs_handler.c index f00a7d8..3575d0f 100644 --- a/lustre/mgs/mgs_handler.c +++ b/lustre/mgs/mgs_handler.c @@ -296,10 +296,11 @@ static int mgs_get_cfg_lock(struct obd_device *obd, char *fsname, rc = mgc_logname2resid(fsname, &res_id); if (!rc) - rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, res_id, - LDLM_PLAIN, NULL, LCK_EX, &flags, - ldlm_blocking_ast, ldlm_completion_ast, - NULL, fsname, NULL, 0, NULL, lockh); + rc = ldlm_cli_enqueue_local(obd->obd_namespace, res_id, + LDLM_PLAIN, NULL, LCK_EX, + &flags, ldlm_blocking_ast, + ldlm_completion_ast, NULL, + fsname, 0, NULL, lockh); if (rc) CERROR("can't take cfg lock for %s (%d)\n", fsname, rc); diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 5279bcd..bd36253 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -36,9 +36,9 @@ extern struct list_head obd_types; spinlock_t obd_types_lock; -cfs_mem_cache_t *obdo_cachep = NULL; +cfs_mem_cache_t *obdo_cachep; EXPORT_SYMBOL(obdo_cachep); -cfs_mem_cache_t *import_cachep = NULL; +cfs_mem_cache_t *import_cachep; int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index b3f0e1f..8686178 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -668,6 +668,7 @@ int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats) LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect); LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect); LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async); LPROCFS_OBD_OP_INIT(num_private_stats, stats, packmd); LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpackmd); LPROCFS_OBD_OP_INIT(num_private_stats, stats, 
checkmd); @@ -750,7 +751,13 @@ void lprocfs_free_obd_stats(struct obd_device *obd) int lprocfs_write_helper(const char *buffer, unsigned long count, int *val) { - char kernbuf[20], *end; + return lprocfs_write_frac_helper(buffer, count, val, 1); +} + +int lprocfs_write_frac_helper(const char *buffer, unsigned long count, + int *val, int mult) +{ + char kernbuf[20], *end, *pbuf; if (count > (sizeof(kernbuf) - 1)) return -EINVAL; @@ -759,33 +766,131 @@ int lprocfs_write_helper(const char *buffer, unsigned long count, return -EFAULT; kernbuf[count] = '\0'; + pbuf = kernbuf; + if (*pbuf == '-') { + mult = -mult; + pbuf++; + } - *val = simple_strtol(kernbuf, &end, 0); - if (kernbuf == end) + *val = (int)simple_strtoul(pbuf, &end, 10) * mult; + if (pbuf == end) return -EINVAL; + if (end != NULL && *end == '.') { + int temp_val, pow = 1; + int i; + + pbuf = end + 1; + if (strlen(pbuf) > 5) + pbuf[5] = '\0'; /*only allow 5bits fractional*/ + + temp_val = (int)simple_strtoul(pbuf, &end, 10) * mult; + + if (pbuf < end) { + for (i = 0; i < (end - pbuf); i++) + pow *= 10; + + *val += temp_val / pow; + } + } return 0; } +int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val, int mult) +{ + long decimal_val,frac_val; + int prtn; + + if (count < 10) + return -EINVAL; + + decimal_val =val / mult; + prtn = snprintf(buffer, count, "%ld", decimal_val); + frac_val = val % mult; + + if (prtn < (count - 4) && frac_val > 0) { + long temp_frac; + int i, temp_mult = 1, frac_bits = 0; + + temp_frac = frac_val * 10; + buffer[prtn++] = '.'; + while (frac_bits < 2 && (temp_frac / mult) < 1 ) { /*only reserved 2bits fraction*/ + buffer[prtn++] ='0'; + temp_frac *= 10; + frac_bits++; + } + /* + Need to think these cases : + 1. #echo x.00 > /proc/xxx output result : x + 2. #echo x.0x > /proc/xxx output result : x.0x + 3. #echo x.x0 > /proc/xxx output result : x.x + 4. #echo x.xx > /proc/xxx output result : x.xx + Only reserved 2bits fraction. 
+ */ + for (i = 0; i < (5 - prtn); i++) + temp_mult *= 10; + + frac_bits = min((int)count - prtn, 3 - frac_bits); + prtn += snprintf(buffer + prtn, frac_bits, "%ld", frac_val * temp_mult / mult); + + prtn--; + while(buffer[prtn] < '1' || buffer[prtn] > '9') { + prtn--; + if (buffer[prtn] == '.') { + prtn--; + break; + } + } + prtn++; + } + buffer[prtn++] ='\n'; + return prtn; +} + int lprocfs_write_u64_helper(const char *buffer, unsigned long count,__u64 *val) { - char kernbuf[22], *end; + return lprocfs_write_frac_u64_helper(buffer, count, val, 1); +} - if (count > (sizeof(kernbuf) - 1)) +int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count, + __u64 *val, int mult) +{ + char kernbuf[22], *end, *pbuf; + + if (count > (sizeof(kernbuf) - 1) ) return -EINVAL; if (copy_from_user(kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = '\0'; + pbuf = kernbuf; + if (*pbuf == '-') { + mult = -mult; + pbuf++; + } - if (kernbuf[0] == '-') - *val = -simple_strtoull(kernbuf + 1, &end, 0); - else - *val = simple_strtoull(kernbuf, &end, 0); - if (kernbuf == end) + *val = simple_strtoull(pbuf, &end, 10) * mult; + if (pbuf == end) return -EINVAL; + if (end != NULL && *end == '.') { + int temp_val; + int i, pow = 1; + + pbuf = end + 1; + if (strlen(pbuf) > 10) + pbuf[10] = '\0'; + + temp_val = (int)simple_strtoull(pbuf, &end, 10) * mult; + + if (pbuf < end) { + for (i = 0; i < (end - pbuf); i++) + pow *= 10; + + *val += (__u64)(temp_val / pow); + } + } return 0; } @@ -950,5 +1055,8 @@ EXPORT_SYMBOL(lprocfs_rd_filestotal); EXPORT_SYMBOL(lprocfs_rd_filesfree); EXPORT_SYMBOL(lprocfs_write_helper); +EXPORT_SYMBOL(lprocfs_write_frac_helper); +EXPORT_SYMBOL(lprocfs_read_frac_helper); EXPORT_SYMBOL(lprocfs_write_u64_helper); +EXPORT_SYMBOL(lprocfs_write_frac_u64_helper); #endif /* LPROCFS*/ diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index 5dcfd30..84dee3a 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ 
-663,8 +663,6 @@ int class_process_config(struct lustre_cfg *lcfg) CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n", obd_timeout, lcfg->lcfg_num); obd_timeout = max(lcfg->lcfg_num, 1U); - if (ldlm_timeout >= obd_timeout) - ldlm_timeout = max(obd_timeout / 3, 1U); GOTO(out, err = 0); } case LCFG_SET_UPCALL: { diff --git a/lustre/obdecho/echo.c b/lustre/obdecho/echo.c index e55d130..a4367a8 100644 --- a/lustre/obdecho/echo.c +++ b/lustre/obdecho/echo.c @@ -155,11 +155,10 @@ int echo_destroy(struct obd_export *exp, struct obdo *oa, RETURN(0); } -static int echo_getattr(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *md) +static int echo_getattr(struct obd_export *exp, struct obd_info *oinfo) { struct obd_device *obd = class_exp2obd(exp); - obd_id id = oa->o_id; + obd_id id = oinfo->oi_oa->o_id; ENTRY; if (!obd) { @@ -168,19 +167,20 @@ static int echo_getattr(struct obd_export *exp, struct obdo *oa, RETURN(-EINVAL); } - if (!(oa->o_valid & OBD_MD_FLID)) { - CERROR("obdo missing FLID valid flag: "LPX64"\n", oa->o_valid); + if (!(oinfo->oi_oa->o_valid & OBD_MD_FLID)) { + CERROR("obdo missing FLID valid flag: "LPX64"\n", + oinfo->oi_oa->o_valid); RETURN(-EINVAL); } - obdo_cpy_md(oa, &obd->u.echo.eo_oa, oa->o_valid); - oa->o_id = id; + obdo_cpy_md(oinfo->oi_oa, &obd->u.echo.eo_oa, oinfo->oi_oa->o_valid); + oinfo->oi_oa->o_id = id; RETURN(0); } -static int echo_setattr(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *md, struct obd_trans_info *oti) +static int echo_setattr(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti) { struct obd_device *obd = class_exp2obd(exp); @@ -191,14 +191,15 @@ static int echo_setattr(struct obd_export *exp, struct obdo *oa, RETURN(-EINVAL); } - if (!(oa->o_valid & OBD_MD_FLID)) { - CERROR("obdo missing FLID valid flag: "LPX64"\n", oa->o_valid); + if (!(oinfo->oi_oa->o_valid & OBD_MD_FLID)) { + CERROR("obdo missing FLID valid flag: "LPX64"\n", + oinfo->oi_oa->o_valid); 
RETURN(-EINVAL); } - memcpy(&obd->u.echo.eo_oa, oa, sizeof(*oa)); + memcpy(&obd->u.echo.eo_oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa)); - if (oa->o_id & 4) { + if (oinfo->oi_oa->o_id & 4) { /* Save lock to force ACKed reply */ ldlm_lock_addref (&obd->u.echo.eo_nl_lock, LCK_NL); oti->oti_ack_locks[0].mode = LCK_NL; @@ -472,10 +473,10 @@ static int echo_setup(struct obd_device *obd, obd_count len, void *buf) RETURN(-ENOMEM); } - rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, res_id, - LDLM_PLAIN, NULL, LCK_NL, &lock_flags, - NULL, ldlm_completion_ast, NULL, NULL, - NULL, 0, NULL, &obd->u.echo.eo_nl_lock); + rc = ldlm_cli_enqueue_local(obd->obd_namespace, res_id, LDLM_PLAIN, + NULL, LCK_NL, &lock_flags, NULL, + ldlm_completion_ast, NULL, NULL, + 0, NULL, &obd->u.echo.eo_nl_lock); LASSERT (rc == ELDLM_OK); lprocfs_init_vars(echo, &lvars); diff --git a/lustre/obdecho/echo_client.c b/lustre/obdecho/echo_client.c index 6e9073e..159a459 100644 --- a/lustre/obdecho/echo_client.c +++ b/lustre/obdecho/echo_client.c @@ -494,6 +494,7 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa, obd_size count, struct obd_trans_info *oti) { struct echo_client_obd *ec = &obd->u.echo_client; + struct obd_info oinfo = { { { 0 } } }; obd_count npages; struct brw_page *pga; struct brw_page *pgp; @@ -540,11 +541,13 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa, pgp->flag = 0; if (verify) - echo_client_page_debug_setup(lsm, pgp->pg, rw, + echo_client_page_debug_setup(lsm, pgp->pg, rw, oa->o_id, off, pgp->count); } - rc = obd_brw(rw, ec->ec_exp, oa, lsm, npages, pga, oti); + oinfo.oi_oa = oa; + oinfo.oi_md = lsm; + rc = obd_brw(rw, ec->ec_exp, &oinfo, npages, pga, oti); out: if (rc != 0 || rw != OBD_BRW_READ) @@ -575,6 +578,7 @@ static int echo_client_ubrw(struct obd_device *obd, int rw, struct obd_trans_info *oti) { struct echo_client_obd *ec = &obd->u.echo_client; + struct obd_info oinfo = { { { 0 } } }; obd_count npages; struct 
brw_page *pga; struct brw_page *pgp; @@ -583,8 +587,7 @@ static int echo_client_ubrw(struct obd_device *obd, int rw, int i; int rc; - LASSERT (rw == OBD_BRW_WRITE || - rw == OBD_BRW_READ); + LASSERT (rw == OBD_BRW_WRITE || rw == OBD_BRW_READ); /* NB: for now, only whole pages, page aligned */ @@ -622,7 +625,9 @@ static int echo_client_ubrw(struct obd_device *obd, int rw, pgp->flag = 0; } - rc = obd_brw(rw, ec->ec_exp, oa, lsm, npages, pga, oti); + oinfo.oi_oa = oa; + oinfo.oi_md = lsm; + rc = obd_brw(rw, ec->ec_exp, &oinfo, npages, pga, oti); // if (rw == OBD_BRW_READ) // mark_dirty_kiobuf (kiobuf, count); @@ -1085,9 +1090,10 @@ echo_client_enqueue(struct obd_export *exp, struct obdo *oa, struct obd_device *obd = exp->exp_obd; struct echo_client_obd *ec = &obd->u.echo_client; struct lustre_handle *ulh = obdo_handle (oa); + struct obd_enqueue_info einfo = { 0 }; + struct obd_info oinfo = { { { 0 } } }; struct ec_object *eco; struct ec_lock *ecl; - int flags; int rc; if (!(mode == LCK_PR || mode == LCK_PW)) @@ -1112,11 +1118,17 @@ echo_client_enqueue(struct obd_export *exp, struct obdo *oa, ecl->ecl_policy.l_extent.end = (nob == 0) ? 
((obd_off) -1) : (offset + nob - 1); - flags = 0; - rc = obd_enqueue(ec->ec_exp, eco->eco_lsm, LDLM_EXTENT, - &ecl->ecl_policy, mode, &flags, echo_ldlm_callback, - ldlm_completion_ast, NULL, eco, sizeof(struct ost_lvb), - lustre_swab_ost_lvb, &ecl->ecl_lock_handle); + einfo.ei_type = LDLM_EXTENT; + einfo.ei_mode = mode; + einfo.ei_cb_bl = echo_ldlm_callback; + einfo.ei_cb_cp = ldlm_completion_ast; + einfo.ei_cb_gl = NULL; + einfo.ei_cbdata = eco; + + oinfo.oi_policy = ecl->ecl_policy; + oinfo.oi_lockh = &ecl->ecl_lock_handle; + oinfo.oi_md = eco->eco_lsm; + rc = obd_enqueue(ec->ec_exp, &oinfo, &einfo); if (rc != 0) goto failed_1; @@ -1232,8 +1244,10 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, case OBD_IOC_GETATTR: rc = echo_get_object (&eco, obd, &data->ioc_obdo1); if (rc == 0) { - rc = obd_getattr(ec->ec_exp, &data->ioc_obdo1, - eco->eco_lsm); + struct obd_info oinfo = { { { 0 } } }; + oinfo.oi_md = eco->eco_lsm; + oinfo.oi_oa = &data->ioc_obdo1; + rc = obd_getattr(ec->ec_exp, &oinfo); echo_put_object(eco); } GOTO(out, rc); @@ -1244,8 +1258,11 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, rc = echo_get_object (&eco, obd, &data->ioc_obdo1); if (rc == 0) { - rc = obd_setattr(ec->ec_exp, &data->ioc_obdo1, - eco->eco_lsm, NULL); + struct obd_info oinfo = { { { 0 } } }; + oinfo.oi_oa = &data->ioc_obdo1; + oinfo.oi_md = eco->eco_lsm; + + rc = obd_setattr(ec->ec_exp, &oinfo, NULL); echo_put_object(eco); } GOTO(out, rc); diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 8a6e6ba..ad30426 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -514,7 +514,6 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) rc = filter_client_add(obd, filter, fed, cl_idx); LASSERTF(rc == 0, "rc = %d\n", rc); /* can't fail existing */ - fcd = NULL; exp->exp_replay_needed = 1; exp->exp_connecting = 0; @@ -527,7 +526,6 @@ static int filter_init_server_data(struct obd_device *obd, 
struct file * filp) if (last_rcvd > le64_to_cpu(fsd->lsd_last_transno)) fsd->lsd_last_transno = cpu_to_le64(last_rcvd); - } if (fcd) @@ -952,7 +950,7 @@ struct dentry *filter_parent_lock(struct obd_device *obd, obd_gr group, return dparent; rc = filter_lock_dentry(obd, dparent); - fsfilt_check_slow(now, obd_timeout, "parent lock"); + fsfilt_check_slow(obd, now, obd_timeout, "parent lock"); return rc ? ERR_PTR(rc) : dparent; } @@ -1033,10 +1031,10 @@ static int filter_prepare_destroy(struct obd_device *obd, obd_id objid) ENTRY; /* Tell the clients that the object is gone now and that they should * throw away any cached pages. */ - rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, res_id, - LDLM_EXTENT, &policy, LCK_PW, - &flags, ldlm_blocking_ast, ldlm_completion_ast, - NULL, NULL, NULL, 0, NULL, &lockh); + rc = ldlm_cli_enqueue_local(obd->obd_namespace, res_id, LDLM_EXTENT, + &policy, LCK_PW, &flags, ldlm_blocking_ast, + ldlm_completion_ast, NULL, NULL, 0, NULL, + &lockh); /* We only care about the side-effects, just drop the lock. 
*/ if (rc == ELDLM_OK) @@ -1752,9 +1750,9 @@ static int filter_cleanup(struct obd_device *obd) static int filter_connect_internal(struct obd_export *exp, struct obd_connect_data *data) { - if (!data) + if (!data) RETURN(0); - + CDEBUG(D_RPCTRACE, "%s: cli %s/%p ocd_connect_flags: "LPX64 " ocd_version: %x ocd_grant: %d\n", exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, @@ -2076,8 +2074,7 @@ struct dentry *__filter_oa2dentry(struct obd_device *obd, struct obdo *oa, return dchild; } -static int filter_getattr(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *md) +static int filter_getattr(struct obd_export *exp, struct obd_info *oinfo) { struct dentry *dentry = NULL; struct obd_device *obd; @@ -2090,13 +2087,13 @@ static int filter_getattr(struct obd_export *exp, struct obdo *oa, RETURN(-EINVAL); } - dentry = filter_oa2dentry(obd, oa); + dentry = filter_oa2dentry(obd, oinfo->oi_oa); if (IS_ERR(dentry)) RETURN(PTR_ERR(dentry)); /* Limit the valid bits in the return data to what we actually use */ - oa->o_valid = OBD_MD_FLID; - obdo_from_inode(oa, dentry->d_inode, FILTER_VALID_FLAGS); + oinfo->oi_oa->o_valid = OBD_MD_FLID; + obdo_from_inode(oinfo->oi_oa, dentry->d_inode, FILTER_VALID_FLAGS); f_dput(dentry); RETURN(rc); @@ -2260,17 +2257,17 @@ out_unlock: unsigned int cur_ids[MAXQUOTAS] = {oa->o_uid, oa->o_gid}; int rc2 = lquota_adjust(quota_interface, exp->exp_obd, cur_ids, orig_ids, rc, FSFILT_OP_SETATTR); - CDEBUG(rc2 ? D_ERROR : D_QUOTA, + CDEBUG(rc2 ? D_ERROR : D_QUOTA, "filter adjust qunit. 
(rc:%d)\n", rc2); } return rc; } /* this is called from filter_truncate() until we have filter_punch() */ -int filter_setattr(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *md, struct obd_trans_info *oti) +int filter_setattr(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti) { - struct ldlm_res_id res_id = { .name = { oa->o_id } }; + struct ldlm_res_id res_id = { .name = { oinfo->oi_oa->o_id } }; struct ldlm_valblock_ops *ns_lvbo; struct lvfs_run_ctxt saved; struct filter_obd *filter; @@ -2279,7 +2276,8 @@ int filter_setattr(struct obd_export *exp, struct obdo *oa, int rc; ENTRY; - dentry = __filter_oa2dentry(exp->exp_obd, oa, __FUNCTION__, 1); + dentry = __filter_oa2dentry(exp->exp_obd, oinfo->oi_oa, + __FUNCTION__, 1); if (IS_ERR(dentry)) RETURN(PTR_ERR(dentry)); @@ -2288,7 +2286,7 @@ int filter_setattr(struct obd_export *exp, struct obdo *oa, lock_kernel(); /* setting objects attributes (including owner/group) */ - rc = filter_setattr_internal(exp, dentry, oa, oti); + rc = filter_setattr_internal(exp, dentry, oinfo->oi_oa, oti); if (rc) GOTO(out_unlock, rc); @@ -2302,10 +2300,10 @@ int filter_setattr(struct obd_export *exp, struct obdo *oa, ldlm_resource_putref(res); } - oa->o_valid = OBD_MD_FLID; + oinfo->oi_oa->o_valid = OBD_MD_FLID; /* Quota release need uid/gid info */ - obdo_from_inode(oa, dentry->d_inode, + obdo_from_inode(oinfo->oi_oa, dentry->d_inode, FILTER_VALID_FLAGS | OBD_MD_FLUID | OBD_MD_FLGID); EXIT; @@ -2487,7 +2485,7 @@ static int filter_statfs(struct obd_device *obd, struct obd_statfs *osfs, /* set EROFS to state field if FS is mounted as RDONLY. The goal is to * stop creating files on MDS if OST is not good shape to create * objects.*/ - osfs->os_state = (filter->fo_obt.obt_sb->s_flags & MS_RDONLY) ? + osfs->os_state = (filter->fo_obt.obt_sb->s_flags & MS_RDONLY) ? 
EROFS : 0; RETURN(rc); } @@ -2753,8 +2751,8 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa, fcc = obdo_logcookie(oa); llog_cancel(llog_get_context(obd, fcc->lgc_subsys + 1), NULL, 1, fcc, 0); + fcc = NULL; /* we didn't allocate fcc, don't free it */ } - fcc = NULL; GOTO(cleanup, rc = -ENOENT); } @@ -2850,31 +2848,32 @@ cleanup: qcids[USRQUOTA] = oa->o_uid; qcids[GRPQUOTA] = oa->o_gid; rc2 = lquota_adjust(quota_interface, obd, qcids, NULL, rc, - FSFILT_OP_UNLINK); - CDEBUG(rc2 ? D_ERROR : D_QUOTA, + FSFILT_OP_UNLINK); + CDEBUG(rc2 ? D_ERROR : D_QUOTA, "filter adjust qunit! (rc:%d)\n", rc2); return rc; } /* NB start and end are used for punch, but not truncate */ -static int filter_truncate(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *lsm, obd_off start, - obd_off end, struct obd_trans_info *oti) +static int filter_truncate(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset) { int rc; ENTRY; - if (end != OBD_OBJECT_EOF) { + if (oinfo->oi_policy.l_extent.end != OBD_OBJECT_EOF) { CERROR("PUNCH not supported, only truncate: end = "LPX64"\n", - end); + oinfo->oi_policy.l_extent.end); RETURN(-EFAULT); } CDEBUG(D_INODE, "calling truncate for object "LPU64", valid = "LPX64 - ", o_size = "LPD64"\n", oa->o_id, oa->o_valid, start); - - oa->o_size = start; - rc = filter_setattr(exp, oa, NULL, oti); + ", o_size = "LPD64"\n", oinfo->oi_oa->o_id, + oinfo->oi_oa->o_valid, oinfo->oi_policy.l_extent.start); + + oinfo->oi_oa->o_size = oinfo->oi_policy.l_extent.start; + rc = filter_setattr(exp, oinfo, oti); RETURN(rc); } @@ -2994,7 +2993,7 @@ static int filter_set_info_async(struct obd_export *exp, __u32 keylen, /* setup llog imports */ ctxt = llog_get_context(obd, LLOG_MDS_OST_REPL_CTXT); rc = llog_receptor_accept(ctxt, exp->exp_imp_reverse); - + lquota_setinfo(quota_interface, exp, obd); RETURN(rc); @@ -3188,11 +3187,11 @@ static int __init obdfilter_init(void) out: if (quota_interface) 
PORTAL_SYMBOL_PUT(filter_quota_interface); - + OBD_FREE(obdfilter_created_scratchpad, OBDFILTER_CREATED_SCRATCHPAD_ENTRIES * sizeof(*obdfilter_created_scratchpad)); - } + } return rc; } diff --git a/lustre/obdfilter/filter_internal.h b/lustre/obdfilter/filter_internal.h index 81d9406..6361fdb 100644 --- a/lustre/obdfilter/filter_internal.h +++ b/lustre/obdfilter/filter_internal.h @@ -86,8 +86,8 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa, struct obd_export *); int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry, struct obdo *oa, struct obd_trans_info *oti); -int filter_setattr(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *md, struct obd_trans_info *oti); +int filter_setattr(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti); struct dentry *filter_create_object(struct obd_device *obd, struct obdo *oa); @@ -102,9 +102,8 @@ int filter_preprw(int cmd, struct obd_export *, struct obdo *, int objcount, int filter_commitrw(int cmd, struct obd_export *, struct obdo *, int objcount, struct obd_ioobj *, int niocount, struct niobuf_local *, struct obd_trans_info *, int rc); -int filter_brw(int cmd, struct obd_export *, struct obdo *, - struct lov_stripe_md *, obd_count oa_bufs, struct brw_page *, - struct obd_trans_info *); +int filter_brw(int cmd, struct obd_export *, struct obd_info *oinfo, + obd_count oa_bufs, struct brw_page *pga, struct obd_trans_info *); void flip_into_page_cache(struct inode *inode, struct page *new_page); /* filter_io_*.c */ diff --git a/lustre/obdfilter/filter_io.c b/lustre/obdfilter/filter_io.c index d1c7a2d..3bb0bb8 100644 --- a/lustre/obdfilter/filter_io.c +++ b/lustre/obdfilter/filter_io.c @@ -312,7 +312,7 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa, inode = dentry->d_inode; obdo_to_inode(inode, oa, OBD_MD_FLATIME); - fsfilt_check_slow(now, obd_timeout, "preprw_read setup"); + fsfilt_check_slow(obd, now, obd_timeout, 
"preprw_read setup"); for (i = 0, lnb = res, rnb = nb; i < obj->ioo_bufcnt; i++, rnb++, lnb++) { @@ -345,7 +345,7 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa, filter_iobuf_add_page(obd, iobuf, inode, lnb->page); } - fsfilt_check_slow(now, obd_timeout, "start_page_read"); + fsfilt_check_slow(obd, now, obd_timeout, "start_page_read"); rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf, exp, NULL, NULL, NULL); @@ -539,7 +539,7 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa, fso.fso_dentry = dentry; fso.fso_bufcnt = obj->ioo_bufcnt; - fsfilt_check_slow(now, obd_timeout, "preprw_write setup"); + fsfilt_check_slow(exp->exp_obd, now, obd_timeout, "preprw_write setup"); spin_lock(&exp->exp_obd->obd_osfs_lock); if (oa) { @@ -630,7 +630,7 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa, rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf, exp, NULL, NULL, NULL); - fsfilt_check_slow(now, obd_timeout, "start_page_write"); + fsfilt_check_slow(exp->exp_obd, now, obd_timeout, "start_page_write"); lprocfs_counter_add(exp->exp_obd->obd_stats, LPROC_FILTER_WRITE_BYTES, tot_bytes); @@ -810,9 +810,9 @@ int filter_commitrw(int cmd, struct obd_export *exp, struct obdo *oa, return -EPROTO; } -int filter_brw(int cmd, struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *lsm, obd_count oa_bufs, - struct brw_page *pga, struct obd_trans_info *oti) +int filter_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo, + obd_count oa_bufs, struct brw_page *pga, + struct obd_trans_info *oti) { struct obd_ioobj ioo; struct niobuf_local *lnb; @@ -833,14 +833,16 @@ int filter_brw(int cmd, struct obd_export *exp, struct obdo *oa, rnb[i].len = pga[i].count; } - obdo_to_ioobj(oa, &ioo); + obdo_to_ioobj(oinfo->oi_oa, &ioo); ioo.ioo_bufcnt = oa_bufs; - ret = filter_preprw(cmd, exp, oa, 1, &ioo, oa_bufs, rnb, lnb, oti); + ret = filter_preprw(cmd, exp, oinfo->oi_oa, 1, &ioo, + oa_bufs, 
rnb, lnb, oti); if (ret != 0) GOTO(out, ret); - ret = filter_commitrw(cmd, exp, oa, 1, &ioo, oa_bufs, lnb, oti, ret); + ret = filter_commitrw(cmd, exp, oinfo->oi_oa, 1, &ioo, + oa_bufs, lnb, oti, ret); out: if (lnb) diff --git a/lustre/obdfilter/filter_io_24.c b/lustre/obdfilter/filter_io_24.c index 6766190..0e1a84e 100644 --- a/lustre/obdfilter/filter_io_24.c +++ b/lustre/obdfilter/filter_io_24.c @@ -424,7 +424,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount, GOTO(cleanup, rc); } - fsfilt_check_slow(now, obd_timeout, "brw_start"); + fsfilt_check_slow(obd, now, obd_timeout, "brw_start"); i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME; @@ -460,7 +460,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount, if (rc == 0) obdo_from_inode(oa, inode, FILTER_VALID_FLAGS); - fsfilt_check_slow(now, obd_timeout, "direct_io"); + fsfilt_check_slow(obd, now, obd_timeout, "direct_io"); err = fsfilt_commit_wait(obd, inode, wait_handle); if (err) { @@ -471,7 +471,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount, LASSERTF(oti->oti_transno <= obd->obd_last_committed, "oti_transno "LPU64" last_committed "LPU64"\n", oti->oti_transno, obd->obd_last_committed); - fsfilt_check_slow(now, obd_timeout, "commitrw commit"); + fsfilt_check_slow(obd, now, obd_timeout, "commitrw commit"); cleanup: filter_grant_commit(exp, niocount, res); diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c index 5c57e80..d4528fd 100644 --- a/lustre/obdfilter/filter_io_26.c +++ b/lustre/obdfilter/filter_io_26.c @@ -598,7 +598,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, cleanup_phase = 2; LOCK_INODE_MUTEX(inode); - fsfilt_check_slow(now, obd_timeout, "i_mutex"); + fsfilt_check_slow(obd, now, obd_timeout, "i_mutex"); oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, res, oti); if (IS_ERR(oti->oti_handle)) { @@ -611,7 +611,7 @@ int 
filter_commitrw_write(struct obd_export *exp, struct obdo *oa, } /* have to call fsfilt_commit() from this point on */ - fsfilt_check_slow(now, obd_timeout, "brw_start"); + fsfilt_check_slow(obd, now, obd_timeout, "brw_start"); i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME; @@ -639,6 +639,13 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, iattr.ia_mode &= ~S_ISGID; rc = filter_update_fidea(exp, inode, oti->oti_handle, oa); + + /* To avoid problems with quotas, UID and GID must be set + * in the inode before filter_direct_io() - see bug 10357. */ + if (iattr.ia_valid & ATTR_UID) + inode->i_uid = iattr.ia_uid; + if (iattr.ia_valid & ATTR_GID) + inode->i_gid = iattr.ia_gid; } /* filter_direct_io drops i_mutex */ @@ -652,7 +659,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, lquota_getflag(quota_interface, obd, oa); - fsfilt_check_slow(now, obd_timeout, "direct_io"); + fsfilt_check_slow(obd, now, obd_timeout, "direct_io"); err = fsfilt_commit_wait(obd, inode, wait_handle); if (err) { @@ -665,7 +672,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, "oti_transno "LPU64" last_committed "LPU64"\n", oti->oti_transno, obd->obd_last_committed); - fsfilt_check_slow(now, obd_timeout, "commitrw commit"); + fsfilt_check_slow(obd, now, obd_timeout, "commitrw commit"); cleanup: filter_grant_commit(exp, niocount, res); diff --git a/lustre/obdfilter/filter_log.c b/lustre/obdfilter/filter_log.c index c61be24..001a0ad 100644 --- a/lustre/obdfilter/filter_log.c +++ b/lustre/obdfilter/filter_log.c @@ -162,25 +162,25 @@ static int filter_recov_log_setattr_cb(struct llog_ctxt *ctxt, struct obd_device *obd = ctxt->loc_obd; struct obd_export *exp = obd->obd_self_export; struct llog_setattr_rec *lsr; - struct obdo *oa; + struct obd_info oinfo = { { { 0 } } }; obd_id oid; int rc = 0; ENTRY; lsr = (struct llog_setattr_rec *)rec; - oa = obdo_alloc(); - - oa->o_valid |= (OBD_MD_FLID | OBD_MD_FLUID | OBD_MD_FLGID | - 
OBD_MD_FLCOOKIE); - oa->o_id = lsr->lsr_oid; - oa->o_gr = lsr->lsr_ogen; - oa->o_uid = lsr->lsr_uid; - oa->o_gid = lsr->lsr_gid; - memcpy(obdo_logcookie(oa), cookie, sizeof(*cookie)); - oid = oa->o_id; - - rc = filter_setattr(exp, oa, NULL, NULL); - obdo_free(oa); + oinfo.oi_oa = obdo_alloc(); + + oinfo.oi_oa->o_valid |= (OBD_MD_FLID | OBD_MD_FLUID | OBD_MD_FLGID | + OBD_MD_FLCOOKIE); + oinfo.oi_oa->o_id = lsr->lsr_oid; + oinfo.oi_oa->o_gr = lsr->lsr_ogen; + oinfo.oi_oa->o_uid = lsr->lsr_uid; + oinfo.oi_oa->o_gid = lsr->lsr_gid; + memcpy(obdo_logcookie(oinfo.oi_oa), cookie, sizeof(*cookie)); + oid = oinfo.oi_oa->o_id; + + rc = filter_setattr(exp, &oinfo, NULL); + obdo_free(oinfo.oi_oa); if (rc == -ENOENT) { CDEBUG(D_HA, "object already removed, send cookie\n"); diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c index 764c55c..0df7dfe 100644 --- a/lustre/osc/lproc_osc.c +++ b/lustre/osc/lproc_osc.c @@ -111,13 +111,15 @@ static int osc_rd_max_dirty_mb(char *page, char **start, off_t off, int count, { struct obd_device *dev = data; struct client_obd *cli = &dev->u.cli; - unsigned val; + long val; + int mult; client_obd_list_lock(&cli->cl_loi_list_lock); - val = cli->cl_dirty_max >> 20; + val = cli->cl_dirty_max; client_obd_list_unlock(&cli->cl_loi_list_lock); - return snprintf(page, count, "%u\n", val); + mult = 1 << 20; + return lprocfs_read_frac_helper(page, count, val, mult); } static int osc_wr_max_dirty_mb(struct file *file, const char *buffer, @@ -125,18 +127,19 @@ static int osc_wr_max_dirty_mb(struct file *file, const char *buffer, { struct obd_device *dev = data; struct client_obd *cli = &dev->u.cli; - int val, rc; + int pages_number, mult, rc; - rc = lprocfs_write_helper(buffer, count, &val); + mult = 1 << (20 - PAGE_SHIFT); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); if (rc) return rc; - if (val < 0 || val > OSC_MAX_DIRTY_MB_MAX || - val > num_physpages >> (20 - PAGE_SHIFT - 2)) /*
1/4 of RAM */ + if (pages_number < 0 || pages_number > OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_SHIFT) || + pages_number > num_physpages / 4) /* 1/4 of RAM */ return -ERANGE; client_obd_list_lock(&cli->cl_loi_list_lock); - cli->cl_dirty_max = (obd_count)val * 1024 * 1024; + cli->cl_dirty_max = (obd_count)(pages_number << PAGE_SHIFT); osc_wake_cache_waiters(cli); client_obd_list_unlock(&cli->cl_loi_list_lock); diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 328cf70..50ae0bb 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -141,40 +141,40 @@ static int osc_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, } static int osc_getattr_interpret(struct ptlrpc_request *req, - struct osc_getattr_async_args *aa, int rc) + struct osc_async_args *aa, int rc) { struct ost_body *body; ENTRY; if (rc != 0) - RETURN(rc); + GOTO(out, rc); body = lustre_swab_repbuf(req, REPLY_REC_OFF, sizeof(*body), lustre_swab_ost_body); if (body) { CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); - memcpy(aa->aa_oa, &body->oa, sizeof(*aa->aa_oa)); + memcpy(aa->aa_oi->oi_oa, &body->oa, sizeof(*aa->aa_oi->oi_oa)); /* This should really be sent by the OST */ - aa->aa_oa->o_blksize = PTLRPC_MAX_BRW_SIZE; - aa->aa_oa->o_valid |= OBD_MD_FLBLKSZ; + aa->aa_oi->oi_oa->o_blksize = PTLRPC_MAX_BRW_SIZE; + aa->aa_oi->oi_oa->o_valid |= OBD_MD_FLBLKSZ; } else { CERROR("can't unpack ost_body\n"); rc = -EPROTO; - aa->aa_oa->o_valid = 0; + aa->aa_oi->oi_oa->o_valid = 0; } - +out: + rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc); RETURN(rc); } -static int osc_getattr_async(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *md, +static int osc_getattr_async(struct obd_export *exp, struct obd_info *oinfo, struct ptlrpc_request_set *set) { struct ptlrpc_request *req; struct ost_body *body; int size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) }; - struct osc_getattr_async_args *aa; + struct osc_async_args *aa; ENTRY; req = 
ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION, @@ -183,21 +183,20 @@ static int osc_getattr_async(struct obd_export *exp, struct obdo *oa, RETURN(-ENOMEM); body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); - memcpy(&body->oa, oa, sizeof(*oa)); + memcpy(&body->oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa)); ptlrpc_req_set_repsize(req, 2, size); req->rq_interpret_reply = osc_getattr_interpret; LASSERT (sizeof (*aa) <= sizeof (req->rq_async_args)); - aa = (struct osc_getattr_async_args *)&req->rq_async_args; - aa->aa_oa = oa; + aa = (struct osc_async_args *)&req->rq_async_args; + aa->aa_oi = oinfo; ptlrpc_set_add_req(set, req); RETURN (0); } -static int osc_getattr(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *md) +static int osc_getattr(struct obd_export *exp, struct obd_info *oinfo) { struct ptlrpc_request *req; struct ost_body *body; @@ -210,7 +209,7 @@ static int osc_getattr(struct obd_export *exp, struct obdo *oa, RETURN(-ENOMEM); body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); - memcpy(&body->oa, oa, sizeof(*oa)); + memcpy(&body->oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa)); ptlrpc_req_set_repsize(req, 2, size); @@ -228,11 +227,11 @@ static int osc_getattr(struct obd_export *exp, struct obdo *oa, } CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); - memcpy(oa, &body->oa, sizeof(*oa)); + memcpy(oinfo->oi_oa, &body->oa, sizeof(*oinfo->oi_oa)); /* This should really be sent by the OST */ - oa->o_blksize = PTLRPC_MAX_BRW_SIZE; - oa->o_valid |= OBD_MD_FLBLKSZ; + oinfo->oi_oa->o_blksize = PTLRPC_MAX_BRW_SIZE; + oinfo->oi_oa->o_valid |= OBD_MD_FLBLKSZ; EXIT; out: @@ -240,8 +239,8 @@ static int osc_getattr(struct obd_export *exp, struct obdo *oa, return rc; } -static int osc_setattr(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *md, struct obd_trans_info *oti) +static int osc_setattr(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti) { struct ptlrpc_request *req; struct 
ost_body *body; @@ -254,7 +253,7 @@ static int osc_setattr(struct obd_export *exp, struct obdo *oa, RETURN(-ENOMEM); body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); - memcpy(&body->oa, oa, sizeof(*oa)); + memcpy(&body->oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa)); ptlrpc_req_set_repsize(req, 2, size); @@ -267,24 +266,45 @@ static int osc_setattr(struct obd_export *exp, struct obdo *oa, if (body == NULL) GOTO(out, rc = -EPROTO); - memcpy(oa, &body->oa, sizeof(*oa)); + memcpy(oinfo->oi_oa, &body->oa, sizeof(*oinfo->oi_oa)); EXIT; out: ptlrpc_req_finished(req); - RETURN(0); + RETURN(rc); } -static int osc_setattr_async(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *md, - struct obd_trans_info *oti) +static int osc_setattr_interpret(struct ptlrpc_request *req, + struct osc_async_args *aa, int rc) { - struct ptlrpc_request *req; struct ost_body *body; - int rc = 0, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) }; ENTRY; - LASSERT(oti); + if (rc != 0) + GOTO(out, rc); + + body = lustre_swab_repbuf(req, REPLY_REC_OFF, sizeof(*body), + lustre_swab_ost_body); + if (body == NULL) { + CERROR("can't unpack ost_body\n"); + GOTO(out, rc = -EPROTO); + } + + memcpy(aa->aa_oi->oi_oa, &body->oa, sizeof(*aa->aa_oi->oi_oa)); +out: + rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc); + RETURN(rc); +} + +static int osc_setattr_async(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct ost_body *body; + int size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) }; + struct osc_async_args *aa; + ENTRY; req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION, OST_SETATTR, 2, size, NULL); @@ -293,16 +313,29 @@ static int osc_setattr_async(struct obd_export *exp, struct obdo *oa, body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); - if (oa->o_valid & OBD_MD_FLCOOKIE) - memcpy(obdo_logcookie(oa), oti->oti_logcookies, + if 
(oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) { + LASSERT(oti); + memcpy(obdo_logcookie(oinfo->oi_oa), oti->oti_logcookies, sizeof(*oti->oti_logcookies)); + } - memcpy(&body->oa, oa, sizeof(*oa)); + memcpy(&body->oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa)); ptlrpc_req_set_repsize(req, 2, size); /* do mds to ost setattr asynchronouly */ - ptlrpcd_add_req(req); + if (!rqset) { + /* Do not wait for response. */ + ptlrpcd_add_req(req); + } else { + req->rq_interpret_reply = osc_setattr_interpret; - RETURN(rc); + LASSERT (sizeof (*aa) <= sizeof (req->rq_async_args)); + aa = (struct osc_async_args *)&req->rq_async_args; + aa->aa_oi = oinfo; + + ptlrpc_set_add_req(rqset, req); + } + + RETURN(0); } int osc_real_create(struct obd_export *exp, struct obdo *oa, @@ -388,16 +421,39 @@ out: return rc; } -static int osc_punch(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *md, obd_size start, - obd_size end, struct obd_trans_info *oti) +static int osc_punch_interpret(struct ptlrpc_request *req, + struct osc_async_args *aa, int rc) +{ + struct ost_body *body; + ENTRY; + + if (rc != 0) + GOTO(out, rc); + + body = lustre_swab_repbuf(req, REPLY_REC_OFF, sizeof (*body), + lustre_swab_ost_body); + if (body == NULL) { + CERROR ("can't unpack ost_body\n"); + GOTO(out, rc = -EPROTO); + } + + memcpy(aa->aa_oi->oi_oa, &body->oa, sizeof(*aa->aa_oi->oi_oa)); +out: + rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc); + RETURN(rc); +} + +static int osc_punch(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset) { struct ptlrpc_request *req; + struct osc_async_args *aa; struct ost_body *body; - int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) }; + int size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) }; ENTRY; - if (!oa) { + if (!oinfo->oi_oa) { CERROR("oa NULL\n"); RETURN(-EINVAL); } @@ -408,32 +464,22 @@ static int osc_punch(struct obd_export *exp, struct obdo *oa, RETURN(-ENOMEM); body = 
lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); - memcpy(&body->oa, oa, sizeof(*oa)); + memcpy(&body->oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa)); /* overload the size and blocks fields in the oa with start/end */ - body->oa.o_size = start; - body->oa.o_blocks = end; + body->oa.o_size = oinfo->oi_policy.l_extent.start; + body->oa.o_blocks = oinfo->oi_policy.l_extent.end; body->oa.o_valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); ptlrpc_req_set_repsize(req, 2, size); - rc = ptlrpc_queue_wait(req); - if (rc) - GOTO(out, rc); - - body = lustre_swab_repbuf(req, REPLY_REC_OFF, sizeof(*body), - lustre_swab_ost_body); - if (body == NULL) { - CERROR ("can't unpack ost_body\n"); - GOTO (out, rc = -EPROTO); - } - - memcpy(oa, &body->oa, sizeof(*oa)); + req->rq_interpret_reply = osc_punch_interpret; + LASSERT (sizeof (*aa) <= sizeof (req->rq_async_args)); + aa = (struct osc_async_args *)&req->rq_async_args; + aa->aa_oi = oinfo; + ptlrpc_set_add_req(rqset, req); - EXIT; - out: - ptlrpc_req_finished(req); - return rc; + RETURN(0); } static int osc_sync(struct obd_export *exp, struct obdo *oa, @@ -483,13 +529,23 @@ static int osc_sync(struct obd_export *exp, struct obdo *oa, return rc; } +/* Destroy requests can be async always on the client, and we don't even really + * care about the return code since the client cannot do anything at all about + * a destroy failure. + * When the MDS is unlinking a filename, it saves the file objects into a + * recovery llog, and these object records are cancelled when the OST reports + * they were destroyed and sync'd to disk (i.e. transaction committed). + * If the client dies, or the OST is down when the object should be destroyed, + * the records are not cancelled, and when the OST reconnects to the MDS next, + * it will retrieve the llog unlink logs and then sends the log cancellation + * cookies to the MDS after committing destroy transactions. 
*/ static int osc_destroy(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md *ea, struct obd_trans_info *oti, struct obd_export *md_export) { struct ptlrpc_request *req; struct ost_body *body; - int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) }; + int size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) }; ENTRY; if (!oa) { @@ -507,31 +563,13 @@ static int osc_destroy(struct obd_export *exp, struct obdo *oa, if (oti != NULL && oa->o_valid & OBD_MD_FLCOOKIE) { memcpy(obdo_logcookie(oa), oti->oti_logcookies, sizeof(*oti->oti_logcookies)); - oti->oti_logcookies++; } memcpy(&body->oa, oa, sizeof(*oa)); ptlrpc_req_set_repsize(req, 2, size); - rc = ptlrpc_queue_wait(req); - if (rc == -ENOENT) - rc = 0; - if (rc) - GOTO(out, rc); - - body = lustre_swab_repbuf(req, REPLY_REC_OFF, sizeof(*body), - lustre_swab_ost_body); - if (body == NULL) { - CERROR ("Can't unpack body\n"); - GOTO (out, rc = -EPROTO); - } - - memcpy(oa, &body->oa, sizeof(*oa)); - - EXIT; - out: - ptlrpc_req_finished(req); - return rc; + ptlrpcd_add_req(req); + RETURN(0); } static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, @@ -1087,6 +1125,17 @@ static int async_internal(int cmd, struct obd_export *exp, struct obdo *oa, int rc; ENTRY; + /* Consume write credits even if doing a sync write - + * otherwise we may run out of space on OST due to grant. 
*/ + spin_lock(&exp->exp_obd->u.cli.cl_loi_list_lock); + for (nio_count = 0; nio_count < page_count; nio_count++) { + if (exp->exp_obd->u.cli.cl_avail_grant >= PAGE_SIZE) { + exp->exp_obd->u.cli.cl_avail_grant -= PAGE_SIZE; + pga[nio_count]->flag |= OBD_BRW_FROM_GRANT; + } + } + spin_unlock(&exp->exp_obd->u.cli.cl_loi_list_lock); + rc = osc_brw_prep_request(cmd, class_exp2cliimp(exp), oa, lsm, page_count, pga, &requested_nob, &nio_count, &req); @@ -1183,9 +1232,9 @@ static void osc_release_ppga(struct brw_page **ppga, obd_count count) OBD_FREE(ppga, sizeof(*ppga) * count); } -static int osc_brw(int cmd, struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *md, obd_count page_count, - struct brw_page *pga, struct obd_trans_info *oti) +static int osc_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo, + obd_count page_count, struct brw_page *pga, + struct obd_trans_info *oti) { struct obdo *saved_oa = NULL; struct brw_page **ppga, **orig; @@ -1209,6 +1258,7 @@ static int osc_brw(int cmd, struct obd_export *exp, struct obdo *oa, RETURN(-ENOMEM); page_count_orig = page_count; + sort_brw_pages(ppga, page_count); while (page_count) { obd_count pages_per_brw; @@ -1217,21 +1267,21 @@ static int osc_brw(int cmd, struct obd_export *exp, struct obdo *oa, else pages_per_brw = page_count; - sort_brw_pages(ppga, pages_per_brw); pages_per_brw = max_unfragmented_pages(ppga, pages_per_brw); if (saved_oa != NULL) { /* restore previously saved oa */ - *oa = *saved_oa; + *oinfo->oi_oa = *saved_oa; } else if (page_count > pages_per_brw) { /* save a copy of oa (brw will clobber it) */ saved_oa = obdo_alloc(); if (saved_oa == NULL) GOTO(out, rc = -ENOMEM); - *saved_oa = *oa; + *saved_oa = *oinfo->oi_oa; } - rc = osc_brw_internal(cmd, exp, oa, md, pages_per_brw, ppga); + rc = osc_brw_internal(cmd, exp, oinfo->oi_oa, oinfo->oi_md, + pages_per_brw, ppga); if (rc != 0) break; @@ -1249,10 +1299,10 @@ out: RETURN(rc); } -static int osc_brw_async(int cmd, struct obd_export 
*exp, struct obdo *oa, - struct lov_stripe_md *md, obd_count page_count, - struct brw_page *pga, struct ptlrpc_request_set *set, - struct obd_trans_info *oti) +static int osc_brw_async(int cmd, struct obd_export *exp, + struct obd_info *oinfo, obd_count page_count, + struct brw_page *pga, struct obd_trans_info *oti, + struct ptlrpc_request_set *set) { struct brw_page **ppga, **orig; int page_count_orig; @@ -1274,6 +1324,7 @@ static int osc_brw_async(int cmd, struct obd_export *exp, struct obdo *oa, RETURN(-ENOMEM); page_count_orig = page_count; + sort_brw_pages(ppga, page_count); while (page_count) { obd_count pages_per_brw; @@ -1282,10 +1333,10 @@ static int osc_brw_async(int cmd, struct obd_export *exp, struct obdo *oa, else pages_per_brw = page_count; - sort_brw_pages(ppga, pages_per_brw); pages_per_brw = max_unfragmented_pages(ppga, pages_per_brw); - rc = async_internal(cmd, exp, oa, md, pages_per_brw, ppga, set); + rc = async_internal(cmd, exp, oinfo->oi_oa, oinfo->oi_md, + pages_per_brw, ppga, set); if (rc != 0) break; @@ -2379,9 +2430,8 @@ out: /* Note: caller will lock/unlock, and set uptodate on the pages */ #if defined(__KERNEL__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -static int sanosc_brw_read(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *lsm, obd_count page_count, - struct brw_page *pga) +static int sanosc_brw_read(struct obd_export *exp, struct obd_info *oinfo, + obd_count page_count, struct brw_page *pga) { struct ptlrpc_request *req = NULL; struct ost_body *body; @@ -2413,9 +2463,9 @@ static int sanosc_brw_read(struct obd_export *exp, struct obdo *oa, nioptr = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 2, sizeof(*nioptr) * page_count); - memcpy(&body->oa, oa, sizeof(body->oa)); + memcpy(&body->oa, oinfo->oi_oa, sizeof(body->oa)); - obdo_to_ioobj(oa, iooptr); + obdo_to_ioobj(oinfo->oi_oa, iooptr); iooptr->ioo_bufcnt = page_count; for (mapped = 0; mapped < page_count; mapped++, nioptr++) { @@ -2441,7 +2491,7 @@ static 
int sanosc_brw_read(struct obd_export *exp, struct obdo *oa, GOTO(out_req, rc = -EPROTO); } - memcpy(oa, &body->oa, sizeof(*oa)); + memcpy(oinfo->oi_oa, &body->oa, sizeof(*oinfo->oi_oa)); swab = lustre_msg_swabbed(req->rq_repmsg); LASSERT_REPSWAB(req, REPLY_REC_OFF + 1); @@ -2517,9 +2567,8 @@ out_req: RETURN(rc); } -static int sanosc_brw_write(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *lsm, obd_count page_count, - struct brw_page *pga) +static int sanosc_brw_write(struct obd_export *exp, struct obd_info *oinfo, + obd_count page_count, struct brw_page *pga) { struct ptlrpc_request *req = NULL; struct ost_body *body; @@ -2549,9 +2598,9 @@ static int sanosc_brw_write(struct obd_export *exp, struct obdo *oa, nioptr = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 2, sizeof(*nioptr) * page_count); - memcpy(&body->oa, oa, sizeof(body->oa)); + memcpy(&body->oa, oinfo->oi_oa, sizeof(body->oa)); - obdo_to_ioobj(oa, iooptr); + obdo_to_ioobj(oinfo->oi_oa, iooptr); iooptr->ioo_bufcnt = page_count; /* pack request */ @@ -2638,9 +2687,9 @@ out_req: RETURN(rc); } -static int sanosc_brw(int cmd, struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *lsm, obd_count page_count, - struct brw_page *pga, struct obd_trans_info *oti) +static int sanosc_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo, + obd_count page_count, struct brw_page *pga, + struct obd_trans_info *oti) { ENTRY; @@ -2654,9 +2703,9 @@ static int sanosc_brw(int cmd, struct obd_export *exp, struct obdo *oa, pages_per_brw = page_count; if (cmd & OBD_BRW_WRITE) - rc = sanosc_brw_write(exp, oa, lsm, pages_per_brw,pga); + rc = sanosc_brw_write(exp, oinfo, pages_per_brw, pga); else - rc = sanosc_brw_read(exp, oa, lsm, pages_per_brw, pga); + rc = sanosc_brw_read(exp, oinfo, pages_per_brw, pga); if (rc != 0) RETURN(rc); @@ -2711,39 +2760,117 @@ static int osc_change_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm, return 0; } -static int osc_enqueue(struct obd_export *exp,
struct lov_stripe_md *lsm, - __u32 type, ldlm_policy_data_t *policy, __u32 mode, - int *flags, void *bl_cb, void *cp_cb, void *gl_cb, - void *data, __u32 lvb_len, void *lvb_swabber, - struct lustre_handle *lockh) +static int osc_enqueue_fini(struct ptlrpc_request *req, struct obd_info *oinfo, + int intent, int rc) { - struct ldlm_res_id res_id = { .name = {lsm->lsm_object_id} }; + ENTRY; + + if (intent) { + /* The request was created before ldlm_cli_enqueue call. */ + if (rc == ELDLM_LOCK_ABORTED) { + struct ldlm_reply *rep; + + /* swabbed by ldlm_cli_enqueue() */ + LASSERT_REPSWABBED(req, DLM_LOCKREPLY_OFF); + rep = lustre_msg_buf(req->rq_repmsg, DLM_LOCKREPLY_OFF, + sizeof(*rep)); + LASSERT(rep != NULL); + if (rep->lock_policy_res1) + rc = rep->lock_policy_res1; + } + } + + if ((intent && rc == ELDLM_LOCK_ABORTED) || !rc) { + CDEBUG(D_INODE,"got kms "LPU64" blocks "LPU64" mtime "LPU64"\n", + oinfo->oi_md->lsm_oinfo->loi_lvb.lvb_size, + oinfo->oi_md->lsm_oinfo->loi_lvb.lvb_blocks, + oinfo->oi_md->lsm_oinfo->loi_lvb.lvb_mtime); + } + + /* Call the update callback. */ + rc = oinfo->oi_cb_up(oinfo, rc); + RETURN(rc); +} + +static int osc_enqueue_interpret(struct ptlrpc_request *req, + struct osc_enqueue_args *aa, int rc) +{ + int intent = aa->oa_ei->ei_flags & LDLM_FL_HAS_INTENT; + struct lov_stripe_md *lsm = aa->oa_oi->oi_md; + struct ldlm_lock *lock; + + /* ldlm_cli_enqueue is holding a reference on the lock, so it must + * be valid. */ + lock = ldlm_handle2lock(aa->oa_oi->oi_lockh); + + /* Complete obtaining the lock procedure. */ + rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_ei->ei_type, 1, + aa->oa_ei->ei_mode, + &aa->oa_ei->ei_flags, + &lsm->lsm_oinfo->loi_lvb, + sizeof(lsm->lsm_oinfo->loi_lvb), + lustre_swab_ost_lvb, + aa->oa_oi->oi_lockh, rc); + + /* Complete osc stuff. */ + rc = osc_enqueue_fini(req, aa->oa_oi, intent, rc); + + /* Release the lock for async request. 
*/ + if (lustre_handle_is_used(aa->oa_oi->oi_lockh) && rc == ELDLM_OK) + ldlm_lock_decref(aa->oa_oi->oi_lockh, aa->oa_ei->ei_mode); + + LASSERTF(lock != NULL, "lockh %p, req %p, aa %p - client evicted?\n", + aa->oa_oi->oi_lockh, req, aa); + LDLM_LOCK_PUT(lock); + return rc; +} + +/* When enqueuing asynchronously, locks are not ordered, we can obtain a lock + * from the 2nd OSC before a lock from the 1st one. This does not deadlock with + * other synchronous requests, however keeping some locks and trying to obtain + * others may take a considerable amount of time in a case of ost failure; and + * when other sync requests do not get released lock from a client, the client + * is excluded from the cluster -- such scenarios make the life difficult, so + * release locks just after they are obtained. */ +static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo, + struct obd_enqueue_info *einfo) +{ + struct ldlm_res_id res_id = { .name = {oinfo->oi_md->lsm_object_id} }; struct obd_device *obd = exp->exp_obd; - struct ost_lvb lvb; struct ldlm_reply *rep; struct ptlrpc_request *req = NULL; + int intent = einfo->ei_flags & LDLM_FL_HAS_INTENT; int rc; ENTRY; /* Filesystem lock extents are extended to page boundaries so that * dealing with the page cache is a little smoother.
*/ - policy->l_extent.start -= policy->l_extent.start & ~CFS_PAGE_MASK; - policy->l_extent.end |= ~CFS_PAGE_MASK; + oinfo->oi_policy.l_extent.start -= + oinfo->oi_policy.l_extent.start & ~CFS_PAGE_MASK; + oinfo->oi_policy.l_extent.end |= ~CFS_PAGE_MASK; - if (lsm->lsm_oinfo->loi_kms_valid == 0) + if (oinfo->oi_md->lsm_oinfo->loi_kms_valid == 0) goto no_match; /* Next, search for already existing extent locks that will cover us */ - rc = ldlm_lock_match(obd->obd_namespace, *flags, &res_id, type, policy, - mode, lockh); + rc = ldlm_lock_match(obd->obd_namespace, einfo->ei_flags, &res_id, + einfo->ei_type, &oinfo->oi_policy, einfo->ei_mode, + oinfo->oi_lockh); if (rc == 1) { - osc_set_data_with_check(lockh, data, *flags); - if (*flags & LDLM_FL_HAS_INTENT) { + osc_set_data_with_check(oinfo->oi_lockh, einfo->ei_cbdata, + einfo->ei_flags); + if (intent) { /* I would like to be able to ASSERT here that rss <= * kms, but I can't, for reasons which are explained in * lov_enqueue() */ } + + /* For async requests, decref the lock. */ + if (einfo->ei_rqset) + ldlm_lock_decref(oinfo->oi_lockh, einfo->ei_mode); + /* We already have a lock, and it's referenced */ + oinfo->oi_cb_up(oinfo, ELDLM_OK); RETURN(ELDLM_OK); } @@ -2759,22 +2886,28 @@ static int osc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm, * send us a blocking callback, but there are problems with canceling * locks out from other users right now, too. */ - if (mode == LCK_PR) { - rc = ldlm_lock_match(obd->obd_namespace, *flags, &res_id, type, - policy, LCK_PW, lockh); + if (einfo->ei_mode == LCK_PR) { + rc = ldlm_lock_match(obd->obd_namespace, einfo->ei_flags, + &res_id, einfo->ei_type, &oinfo->oi_policy, + LCK_PW, oinfo->oi_lockh); if (rc == 1) { /* FIXME: This is not incredibly elegant, but it might * be more elegant than adding another parameter to * lock_match. I want a second opinion. 
*/ - ldlm_lock_addref(lockh, LCK_PR); - ldlm_lock_decref(lockh, LCK_PW); - osc_set_data_with_check(lockh, data, *flags); + /* addref the lock only if not async requests. */ + if (!einfo->ei_rqset) + ldlm_lock_addref(oinfo->oi_lockh, LCK_PR); + osc_set_data_with_check(oinfo->oi_lockh, + einfo->ei_cbdata, + einfo->ei_flags); + ldlm_lock_decref(oinfo->oi_lockh, LCK_PW); + oinfo->oi_cb_up(oinfo, ELDLM_OK); RETURN(ELDLM_OK); } } no_match: - if (*flags & LDLM_FL_HAS_INTENT) { + if (intent) { int size[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), [DLM_LOCKREQ_OFF] = sizeof(struct ldlm_request) }; @@ -2785,35 +2918,43 @@ static int osc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm, RETURN(-ENOMEM); size[DLM_LOCKREPLY_OFF] = sizeof(*rep); - size[DLM_REPLY_REC_OFF] = sizeof(lvb); + size[DLM_REPLY_REC_OFF] = + sizeof(oinfo->oi_md->lsm_oinfo->loi_lvb); ptlrpc_req_set_repsize(req, 3, size); } /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */ - *flags &= ~LDLM_FL_BLOCK_GRANTED; - - rc = ldlm_cli_enqueue(exp, req, obd->obd_namespace, res_id, type, - policy, mode, flags, bl_cb, cp_cb, gl_cb, data, - &lvb, sizeof(lvb), lustre_swab_ost_lvb, lockh); - - if (req != NULL) { - if (rc == ELDLM_LOCK_ABORTED) { - /* swabbed by ldlm_cli_enqueue() */ - LASSERT_REPSWABBED(req, DLM_LOCKREPLY_OFF); - rep = lustre_msg_buf(req->rq_repmsg, DLM_LOCKREPLY_OFF, - sizeof(*rep)); - LASSERT(rep != NULL); - if (rep->lock_policy_res1) - rc = rep->lock_policy_res1; + einfo->ei_flags &= ~LDLM_FL_BLOCK_GRANTED; + + rc = ldlm_cli_enqueue(exp, &req, res_id, einfo->ei_type, + &oinfo->oi_policy, einfo->ei_mode, + &einfo->ei_flags, einfo->ei_cb_bl, + einfo->ei_cb_cp, einfo->ei_cb_gl, + einfo->ei_cbdata, + &oinfo->oi_md->lsm_oinfo->loi_lvb, + sizeof(oinfo->oi_md->lsm_oinfo->loi_lvb), + lustre_swab_ost_lvb, oinfo->oi_lockh, + einfo->ei_rqset ? 
1 : 0); + if (einfo->ei_rqset) { + if (!rc) { + struct osc_enqueue_args *aa; + LASSERT (sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = (struct osc_enqueue_args *)&req->rq_async_args; + aa->oa_oi = oinfo; + aa->oa_ei = einfo; + aa->oa_exp = exp; + + req->rq_interpret_reply = osc_enqueue_interpret; + ptlrpc_set_add_req(einfo->ei_rqset, req); + } else if (intent) { + ptlrpc_req_finished(req); } - ptlrpc_req_finished(req); + RETURN(rc); } - if ((*flags & LDLM_FL_HAS_INTENT && rc == ELDLM_LOCK_ABORTED) || !rc) { - CDEBUG(D_INODE,"got kms "LPU64" blocks "LPU64" mtime "LPU64"\n", - lvb.lvb_size, lvb.lvb_blocks, lvb.lvb_mtime); - lsm->lsm_oinfo->loi_lvb = lvb; - } + rc = osc_enqueue_fini(req, oinfo, intent, rc); + if (intent) + ptlrpc_req_finished(req); RETURN(rc); } @@ -2892,6 +3033,60 @@ static int osc_join_lru(struct obd_export *exp, return ldlm_cli_join_lru(obd->obd_namespace, &res_id, join); } +static int osc_statfs_interpret(struct ptlrpc_request *req, + struct osc_async_args *aa, int rc) +{ + struct obd_statfs *msfs; + ENTRY; + + if (rc != 0) + GOTO(out, rc); + + msfs = lustre_swab_repbuf(req, REPLY_REC_OFF, sizeof(*msfs), + lustre_swab_obd_statfs); + if (msfs == NULL) { + CERROR("Can't unpack obd_statfs\n"); + GOTO(out, rc = -EPROTO); + } + + memcpy(aa->aa_oi->oi_osfs, msfs, sizeof(*msfs)); +out: + rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc); + RETURN(rc); +} + +static int osc_statfs_async(struct obd_device *obd, struct obd_info *oinfo, + unsigned long max_age, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct osc_async_args *aa; + int size[2] = { sizeof(struct ptlrpc_body), sizeof(*oinfo->oi_osfs) }; + ENTRY; + + /* We could possibly pass max_age in the request (as an absolute + * timestamp or a "seconds.usec ago") so the target can avoid doing + * extra calls into the filesystem if that isn't necessary (e.g. + * during mount that would help a bit). 
Having relative timestamps + * is not so great if request processing is slow, while absolute + * timestamps are not ideal because they need time synchronization. */ + req = ptlrpc_prep_req(obd->u.cli.cl_import, LUSTRE_OST_VERSION, + OST_STATFS, 1, NULL, NULL); + if (!req) + RETURN(-ENOMEM); + + ptlrpc_req_set_repsize(req, 2, size); + req->rq_request_portal = OST_CREATE_PORTAL; //XXX FIXME bug 249 + + req->rq_interpret_reply = osc_statfs_interpret; + LASSERT (sizeof (*aa) <= sizeof (req->rq_async_args)); + aa = (struct osc_async_args *)&req->rq_async_args; + aa->aa_oi = oinfo; + + ptlrpc_set_add_req(rqset, req); + RETURN(0); +} + static int osc_statfs(struct obd_device *obd, struct obd_statfs *osfs, cfs_time_t max_age) { @@ -3478,6 +3673,7 @@ struct obd_ops osc_obd_ops = { .o_reconnect = osc_reconnect, .o_disconnect = osc_disconnect, .o_statfs = osc_statfs, + .o_statfs_async = osc_statfs_async, .o_packmd = osc_packmd, .o_unpackmd = osc_unpackmd, .o_create = osc_create, @@ -3522,6 +3718,7 @@ struct obd_ops sanosc_obd_ops = { .o_reconnect = osc_reconnect, .o_disconnect = client_disconnect_export, .o_statfs = osc_statfs, + .o_statfs_async = osc_statfs_async, .o_packmd = osc_packmd, .o_unpackmd = osc_unpackmd, .o_create = osc_real_create, @@ -3529,6 +3726,7 @@ struct obd_ops sanosc_obd_ops = { .o_getattr = osc_getattr, .o_getattr_async = osc_getattr_async, .o_setattr = osc_setattr, + .o_setattr_async = osc_setattr_async, .o_brw = sanosc_brw, .o_punch = osc_punch, .o_sync = osc_sync, diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 49cf171..221a7ae 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -104,6 +104,7 @@ static int ost_destroy(struct obd_export *exp, struct ptlrpc_request *req, static int ost_getattr(struct obd_export *exp, struct ptlrpc_request *req) { struct ost_body *body, *repbody; + struct obd_info oinfo = { { { 0 } } }; int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) }; ENTRY; @@ -119,7 +120,9 @@ 
static int ost_getattr(struct obd_export *exp, struct ptlrpc_request *req) repbody = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*repbody)); memcpy(&repbody->oa, &body->oa, sizeof(body->oa)); - req->rq_status = obd_getattr(exp, &repbody->oa, NULL); + + oinfo.oi_oa = &repbody->oa; + req->rq_status = obd_getattr(exp, &oinfo); RETURN(0); } @@ -212,11 +215,10 @@ static int ost_punch_lock_get(struct obd_export *exp, struct obdo *oa, else policy.l_extent.end = finis | ~CFS_PAGE_MASK; - RETURN(ldlm_cli_enqueue(NULL, NULL, exp->exp_obd->obd_namespace, - res_id, LDLM_EXTENT, &policy, LCK_PW, &flags, - ldlm_blocking_ast, ldlm_completion_ast, - ldlm_glimpse_ast, - NULL, NULL, 0, NULL, lh)); + RETURN(ldlm_cli_enqueue_local(exp->exp_obd->obd_namespace, res_id, + LDLM_EXTENT, &policy, LCK_PW, &flags, + ldlm_blocking_ast, ldlm_completion_ast, + ldlm_glimpse_ast, NULL, 0, NULL, lh)); } /* @@ -235,7 +237,7 @@ static void ost_punch_lock_put(struct obd_export *exp, struct obdo *oa, static int ost_punch(struct obd_export *exp, struct ptlrpc_request *req, struct obd_trans_info *oti) { - struct obdo *oa; + struct obd_info oinfo = { { { 0 } } }; struct ost_body *body, *repbody; int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*repbody) }; struct lustre_handle lh = {0,}; @@ -249,8 +251,11 @@ static int ost_punch(struct obd_export *exp, struct ptlrpc_request *req, if (body == NULL) RETURN(-EFAULT); - oa = &body->oa; - if ((oa->o_valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS)) != + oinfo.oi_oa = &body->oa; + oinfo.oi_policy.l_extent.start = oinfo.oi_oa->o_size; + oinfo.oi_policy.l_extent.end = oinfo.oi_oa->o_blocks; + + if ((oinfo.oi_oa->o_valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS)) != (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS)) RETURN(-EINVAL); @@ -260,21 +265,20 @@ static int ost_punch(struct obd_export *exp, struct ptlrpc_request *req, repbody = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*repbody)); - repbody->oa = *oa; - rc = ost_punch_lock_get(exp, oa, &lh); + repbody->oa = 
*oinfo.oi_oa; + rc = ost_punch_lock_get(exp, oinfo.oi_oa, &lh); if (rc == 0) { - if (oa->o_valid & OBD_MD_FLFLAGS && - oa->o_flags == OBD_FL_TRUNCLOCK) + if (oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS && + oinfo.oi_oa->o_flags == OBD_FL_TRUNCLOCK) /* * If OBD_FL_TRUNCLOCK is the only bit set in * ->o_flags, clear OBD_MD_FLFLAGS to avoid falling * through filter_setattr() to filter_iocontrol(). */ - oa->o_valid &= ~OBD_MD_FLFLAGS; + oinfo.oi_oa->o_valid &= ~OBD_MD_FLFLAGS; - req->rq_status = obd_punch(exp, oa, NULL, - oa->o_size, oa->o_blocks, oti); - ost_punch_lock_put(exp, oa, &lh); + req->rq_status = obd_punch(exp, &oinfo, oti, NULL); + ost_punch_lock_put(exp, oinfo.oi_oa, &lh); } RETURN(rc); } @@ -307,6 +311,7 @@ static int ost_setattr(struct obd_export *exp, struct ptlrpc_request *req, { struct ost_body *body, *repbody; int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*repbody) }; + struct obd_info oinfo = { { { 0 } } }; ENTRY; body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body), @@ -322,7 +327,8 @@ static int ost_setattr(struct obd_export *exp, struct ptlrpc_request *req, sizeof(*repbody)); memcpy(&repbody->oa, &body->oa, sizeof(body->oa)); - req->rq_status = obd_setattr(exp, &repbody->oa, NULL, oti); + oinfo.oi_oa = &repbody->oa; + req->rq_status = obd_setattr(exp, &oinfo, oti); RETURN(0); } @@ -568,11 +574,10 @@ static int ost_brw_lock_get(int mode, struct obd_export *exp, policy.l_extent.end = (nb[nrbufs - 1].offset + nb[nrbufs - 1].len - 1) | ~CFS_PAGE_MASK; - RETURN(ldlm_cli_enqueue(NULL, NULL, exp->exp_obd->obd_namespace, - res_id, LDLM_EXTENT, &policy, mode, &flags, - ldlm_blocking_ast, ldlm_completion_ast, - ldlm_glimpse_ast, - NULL, NULL, 0, NULL, lh)); + RETURN(ldlm_cli_enqueue_local(exp->exp_obd->obd_namespace, res_id, + LDLM_EXTENT, &policy, mode, &flags, + ldlm_blocking_ast, ldlm_completion_ast, + ldlm_glimpse_ast, NULL, 0, NULL, lh)); } static void ost_brw_lock_put(int mode, diff --git a/lustre/tests/rundbench b/lustre/tests/rundbench 
index fe80594..9c5371b 100755 --- a/lustre/tests/rundbench +++ b/lustre/tests/rundbench @@ -1,6 +1,6 @@ #!/bin/sh -MNT=${MNT:-/mnt/lustre} -DIR=${DIR:-$MNT/`hostname`} +MOUNT=${MOUNT:-/mnt/lustre} +DIR=${DIR:-$MOUNT/`hostname`} #[ -e /proc/sys/lnet/debug ] && echo 0 > /proc/sys/lnet/debug mkdir -p $DIR TGT=$DIR/client.txt diff --git a/lustre/tests/runvmstat b/lustre/tests/runvmstat index d8e887a..8a5a7df 100755 --- a/lustre/tests/runvmstat +++ b/lustre/tests/runvmstat @@ -1,17 +1,23 @@ #!/bin/sh +DELAY=1 null() { : } -if [ "$1" == "-q" ]; then - echo "echo off" +case "$1" in +-q) echo "echo off" ECHO="null" - shift -else + shift ;; +[1-9]*) + DELAY=$1 + ECHO=echo + shift ;; +*) echo "echo on" ECHO=echo -fi -vmstat 1 | while read LINE ; do +esac + +vmstat $DELAY | while read LINE ; do LINE="`date +%s`: $LINE" $ECHO "$LINE" [ "$1" ] && echo "$LINE" >> $1 diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 8b43c31..23f96f1 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -86,9 +86,11 @@ CLEANUP=${CLEANUP:-:} setup() { echo -n "mnt.." 
+ load_modules setupall || exit 10 echo "done" } + SETUP=${SETUP:-:} log() { @@ -2747,11 +2749,11 @@ test_76() { # bug 1443 done AFTER_INODES=`num_inodes` echo "after inodes: $AFTER_INODES" - [ $AFTER_INODES -gt $((BEFORE_INODES + 10)) ] && \ + [ $AFTER_INODES -gt $((BEFORE_INODES + 32)) ] && \ error "inode slab grew from $BEFORE_INODES to $AFTER_INODES" true } -run_test 76 "destroy duplicate inodes in client inode cache" +run_test 76 "destroy duplicate inodes in client inode cache ====" test_77() { sh qos.sh diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index dc5c76f..9b62676 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -7,6 +7,7 @@ set -e export REFORMAT="" export VERBOSE=false export GMNALNID=${GMNALNID:-/usr/sbin/gmlndnid} +export CATASTROPHE=${CATASTROPHE:-/proc/sys/lnet/catastrophe} # eg, assert_env LUSTRE MDSNODES OSTNODES CLIENTS assert_env() { @@ -224,6 +225,7 @@ reboot_facet() { # verify that lustre actually cleaned up properly cleanup_check() { + [ -e $CATASTROPHE -a "`cat $CATASTROPHE`" = "1" ] && echo "LBUG" && exit 206 BUSY=`dmesg | grep -i destruct || true` if [ "$BUSY" ]; then echo "$BUSY" 1>&2 @@ -789,6 +791,8 @@ run_one() { #check_mds test_${testnum} || error "test_$testnum failed with $?" 
#check_mds + [ -f $CATASTROPHE ] && [ `cat $CATASTROPHE` -ne 0 ] && \ + error "LBUG/LASSERT detected" pass "($((`date +%s` - $BEFORE))s)" } diff --git a/lustre/utils/Makefile.am b/lustre/utils/Makefile.am index 0a8423e..697780f 100644 --- a/lustre/utils/Makefile.am +++ b/lustre/utils/Makefile.am @@ -33,13 +33,24 @@ lload_SOURCES = lload.c lload_LDADD := $(LIBREADLINE) $(LIBPTLCTL) lload_DEPENDENCIES := $(LIBPTLCTL) -llverfs_LDADD := -lext2fs -le2p +if EXT2FS_DEVEL +EXT2FSLIB = -lext2fs +E2PLIB = -le2p +else +E2PLIB = +EXT2FSLIB = +endif + if BLKID -llverdev_LDADD := -lext2fs -lblkid +BLKIDLIB = -lblkid else -llverdev_LDADD := -lext2fs +BLKIDLIB = endif +llverfs_LDADD := $(EXT2FSLIB) $(E2PLIB) + +llverdev_LDADD := $(EXT2FSLIB) $(BLKIDLIB) + liblustreapi_a_SOURCES = liblustreapi.c wirecheck_SOURCES = wirecheck.c diff --git a/lustre/utils/llverdev.c b/lustre/utils/llverdev.c index 3494a04..79d6d39 100644 --- a/lustre/utils/llverdev.c +++ b/lustre/utils/llverdev.c @@ -47,7 +47,10 @@ #include #include #include -#include + +#ifdef HAVE_EXT2FS_EXT2FS_H +# include +#endif #define ONE_MB (1024 * 1024) #define ONE_GB (1024 * 1024 * 1024) @@ -118,6 +121,7 @@ void usage(int status) */ static int open_dev(const char *devname, int mode) { +#ifdef HAVE_EXT2FS_EXT2FS_H int mount_flags; char mountpt[80] = ""; @@ -132,6 +136,7 @@ static int open_dev(const char *devname, int mode) devname); exit(1); } +#endif fd = open(devname, mode | O_EXCL | O_LARGEFILE); if (fd < 0) { fprintf(stderr, "%s: Open failed: %s",progname,strerror(errno)); diff --git a/lustre/utils/llverfs.c b/lustre/utils/llverfs.c index c25fa78..b98093c 100644 --- a/lustre/utils/llverfs.c +++ b/lustre/utils/llverfs.c @@ -49,9 +49,12 @@ #include #include #include -#include #include -#include + +#ifdef HAVE_EXT2FS_EXT2FS_H +# include +# include +#endif #define ONE_MB (1024 * 1024) #define ONE_GB ((unsigned long long)(1024 * 1024 * 1024)) @@ -334,11 +337,12 @@ static int dir_write(char *chunk_buf, size_t chunksize, int 
file_num = 999999999; ino_t inode_st = 0; +#ifdef HAVE_EXT2FS_EXT2FS_H if (!full && fsetflags(testdir, EXT2_TOPDIR_FL)) fprintf(stderr, "\n%s: can't set TOPDIR_FL on %s: %s (ignoring)", progname, testdir, strerror(errno)); - +#endif for (; dir_num < num_dirs; num_files++, file_num++) { if (file_num >= files_in_dir) { if (dir_num == num_dirs - 1) @@ -520,6 +524,7 @@ int main(int argc, char **argv) isatty_flag = isatty(STDOUT_FILENO); if (!full) { +#ifdef HAVE_EXT2FS_EXT2FS_H struct mntent *tempmnt; FILE *fp = NULL; ext2_filsys fs; @@ -561,6 +566,9 @@ int main(int argc, char **argv) num_dirs, fs->super->s_blocks_count, fs->super->s_blocks_per_group); ext2fs_close(fs); +#else + goto guess; +#endif if (0) { /* ugh */ struct statfs64 statbuf; guess: -- 1.8.3.1